Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | contrib/llvm/lib/Target/X86/X86ISelLowering.cpp | 3024
1 file changed, 2134 insertions, 890 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp index 4af12e44fd40..b35fb514bf94 100644 --- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -85,7 +85,7 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128) * ElemsPerChunk); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -118,7 +118,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128) * ElemsPerChunk); - SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32); + SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } @@ -158,10 +158,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) Subtarget = &TM.getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); - X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP; RegInfo = TM.getRegisterInfo(); - TD = getTargetData(); + TD = getDataLayout(); // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; @@ -180,7 +179,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); - setStackPointerRegisterToSaveRestore(X86StackPtr); + setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); + + // Bypass i32 with i8 on Atom when compiling with O2 + if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) + addBypassSlowDiv(32, 8); if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) { // Setup Windows compiler runtime calls. @@ -453,6 +456,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SETCC , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); + // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intened to support + // SjLj exception handling but a light-weight setjmp/longjmp replacement to + // support continuation, user-level threading, and etc.. As a result, no + // other SjLj exception interfaces are implemented and please don't build + // your own exception handling based on them. + // LLVM/Clang supports zero-cost DWARF exception handling. + setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); + setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); // Darwin ABI issue. 
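The addBypassSlowDiv(32, 8) call above enables the generic slow-division bypass on Atom: 32-bit divisions get a runtime check and are routed to the much cheaper 8-bit divider when both operands fit. A minimal sketch of the idea at the source level; the actual transformation is performed on IR, and the names below are purely illustrative:

#include <cstdint>

// Illustrative only: what the bypassed division behaves like.
uint32_t div_with_bypass(uint32_t a, uint32_t b) {
  // If neither operand has bits above the low byte, an 8-bit divide suffices.
  if (((a | b) & 0xFFFFFF00u) == 0)
    return static_cast<uint32_t>(static_cast<uint8_t>(a) /
                                 static_cast<uint8_t>(b));
  return a / b; // Otherwise fall back to the full 32-bit divide.
}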
setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); @@ -510,6 +521,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom); + setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom); } if (Subtarget->hasCmpxchg16b()) { @@ -541,6 +556,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); + setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); @@ -737,6 +753,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand); + setOperationAction(ISD::FFLOOR, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand); setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand); @@ -826,6 +843,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); + setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); @@ -859,6 +877,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FDIV, MVT::v2f64, Legal); setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); + setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::SETCC, MVT::v2i64, Custom); setOperationAction(ISD::SETCC, MVT::v16i8, Custom); @@ -927,6 +946,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + + setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); + // As there is no 64-bit GPR available, we need build a special custom + // sequence to convert from v2i32 to v2f32. + if (!Subtarget->is64Bit()) + setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); + + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); + setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); + + setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); } if (Subtarget->hasSSE41()) { @@ -941,6 +972,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FRINT, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); + // FIXME: Do we need to handle scalar-to-vector here? 
setOperationAction(ISD::MUL, MVT::v4i32, Legal); @@ -1018,19 +1052,33 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FMUL, MVT::v8f32, Legal); setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); + setOperationAction(ISD::FABS, MVT::v8f32, Custom); setOperationAction(ISD::FADD, MVT::v4f64, Legal); setOperationAction(ISD::FSUB, MVT::v4f64, Legal); setOperationAction(ISD::FMUL, MVT::v4f64, Legal); setOperationAction(ISD::FDIV, MVT::v4f64, Legal); setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); setOperationAction(ISD::FNEG, MVT::v4f64, Custom); + setOperationAction(ISD::FABS, MVT::v4f64, Custom); + + setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); + + setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); + + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal); + setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); @@ -1054,7 +1102,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::VSELECT, MVT::v8i32, Legal); setOperationAction(ISD::VSELECT, MVT::v8f32, Legal); - if (Subtarget->hasFMA()) { + if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { setOperationAction(ISD::FMA, MVT::v8f32, Custom); setOperationAction(ISD::FMA, MVT::v4f64, Custom); setOperationAction(ISD::FMA, MVT::v4f32, Custom); @@ -1219,10 +1267,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::TRUNCATE); - setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); - setTargetDAGCombine(ISD::FP_TO_SINT); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); @@ -1320,7 +1366,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, // cases like PR2962. This should be removed when PR2962 is fixed. const Function *F = MF.getFunction(); if (IsZeroVal && - !F->hasFnAttr(Attribute::NoImplicitFloat)) { + !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) { if (Size >= 16 && (Subtarget->isUnalignedMemAccessFast() || ((DstAlign == 0 || DstAlign >= 16) && @@ -1988,7 +2034,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, TotalNumIntRegs); - bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat); + bool NoImplicitFloatOps = Fn->getFnAttributes(). + hasAttribute(Attributes::NoImplicitFloat); assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat && @@ -2136,16 +2183,14 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, /// optimization is performed and it is required (FPDiff!=0). 
static SDValue EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, - SDValue Chain, SDValue RetAddrFrIdx, - bool Is64Bit, int FPDiff, DebugLoc dl) { + SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, + unsigned SlotSize, int FPDiff, DebugLoc dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. - int SlotSize = Is64Bit ? 8 : 4; int NewReturnAddrFI = MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); - EVT VT = Is64Bit ? MVT::i64 : MVT::i32; - SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); + SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack(NewReturnAddrFI), false, false, 0); @@ -2180,7 +2225,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, - MF.getFunction()->hasStructRetAttr(), + MF.getFunction()->hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require @@ -2220,14 +2265,15 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, int FPDiff = 0; if (isTailCall && !IsSibcall) { // Lower arguments at fp - stackoffset + fpdiff. - unsigned NumBytesCallerPushed = - MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); + X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); + unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); + FPDiff = NumBytesCallerPushed - NumBytes; // Set the delta of movement of the returnaddr stackslot. // But only set if delta is greater than previous delta. - if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) - MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); + if (FPDiff < X86Info->getTCReturnAddrDelta()) + X86Info->setTCReturnAddrDelta(FPDiff); } if (!IsSibcall) @@ -2304,7 +2350,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy()); + StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), + getPointerTy()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } @@ -2392,7 +2439,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Copy relative to framepointer. SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, + StackPtr = DAG.getCopyFromReg(Chain, dl, + RegInfo->getStackRegister(), getPointerTy()); Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); @@ -2414,7 +2462,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, &MemOpChains2[0], MemOpChains2.size()); // Store the return address to the appropriate stack slot. 
- Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, + Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, + getPointerTy(), RegInfo->getSlotSize(), FPDiff, dl); } @@ -2464,7 +2513,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, OpFlags = X86II::MO_DARWIN_STUB; } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) && - cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) { + cast<Function>(GV)->getFnAttributes(). + hasAttribute(Attributes::NonLazyBind)) { // If the function is marked as non-lazy, generate an indirect call // which loads from the GOT directly. This avoids runtime overhead // at the cost of eager binding (and one extra byte of encoding). @@ -2625,7 +2675,7 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; - uint64_t SlotSize = TD->getPointerSize(); + unsigned SlotSize = RegInfo->getSlotSize(); if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { // Number smaller than 12 so just add the difference. Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); @@ -2700,6 +2750,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, + Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, @@ -2711,6 +2762,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, // If -tailcallopt is specified, make fastcc functions tail-callable. const MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = DAG.getMachineFunction().getFunction(); + + // If the function return type is x86_fp80 and the callee return type is not, + // then the FP_EXTEND of the call result is not a nop. It's not safe to + // perform a tailcall optimization here. + if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) + return false; + CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; @@ -2834,7 +2892,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = - ((X86TargetMachine&)getTargetMachine()).getInstrInfo(); + ((const X86TargetMachine&)getTargetMachine()).getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; @@ -2985,7 +3043,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. 
- uint64_t SlotSize = TD->getPointerSize(); + unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); @@ -3508,25 +3566,26 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) MatchOddMask = false; } - static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1}; - static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1}; - const int *CompactionMask; - if (MatchEvenMask) - CompactionMask = CompactionMaskEven; - else if (MatchOddMask) - CompactionMask = CompactionMaskOdd; - else + if (!MatchEvenMask && !MatchOddMask) return SDValue(); SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); - SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0), - UndefNode, CompactionMask); - SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1), - UndefNode, CompactionMask); - static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13}; - return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask); + SDValue Op0 = SVOp->getOperand(0); + SDValue Op1 = SVOp->getOperand(1); + + if (MatchEvenMask) { + // Shift the second operand right to 32 bits. + static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; + Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); + } else { + // Shift the first operand left to 32 bits. + static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; + Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); + } + static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; + return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); } /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand @@ -4577,7 +4636,6 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, MVT ShufVT = V.getValueType().getSimpleVT(); unsigned NumElems = ShufVT.getVectorNumElements(); SmallVector<int, 16> ShuffleMask; - SDValue ImmN; bool IsUnary; if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) @@ -4979,6 +5037,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); + + // Make sure the newly-created LOAD is in the same position as LDBase in + // terms of dependency. We create a TokenFactor for LDBase and ResNode, and + // update uses of LDBase's output chain to use the TokenFactor. + if (LDBase->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), + SDValue(ResNode.getNode(), 1)); + } + return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); } return SDValue(); @@ -4992,7 +5062,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. 
SDValue -X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { +X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget->hasAVX()) return SDValue(); @@ -5116,80 +5186,78 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const { return SDValue(); } -// LowerVectorFpExtend - Recognize the scalarized FP_EXTEND from v2f32 to v2f64 -// and convert it into X86ISD::VFPEXT due to the current ISD::FP_EXTEND has the -// constraint of matching input/output vector elements. SDValue -X86TargetLowering::LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const { - DebugLoc DL = Op.getDebugLoc(); - SDNode *N = Op.getNode(); +X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - unsigned NumElts = Op.getNumOperands(); - // Check supported types and sub-targets. - // - // Only v2f32 -> v2f64 needs special handling. - if (VT != MVT::v2f64 || !Subtarget->hasSSE2()) + // Skip if insert_vec_elt is not supported. + if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); - SDValue VecIn; - EVT VecInVT; - SmallVector<int, 8> Mask; - EVT SrcVT = MVT::Other; + DebugLoc DL = Op.getDebugLoc(); + unsigned NumElems = Op.getNumOperands(); + + SDValue VecIn1; + SDValue VecIn2; + SmallVector<unsigned, 4> InsertIndices; + SmallVector<int, 8> Mask(NumElems, -1); - // Check the patterns could be translated into X86vfpext. - for (unsigned i = 0; i < NumElts; ++i) { - SDValue In = N->getOperand(i); - unsigned Opcode = In.getOpcode(); + for (unsigned i = 0; i != NumElems; ++i) { + unsigned Opc = Op.getOperand(i).getOpcode(); - // Skip if the element is undefined. - if (Opcode == ISD::UNDEF) { - Mask.push_back(-1); + if (Opc == ISD::UNDEF) continue; - } - // Quit if one of the elements is not defined from 'fpext'. - if (Opcode != ISD::FP_EXTEND) - return SDValue(); + if (Opc != ISD::EXTRACT_VECTOR_ELT) { + // Quit if more than 1 elements need inserting. + if (InsertIndices.size() > 1) + return SDValue(); + + InsertIndices.push_back(i); + continue; + } - // Check how the source of 'fpext' is defined. - SDValue L2In = In.getOperand(0); - EVT L2InVT = L2In.getValueType(); + SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); + SDValue ExtIdx = Op.getOperand(i).getOperand(1); - // Check the original type - if (SrcVT == MVT::Other) - SrcVT = L2InVT; - else if (SrcVT != L2InVT) // Quit if non-homogenous typed. + // Quit if extracted from vector of different type. + if (ExtractedFromVec.getValueType() != VT) return SDValue(); - // Check whether the value being 'fpext'ed is extracted from the same - // source. - Opcode = L2In.getOpcode(); - - // Quit if it's not extracted with a constant index. - if (Opcode != ISD::EXTRACT_VECTOR_ELT || - !isa<ConstantSDNode>(L2In.getOperand(1))) + // Quit if non-constant index. + if (!isa<ConstantSDNode>(ExtIdx)) return SDValue(); - SDValue ExtractedFromVec = L2In.getOperand(0); + if (VecIn1.getNode() == 0) + VecIn1 = ExtractedFromVec; + else if (VecIn1 != ExtractedFromVec) { + if (VecIn2.getNode() == 0) + VecIn2 = ExtractedFromVec; + else if (VecIn2 != ExtractedFromVec) + // Quit if more than 2 vectors to shuffle + return SDValue(); + } - if (VecIn.getNode() == 0) { - VecIn = ExtractedFromVec; - VecInVT = ExtractedFromVec.getValueType(); - } else if (VecIn != ExtractedFromVec) // Quit if built from more than 1 vec. 
- return SDValue(); + unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue(); - Mask.push_back(cast<ConstantSDNode>(L2In.getOperand(1))->getZExtValue()); + if (ExtractedFromVec == VecIn1) + Mask[i] = Idx; + else if (ExtractedFromVec == VecIn2) + Mask[i] = Idx + NumElems; } - // Fill the remaining mask as undef. - for (unsigned i = NumElts; i < VecInVT.getVectorNumElements(); ++i) - Mask.push_back(-1); + if (VecIn1.getNode() == 0) + return SDValue(); + + VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); + SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); + for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { + unsigned Idx = InsertIndices[i]; + NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), + DAG.getIntPtrConstant(Idx)); + } - return DAG.getNode(X86ISD::VFPEXT, DL, VT, - DAG.getVectorShuffle(VecInVT, DL, - VecIn, DAG.getUNDEF(VecInVT), - &Mask[0])); + return NV; } SDValue @@ -5224,10 +5292,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (Broadcast.getNode()) return Broadcast; - SDValue FpExt = LowerVectorFpExtend(Op, DAG); - if (FpExt.getNode()) - return FpExt; - unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; @@ -5472,6 +5536,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (LD.getNode()) return LD; + // Check for a build vector from mostly shuffle plus few inserting. + SDValue Sh = buildFromShuffleMostly(Op, DAG); + if (Sh.getNode()) + return Sh; + // For SSE 4.1, use insertps to put the high elements into the low element. if (getSubtarget()->hasSSE41()) { SDValue Result; @@ -5538,8 +5607,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } -SDValue -X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { assert(Op.getNumOperands() == 2); // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors @@ -5548,9 +5616,9 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const { } // Try to lower a shuffle node into a simple blend instruction. -static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, - const X86Subtarget *Subtarget, - SelectionDAG &DAG) { +static SDValue +LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, + const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); DebugLoc dl = SVOp->getDebugLoc(); @@ -5620,9 +5688,9 @@ static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, // 2. [ssse3] 1 x pshufb // 3. [ssse3] 2 x pshufb + 1 x por // 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) -SDValue -X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, - SelectionDAG &DAG) const { +static SDValue +LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); @@ -5879,8 +5947,6 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, DebugLoc dl = SVOp->getDebugLoc(); ArrayRef<int> MaskVals = SVOp->getMask(); - bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; - // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is // present, fall back to case 3. 
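The buildFromShuffleMostly helper added above recognizes a BUILD_VECTOR whose operands are extract_vector_elt nodes taken from at most two source vectors, with at most one remaining arbitrary scalar, and rebuilds it as one vector shuffle followed by a single insert. A scalar sketch of that equivalence, using made-up values purely for illustration:

#include <array>

// Build { A[2], B[1], X, A[0] } the way the lowering does it.
std::array<float, 4> build_mostly_shuffle(const std::array<float, 4> &A,
                                          const std::array<float, 4> &B,
                                          float X) {
  // Step 1: shuffle A and B with mask {2, 5, -1, 0}; indices >= 4 select
  // from B, -1 is undef (modelled here as 0.0f).
  std::array<float, 4> R = {A[2], B[1], 0.0f, A[0]};
  // Step 2: a single INSERT_VECTOR_ELT for the one non-extract operand.
  R[2] = X;
  return R;
}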
@@ -5904,7 +5970,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, &pshufbMask[0], 16)); - if (V2IsUndef) + + // As PSHUFB will zero elements with negative indices, it's safe to ignore + // the 2nd operand if it's undefined or zero. + if (V2.getOpcode() == ISD::UNDEF || + ISD::isBuildVectorAllZeros(V2.getNode())) return V1; // Calculate the shuffle mask for the second input, shuffle it, and @@ -5990,6 +6060,51 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); } +// v32i8 shuffles - Translate to VPSHUFB if possible. +static +SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp, + const X86Subtarget *Subtarget, + SelectionDAG &DAG) { + EVT VT = SVOp->getValueType(0); + SDValue V1 = SVOp->getOperand(0); + SDValue V2 = SVOp->getOperand(1); + DebugLoc dl = SVOp->getDebugLoc(); + SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); + + bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; + bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode()); + bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode()); + + // VPSHUFB may be generated if + // (1) one of input vector is undefined or zeroinitializer. + // The mask value 0x80 puts 0 in the corresponding slot of the vector. + // And (2) the mask indexes don't cross the 128-bit lane. + if (VT != MVT::v32i8 || !Subtarget->hasAVX2() || + (!V2IsUndef && !V2IsAllZero && !V1IsAllZero)) + return SDValue(); + + if (V1IsAllZero && !V2IsAllZero) { + CommuteVectorShuffleMask(MaskVals, 32); + V1 = V2; + } + SmallVector<SDValue, 32> pshufbMask; + for (unsigned i = 0; i != 32; i++) { + int EltIdx = MaskVals[i]; + if (EltIdx < 0 || EltIdx >= 32) + EltIdx = 0x80; + else { + if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16)) + // Cross lane is not allowed. + return SDValue(); + EltIdx &= 0xf; + } + pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); + } + return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1, + DAG.getNode(ISD::BUILD_VECTOR, dl, + MVT::v32i8, &pshufbMask[0], 32)); +} + /// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide /// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be /// done when every pair / quad of shuffle mask elements point to elements in @@ -6324,17 +6439,17 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { } static bool MayFoldVectorLoad(SDValue V) { - if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) + while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); + if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR) V = V.getOperand(0); if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR && V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF) // BUILD_VECTOR (load), undef V = V.getOperand(0); - if (MayFoldLoad(V)) - return true; - return false; + + return MayFoldLoad(V); } // FIXME: the version above should always be used. Since there's @@ -6457,6 +6572,81 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { getShuffleSHUFImmediate(SVOp), DAG); } +// Reduce a vector shuffle to zext. +SDValue +X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { + // PMOVZX is only available from SSE41. + if (!Subtarget->hasSSE41()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // Only AVX2 support 256-bit vector integer extending. 
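The new v32i8 path above depends on two VPSHUFB properties: an index byte with bit 7 set (0x80) writes a zero, and each 128-bit lane only ever reads from itself. A small reference model of that behaviour, offered as a sketch rather than anything the backend executes:

#include <array>
#include <cstdint>

std::array<uint8_t, 32> vpshufb256(const std::array<uint8_t, 32> &Src,
                                   const std::array<uint8_t, 32> &Mask) {
  std::array<uint8_t, 32> Dst{};
  for (unsigned i = 0; i != 32; ++i) {
    if (Mask[i] & 0x80)
      Dst[i] = 0;                                 // 0x80 selects zero.
    else
      Dst[i] = Src[(i & 16) | (Mask[i] & 0x0f)];  // Never crosses the lane.
  }
  return Dst;
}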
+ if (!Subtarget->hasAVX2() && VT.is256BitVector()) + return SDValue(); + + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + DebugLoc DL = Op.getDebugLoc(); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + unsigned NumElems = VT.getVectorNumElements(); + + // Extending is an unary operation and the element type of the source vector + // won't be equal to or larger than i64. + if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() || + VT.getVectorElementType() == MVT::i64) + return SDValue(); + + // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4. + unsigned Shift = 1; // Start from 2, i.e. 1 << 1. + while ((1U << Shift) < NumElems) { + if (SVOp->getMaskElt(1U << Shift) == 1) + break; + Shift += 1; + // The maximal ratio is 8, i.e. from i8 to i64. + if (Shift > 3) + return SDValue(); + } + + // Check the shuffle mask. + unsigned Mask = (1U << Shift) - 1; + for (unsigned i = 0; i != NumElems; ++i) { + int EltIdx = SVOp->getMaskElt(i); + if ((i & Mask) != 0 && EltIdx != -1) + return SDValue(); + if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift)) + return SDValue(); + } + + unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; + EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift); + + if (!isTypeLegal(NVT)) + return SDValue(); + + // Simplify the operand as it's prepared to be fed into shuffle. + unsigned SignificantBits = NVT.getSizeInBits() >> Shift; + if (V1.getOpcode() == ISD::BITCAST && + V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && + V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + V1.getOperand(0) + .getOperand(0).getValueType().getSizeInBits() == SignificantBits) { + // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) + SDValue V = V1.getOperand(0).getOperand(0).getOperand(0); + ConstantSDNode *CIdx = + dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1)); + // If it's foldable, i.e. normal load with single use, we will let code + // selection to fold it. Otherwise, we will short the conversion sequence. + if (CIdx && CIdx->getZExtValue() == 0 && + (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) + V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V); + } + + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); +} + SDValue X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); @@ -6487,6 +6677,11 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { return PromoteSplat(SVOp, DAG); } + // Check integer expanding shuffles. + SDValue NewOp = lowerVectorIntExtend(Op, DAG); + if (NewOp.getNode()) + return NewOp; + // If the shuffle can be profitably rewritten as a narrower shuffle, then // do it! if (VT == MVT::v8i16 || VT == MVT::v16i8 || @@ -6536,7 +6731,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool HasAVX = Subtarget->hasAVX(); bool HasAVX2 = Subtarget->hasAVX2(); MachineFunction &MF = DAG.getMachineFunction(); - bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); + bool OptForSize = MF.getFunction()->getFnAttributes(). 
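lowerVectorIntExtend above rewrites a shuffle that spreads the low elements out with don't-care gaps into X86ISD::VZEXT, i.e. a PMOVZX. The identity it exploits, demonstrated with plain arrays on a little-endian host (v8i16 widened to v4i32 chosen only as an example):

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint16_t Src[4] = {1, 2, 0xFFFF, 42};
  // Interleave each element with a zero half, as the shuffle-with-zero does.
  uint16_t Interleaved[8];
  for (int i = 0; i != 4; ++i) {
    Interleaved[2 * i] = Src[i];
    Interleaved[2 * i + 1] = 0;
  }
  // Reinterpreted as 32-bit elements, this is exactly the zero-extension
  // PMOVZXWD would produce.
  uint32_t Widened[4];
  std::memcpy(Widened, Interleaved, sizeof(Widened));
  for (int i = 0; i != 4; ++i)
    assert(Widened[i] == Src[i]);
  return 0;
}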
+ hasAttribute(Attributes::OptimizeForSize); assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); @@ -6805,7 +7001,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // Handle v8i16 specifically since SSE can do byte extraction and insertion. if (VT == MVT::v8i16) { - SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG); + SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG); if (NewOp.getNode()) return NewOp; } @@ -6816,6 +7012,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return NewOp; } + if (VT == MVT::v32i8) { + SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG); + if (NewOp.getNode()) + return NewOp; + } + // Handle all 128-bit wide vectors with 4 elements, and match them with // several different shuffle types. if (NumElems == 4 && VT.is128BitVector()) @@ -6839,9 +7041,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, if (VT.getSizeInBits() == 8) { SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, - Op.getOperand(0), Op.getOperand(1)); + Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, - DAG.getValueType(VT)); + DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } @@ -6856,9 +7058,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, Op.getOperand(0)), Op.getOperand(1))); SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, - Op.getOperand(0), Op.getOperand(1)); + Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, - DAG.getValueType(VT)); + DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } @@ -6942,9 +7144,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Transform it so it match pextrw which produces a 32-bit result. EVT EltVT = MVT::i32; SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT, - Op.getOperand(0), Op.getOperand(1)); + Op.getOperand(0), Op.getOperand(1)); SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract, - DAG.getValueType(VT)); + DAG.getValueType(VT)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); } @@ -7087,8 +7289,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue -X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); EVT OpVT = Op.getValueType(); @@ -7120,8 +7321,8 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const { // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in // a simple subregister reference or explicit instructions to grab // upper bits of a vector. -SDValue -X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { if (Subtarget->hasAVX()) { DebugLoc dl = Op.getNode()->getDebugLoc(); SDValue Vec = Op.getNode()->getOperand(0); @@ -7140,8 +7341,8 @@ X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a // simple superregister reference or explicit instructions to insert // the upper bits of a vector. 
-SDValue -X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { if (Subtarget->hasAVX()) { DebugLoc dl = Op.getNode()->getDebugLoc(); SDValue Vec = Op.getNode()->getOperand(0); @@ -7284,9 +7485,10 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { Subtarget->ClassifyBlockAddressReference(); CodeModel::Model M = getTargetMachine().getCodeModel(); const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); + int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset(); DebugLoc dl = Op.getDebugLoc(); - SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), - /*isTarget=*/true, OpFlags); + SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset, + OpFlags); if (Subtarget->isPICStyleRIPRel() && (M == CodeModel::Small || M == CodeModel::Kernel)) @@ -7395,8 +7597,8 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, SDValue InFlag; DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, - DAG.getNode(X86ISD::GlobalBaseReg, - DebugLoc(), PtrVT), InFlag); + DAG.getNode(X86ISD::GlobalBaseReg, + DebugLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD); @@ -7897,11 +8099,29 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, return Sub; } +SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, + SelectionDAG &DAG) const { + SDValue N0 = Op.getOperand(0); + EVT SVT = N0.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + + assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 || + SVT == MVT::v8i8 || SVT == MVT::v8i16) && + "Custom UINT_TO_FP is not supported!"); + + EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, SVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); +} + SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); DebugLoc dl = Op.getDebugLoc(); + if (Op.getValueType().isVector()) + return lowerUINT_TO_FP_vec(Op, DAG); + // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform // the optimization here. @@ -8075,10 +8295,66 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co } } +SDValue X86TargetLowering::lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue In = Op.getOperand(0); + EVT SVT = In.getValueType(); + + if (!VT.is256BitVector() || !SVT.is128BitVector() || + VT.getVectorNumElements() != SVT.getVectorNumElements()) + return SDValue(); + + assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!"); + + // AVX2 has better support of integer extending. 
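lowerUINT_TO_FP_vec above handles unsigned vectors whose elements are narrower than 32 bits by zero-extending to i32 and then using the signed conversion, which is exact because a zero-extended value can never be negative. The same identity on a single scalar:

#include <cstdint>

float u8_to_float(uint8_t x) {
  uint32_t wide = x;                       // ISD::ZERO_EXTEND
  return static_cast<float>(
      static_cast<int32_t>(wide));         // ISD::SINT_TO_FP, exact here
}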
+ if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VZEXT, DL, VT, In); + + SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In); + static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1}; + SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, + DAG.getVectorShuffle(MVT::v8i16, DL, In, DAG.getUNDEF(MVT::v8i16), &Mask[0])); + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi); +} + +SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + EVT SVT = Op.getOperand(0).getValueType(); + + if (!VT.is128BitVector() || !SVT.is256BitVector() || + VT.getVectorNumElements() != SVT.getVectorNumElements()) + return SDValue(); + + assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!"); + + unsigned NumElems = VT.getVectorNumElements(); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + NumElems * 2); + + SDValue In = Op.getOperand(0); + SmallVector<int, 16> MaskVec(NumElems * 2, -1); + // Prepare truncation shuffle mask + for (unsigned i = 0; i != NumElems; ++i) + MaskVec[i] = i * 2; + SDValue V = DAG.getVectorShuffle(NVT, DL, + DAG.getNode(ISD::BITCAST, DL, NVT, In), + DAG.getUNDEF(NVT), &MaskVec[0]); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, + DAG.getIntPtrConstant(0)); +} + SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isVector()) + if (Op.getValueType().isVector()) { + if (Op.getValueType() == MVT::v8i16) + return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(), + DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(), + MVT::v8i32, Op.getOperand(0))); return SDValue(); + } std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, /*IsSigned=*/ true, /*IsReplace=*/ false); @@ -8113,26 +8389,49 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, return FIST; } -SDValue X86TargetLowering::LowerFABS(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue In = Op.getOperand(0); + EVT SVT = In.getValueType(); + + assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); + + return DAG.getNode(X86ISD::VFPEXT, DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, + In, DAG.getUNDEF(SVT))); +} + +SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const { LLVMContext *Context = DAG.getContext(); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); EVT EltVT = VT; - if (VT.isVector()) + unsigned NumElts = VT == MVT::f64 ? 
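lowerTRUNCATE above implements the v8i32 -> v8i16 case as a bitcast to v16i16 plus a shuffle that keeps every even-numbered 16-bit element. A scalar model of why that equals a per-element truncation on a little-endian target:

#include <cstdint>
#include <cstring>

void trunc_v8i32_to_v8i16(const uint32_t In[8], uint16_t Out[8]) {
  uint16_t View[16];
  std::memcpy(View, In, sizeof(View));  // Bitcast v8i32 -> v16i16.
  for (int i = 0; i != 8; ++i)
    Out[i] = View[2 * i];               // Shuffle mask {0, 2, 4, ..., 14}.
}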
2 : 4; + if (VT.isVector()) { EltVT = VT.getVectorElementType(); - Constant *C; - if (EltVT == MVT::f64) { - C = ConstantVector::getSplat(2, - ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))))); - } else { - C = ConstantVector::getSplat(4, - ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))))); + NumElts = VT.getVectorNumElements(); } - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + Constant *C; + if (EltVT == MVT::f64) + C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))); + else + C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))); + C = ConstantVector::getSplat(NumElts, C); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, false, 16); + false, false, false, Alignment); + if (VT.isVector()) { + MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + return DAG.getNode(ISD::BITCAST, dl, VT, + DAG.getNode(ISD::AND, dl, ANDVT, + DAG.getNode(ISD::BITCAST, dl, ANDVT, + Op.getOperand(0)), + DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask))); + } return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask); } @@ -8152,10 +8451,11 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const { else C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))); C = ConstantVector::getSplat(NumElts, C); - SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy()); + unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment(); SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(), - false, false, false, 16); + false, false, false, Alignment); if (VT.isVector()) { MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; return DAG.getNode(ISD::BITCAST, dl, VT, @@ -8241,7 +8541,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit); } -SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { SDValue N0 = Op.getOperand(0); DebugLoc dl = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -8252,6 +8552,98 @@ SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT)); } +// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able. +// +SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const { + assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree."); + + if (!Subtarget->hasSSE41()) + return SDValue(); + + if (!Op->hasOneUse()) + return SDValue(); + + SDNode *N = Op.getNode(); + DebugLoc DL = N->getDebugLoc(); + + SmallVector<SDValue, 8> Opnds; + DenseMap<SDValue, unsigned> VecInMap; + EVT VT = MVT::Other; + + // Recognize a special case where a vector is casted into wide integer to + // test all 0s. + Opnds.push_back(N->getOperand(0)); + Opnds.push_back(N->getOperand(1)); + + for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { + SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot; + // BFS traverse all OR'd operands. + if (I->getOpcode() == ISD::OR) { + Opnds.push_back(I->getOperand(0)); + Opnds.push_back(I->getOperand(1)); + // Re-evaluate the number of nodes to be traversed. 
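LowerFABS and LowerFNEG above load a splatted mask from the constant pool and apply FAND or FXOR (or an integer AND/XOR on the bitcast vector). The bit-level identities they rely on, shown on a scalar double purely to illustrate the masks ~(1ULL << 63) and (1ULL << 63):

#include <cstdint>
#include <cstring>

double fabs_bits(double x) {
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= ~(1ULL << 63);                // AND clears the sign bit.
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}

double fneg_bits(double x) {
  uint64_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits ^= 1ULL << 63;                   // XOR flips the sign bit.
  std::memcpy(&x, &bits, sizeof(bits));
  return x;
}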
+ e += 2; // 2 more nodes (LHS and RHS) are pushed. + continue; + } + + // Quit if a non-EXTRACT_VECTOR_ELT + if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // Quit if without a constant index. + SDValue Idx = I->getOperand(1); + if (!isa<ConstantSDNode>(Idx)) + return SDValue(); + + SDValue ExtractedFromVec = I->getOperand(0); + DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec); + if (M == VecInMap.end()) { + VT = ExtractedFromVec.getValueType(); + // Quit if not 128/256-bit vector. + if (!VT.is128BitVector() && !VT.is256BitVector()) + return SDValue(); + // Quit if not the same type. + if (VecInMap.begin() != VecInMap.end() && + VT != VecInMap.begin()->first.getValueType()) + return SDValue(); + M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first; + } + M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue(); + } + + assert((VT.is128BitVector() || VT.is256BitVector()) && + "Not extracted from 128-/256-bit vector."); + + unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U; + SmallVector<SDValue, 8> VecIns; + + for (DenseMap<SDValue, unsigned>::const_iterator + I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) { + // Quit if not all elements are used. + if (I->second != FullMask) + return SDValue(); + VecIns.push_back(I->first); + } + + EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; + + // Cast all vectors into TestVT for PTEST. + for (unsigned i = 0, e = VecIns.size(); i < e; ++i) + VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]); + + // If more than one full vectors are evaluated, OR them first before PTEST. + for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) { + // Each iteration will OR 2 nodes and append the result until there is only + // 1 node left, i.e. the final OR'd value of all vectors. + SDValue LHS = VecIns[Slot]; + SDValue RHS = VecIns[Slot + 1]; + VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS)); + } + + return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, + VecIns.back(), VecIns.back()); +} + /// Emit nodes that will be selected as "test Op0,Op0", or something /// equivalent. SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, @@ -8285,7 +8677,33 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, unsigned Opcode = 0; unsigned NumOperands = 0; - switch (Op.getNode()->getOpcode()) { + + // Truncate operations may prevent the merge of the SETCC instruction + // and the arithmetic intruction before it. Attempt to truncate the operands + // of the arithmetic instruction and use a reduced bit-width instruction. + bool NeedTruncation = false; + SDValue ArithOp = Op; + if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) { + SDValue Arith = Op->getOperand(0); + // Both the trunc and the arithmetic op need to have one user each. + if (Arith->hasOneUse()) + switch (Arith.getOpcode()) { + default: break; + case ISD::ADD: + case ISD::SUB: + case ISD::AND: + case ISD::OR: + case ISD::XOR: { + NeedTruncation = true; + ArithOp = Arith; + } + } + } + + // NOTICE: In the code below we use ArithOp to hold the arithmetic operation + // which may be the result of a CAST. We use the variable 'Op', which is the + // non-casted variable when we check for possible users. + switch (ArithOp.getOpcode()) { case ISD::ADD: // Due to an isel shortcoming, be conservative if this add is likely to be // selected as part of a load-modify-store instruction. 
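LowerVectorAllZeroTest above recognizes an OR tree built from all the extracted elements of one or two vectors (the usual result of casting a vector to a wide integer and comparing it against zero) and answers the whole comparison with a single PTEST. The scalar equivalence it relies on, sketched for a hypothetical 4 x i32 value:

#include <cstdint>

bool all_zero(const uint32_t v[4]) {
  // OR-ing every element is zero if and only if every element is zero,
  // which a PTEST of the vector against itself can decide in one instruction.
  return (v[0] | v[1] | v[2] | v[3]) == 0;
}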
When the root node @@ -8305,7 +8723,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, goto default_case; if (ConstantSDNode *C = - dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) { + dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) { // An add of one will be selected as an INC. if (C->getAPIntValue() == 1) { Opcode = X86ISD::INC; @@ -8341,7 +8759,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && - (User->getOpcode() != ISD::SELECT || UOpNo != 0)) { + !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) { NonFlagUse = true; break; } @@ -8362,14 +8780,20 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, goto default_case; // Otherwise use a regular EFLAGS-setting instruction. - switch (Op.getNode()->getOpcode()) { + switch (ArithOp.getOpcode()) { default: llvm_unreachable("unexpected operator!"); - case ISD::SUB: - Opcode = X86ISD::SUB; - break; - case ISD::OR: Opcode = X86ISD::OR; break; + case ISD::SUB: Opcode = X86ISD::SUB; break; case ISD::XOR: Opcode = X86ISD::XOR; break; case ISD::AND: Opcode = X86ISD::AND; break; + case ISD::OR: { + if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { + SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG); + if (EFLAGS.getNode()) + return EFLAGS; + } + Opcode = X86ISD::OR; + break; + } } NumOperands = 2; @@ -8387,19 +8811,40 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, break; } + // If we found that truncation is beneficial, perform the truncation and + // update 'Op'. + if (NeedTruncation) { + EVT VT = Op.getValueType(); + SDValue WideVal = Op->getOperand(0); + EVT WideVT = WideVal.getValueType(); + unsigned ConvertedOp = 0; + // Use a target machine opcode to prevent further DAGCombine + // optimizations that may separate the arithmetic operations + // from the setcc node. + switch (WideVal.getOpcode()) { + default: break; + case ISD::ADD: ConvertedOp = X86ISD::ADD; break; + case ISD::SUB: ConvertedOp = X86ISD::SUB; break; + case ISD::AND: ConvertedOp = X86ISD::AND; break; + case ISD::OR: ConvertedOp = X86ISD::OR; break; + case ISD::XOR: ConvertedOp = X86ISD::XOR; break; + } + + if (ConvertedOp) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) { + SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0)); + SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1)); + Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1); + } + } + } + if (Opcode == 0) // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, Op.getValueType())); - if (Opcode == X86ISD::CMP) { - SDValue New = DAG.getNode(Opcode, dl, MVT::i32, Op.getOperand(0), - Op.getOperand(1)); - // We can't replace usage of SUB with CMP. - // The SUB node will be removed later because there is no use of it. - return SDValue(New.getNode(), 0); - } - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); SmallVector<SDValue, 4> Ops; for (unsigned i = 0; i != NumOperands; ++i) @@ -8958,6 +9403,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { } } + // X86 doesn't have an i8 cmov. If both operands are the result of a truncate + // widen the cmov and push the truncate through. This avoids introducing a new + // branch during isel and doesn't add any extensions. 
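The NeedTruncation/ConvertedOp logic above narrows an arithmetic operation whose only use is a truncate feeding the compare, so the flag-setting instruction can be emitted at the narrow width instead of a wide op followed by a separate test. The identity that makes this safe for ADD, SUB, AND, OR and XOR, shown for AND at the source level:

#include <cstdint>

bool low_byte_and_is_zero(uint32_t x, uint32_t y) {
  // (uint8_t)(x & y) == ((uint8_t)x & (uint8_t)y), so the AND can be done
  // at 8 bits and its flags consumed directly by the branch.
  return static_cast<uint8_t>(x & y) == 0;
}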
+ if (Op.getValueType() == MVT::i8 && + Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { + SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); + if (T1.getValueType() == T2.getValueType() && + // Blacklist CopyFromReg to avoid partial register stalls. + T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ + SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); + SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); + return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); + } + } + // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); @@ -9312,7 +9772,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); Flag = Chain.getValue(1); - Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1); + Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), + SPTy).getValue(1); SDValue Ops1[2] = { Chain.getValue(0), Chain }; return DAG.getMergeValues(Ops1, 2, dl); @@ -9395,7 +9856,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { EVT ArgVT = Op.getNode()->getValueType(0); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy); + uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); uint8_t ArgMode; // Decide which area this value should be read from. @@ -9415,7 +9876,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { // Sanity Check: Make sure using fp_offset makes sense. assert(!getTargetMachine().Options.UseSoftFloat && !(DAG.getMachineFunction() - .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) && + .getFunction()->getFnAttributes() + .hasAttribute(Attributes::NoImplicitFloat)) && Subtarget->hasSSE1()); } @@ -9446,7 +9908,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { false, false, false, 0); } -SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { // X86-64 va_list is a struct { i32, i32, i8*, i8* }. assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!"); SDValue Chain = Op.getOperand(0); @@ -9507,8 +9970,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -SDValue -X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); switch (IntNo) { @@ -9896,62 +10358,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const Op.getOperand(1), Op.getOperand(2), DAG); } - // Fix vector shift instructions where the last operand is a non-immediate - // i32 value. - case Intrinsic::x86_mmx_pslli_w: - case Intrinsic::x86_mmx_pslli_d: - case Intrinsic::x86_mmx_pslli_q: - case Intrinsic::x86_mmx_psrli_w: - case Intrinsic::x86_mmx_psrli_d: - case Intrinsic::x86_mmx_psrli_q: - case Intrinsic::x86_mmx_psrai_w: - case Intrinsic::x86_mmx_psrai_d: { - SDValue ShAmt = Op.getOperand(2); - if (isa<ConstantSDNode>(ShAmt)) - return SDValue(); - - unsigned NewIntNo; - switch (IntNo) { - default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
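The i8 CMOV change above widens a select of two truncated values into a 32-bit CMOV and pushes the truncate through, since x86 has no 8-bit CMOV. The rewrite is legal because truncation commutes with select, as this scalar sketch illustrates:

#include <cstdint>

uint8_t pick_low_byte(bool c, uint32_t a, uint32_t b) {
  // (uint8_t)(c ? a : b) == (c ? (uint8_t)a : (uint8_t)b); selecting on the
  // wide values first lets a 32-bit CMOV do the work.
  return static_cast<uint8_t>(c ? a : b);
}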
- case Intrinsic::x86_mmx_pslli_w: - NewIntNo = Intrinsic::x86_mmx_psll_w; - break; - case Intrinsic::x86_mmx_pslli_d: - NewIntNo = Intrinsic::x86_mmx_psll_d; - break; - case Intrinsic::x86_mmx_pslli_q: - NewIntNo = Intrinsic::x86_mmx_psll_q; - break; - case Intrinsic::x86_mmx_psrli_w: - NewIntNo = Intrinsic::x86_mmx_psrl_w; - break; - case Intrinsic::x86_mmx_psrli_d: - NewIntNo = Intrinsic::x86_mmx_psrl_d; - break; - case Intrinsic::x86_mmx_psrli_q: - NewIntNo = Intrinsic::x86_mmx_psrl_q; - break; - case Intrinsic::x86_mmx_psrai_w: - NewIntNo = Intrinsic::x86_mmx_psra_w; - break; - case Intrinsic::x86_mmx_psrai_d: - NewIntNo = Intrinsic::x86_mmx_psra_d; - break; - } - - // The vector shift intrinsics with scalars uses 32b shift amounts but - // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits - // to be zero. - ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt, - DAG.getConstant(0, MVT::i32)); -// FIXME this must be lowered to get rid of the invalid type. - - EVT VT = Op.getValueType(); - ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt); - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(NewIntNo, MVT::i32), - Op.getOperand(1), ShAmt); - } case Intrinsic::x86_sse42_pcmpistria128: case Intrinsic::x86_sse42_pcmpestria128: case Intrinsic::x86_sse42_pcmpistric128: @@ -10030,11 +10436,78 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size()); } + case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: { + unsigned Opc; + switch (IntNo) { + default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
+ case Intrinsic::x86_fma_vfmadd_ps: + case Intrinsic::x86_fma_vfmadd_pd: + case Intrinsic::x86_fma_vfmadd_ps_256: + case Intrinsic::x86_fma_vfmadd_pd_256: + Opc = X86ISD::FMADD; + break; + case Intrinsic::x86_fma_vfmsub_ps: + case Intrinsic::x86_fma_vfmsub_pd: + case Intrinsic::x86_fma_vfmsub_ps_256: + case Intrinsic::x86_fma_vfmsub_pd_256: + Opc = X86ISD::FMSUB; + break; + case Intrinsic::x86_fma_vfnmadd_ps: + case Intrinsic::x86_fma_vfnmadd_pd: + case Intrinsic::x86_fma_vfnmadd_ps_256: + case Intrinsic::x86_fma_vfnmadd_pd_256: + Opc = X86ISD::FNMADD; + break; + case Intrinsic::x86_fma_vfnmsub_ps: + case Intrinsic::x86_fma_vfnmsub_pd: + case Intrinsic::x86_fma_vfnmsub_ps_256: + case Intrinsic::x86_fma_vfnmsub_pd_256: + Opc = X86ISD::FNMSUB; + break; + case Intrinsic::x86_fma_vfmaddsub_ps: + case Intrinsic::x86_fma_vfmaddsub_pd: + case Intrinsic::x86_fma_vfmaddsub_ps_256: + case Intrinsic::x86_fma_vfmaddsub_pd_256: + Opc = X86ISD::FMADDSUB; + break; + case Intrinsic::x86_fma_vfmsubadd_ps: + case Intrinsic::x86_fma_vfmsubadd_pd: + case Intrinsic::x86_fma_vfmsubadd_ps_256: + case Intrinsic::x86_fma_vfmsubadd_pd_256: + Opc = X86ISD::FMSUBADD; + break; + } + + return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + } } } -SDValue -X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); switch (IntNo) { @@ -10072,21 +10545,21 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); DebugLoc dl = Op.getDebugLoc(); + EVT PtrVT = getPointerTy(); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = - DAG.getConstant(TD->getPointerSize(), - Subtarget->is64Bit() ? MVT::i64 : MVT::i32); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, dl, getPointerTy(), + DAG.getConstant(RegInfo->getSlotSize(), PtrVT); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); } // Just load the return address. SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, MachinePointerInfo(), false, false, false, 0); } @@ -10108,7 +10581,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { - return DAG.getIntPtrConstant(2*TD->getPointerSize()); + return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { @@ -10123,7 +10596,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { unsigned StoreAddrReg = (Subtarget->is64Bit() ? 
X86::RCX : X86::ECX); SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, - DAG.getIntPtrConstant(TD->getPointerSize())); + DAG.getIntPtrConstant(RegInfo->getSlotSize())); StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); @@ -10134,8 +10607,22 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { Chain, DAG.getRegister(StoreAddrReg, getPointerTy())); } -SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, + DAG.getVTList(MVT::i32, MVT::Other), + Op.getOperand(0), Op.getOperand(1)); +} + +SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, + Op.getOperand(0), Op.getOperand(1)); +} + +static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { return Op.getOperand(0); } @@ -10148,6 +10635,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, DebugLoc dl = Op.getDebugLoc(); const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); + const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo(); if (Subtarget->is64Bit()) { SDValue OutChains[6]; @@ -10156,8 +10644,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. - const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10); - const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11); + const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; + const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix @@ -10230,7 +10718,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, for (FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end(); I != E; ++I, ++Idx) - if (Attrs.paramHasAttr(Idx, Attribute::InReg)) + if (Attrs.getParamAttributes(Idx).hasAttribute(Attributes::InReg)) // FIXME: should only count parameters that are lowered to integers. InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32; @@ -10259,7 +10747,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, // This is storing the opcode for MOV32ri. const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. 
- const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg); + const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(MOV32ri|N86Reg, MVT::i8), Trmp, MachinePointerInfo(TrmpAddr), @@ -10358,7 +10846,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal); } -SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -10392,8 +10880,7 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const { return Op; } -SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); EVT OpVT = VT; unsigned NumBits = VT.getSizeInBits(); @@ -10418,7 +10905,7 @@ SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op, return Op; } -SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); unsigned NumBits = VT.getSizeInBits(); DebugLoc dl = Op.getDebugLoc(); @@ -10467,21 +10954,22 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) { DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2)); } -SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) { assert(Op.getValueType().is256BitVector() && Op.getValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } -SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) { assert(Op.getValueType().is256BitVector() && Op.getValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); return Lower256IntArith(Op, DAG); } -SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { EVT VT = Op.getValueType(); // Decompose 256-bit ops into smaller 128-bit ops. @@ -10756,7 +11244,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus // a "setcc" instruction that checks the overflow flag. The "brcond" lowering // looks for this combo and may remove the "setcc" instruction if the "setcc" @@ -10871,7 +11359,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra); LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra); - return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);; + return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2); } // fall through case MVT::v4i32: @@ -10884,7 +11372,8 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, } -SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ +static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); // Go ahead and emit the fence on x86-64 even if we asked for no-sse2. 
@@ -10929,8 +11418,8 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{ return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); } -SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { DebugLoc dl = Op.getDebugLoc(); AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); @@ -10968,7 +11457,8 @@ SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op, } -SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { EVT T = Op.getValueType(); DebugLoc DL = Op.getDebugLoc(); unsigned Reg = 0; @@ -10999,8 +11489,8 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const { return cpOut; } -SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, - SelectionDAG &DAG) const { +static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget, + SelectionDAG &DAG) { assert(Subtarget->is64Bit() && "Result not type legalized?"); SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue TheChain = Op.getOperand(0); @@ -11018,8 +11508,7 @@ SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op, return DAG.getMergeValues(Ops, 2, dl); } -SDValue X86TargetLowering::LowerBITCAST(SDValue Op, - SelectionDAG &DAG) const { +SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT SrcVT = Op.getOperand(0).getValueType(); EVT DstVT = Op.getValueType(); assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() && @@ -11039,7 +11528,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, return SDValue(); } -SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const { +static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) { SDNode *Node = Op.getNode(); DebugLoc dl = Node->getDebugLoc(); EVT T = Node->getValueType(0); @@ -11112,9 +11601,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("Should not custom lower this!"); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); - case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG); - case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG); - case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG); + case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, Subtarget, DAG); + case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); + case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); @@ -11122,8 +11611,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); - case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); + case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); + case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); case ISD::SCALAR_TO_VECTOR: return 
LowerSCALAR_TO_VECTOR(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); @@ -11135,8 +11624,11 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); + case ISD::TRUNCATE: return lowerTRUNCATE(Op, DAG); + case ISD::ZERO_EXTEND: return lowerZERO_EXTEND(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); + case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG); case ISD::FABS: return LowerFABS(Op, DAG); case ISD::FNEG: return LowerFNEG(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); @@ -11147,7 +11639,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); - case ISD::VACOPY: return LowerVACOPY(Op, DAG); + case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); @@ -11156,13 +11648,15 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); + case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); + case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::CTLZ: return LowerCTLZ(Op, DAG); case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); - case ISD::MUL: return LowerMUL(Op, DAG); + case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, DAG); @@ -11172,7 +11666,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); - case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG); + case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::ADDC: case ISD::ADDE: @@ -11265,6 +11759,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::UINT_TO_FP: { + if (N->getOperand(0).getValueType() != MVT::v2i32 && + N->getValueType(0) != MVT::v2f32) + return; + SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, + N->getOperand(0)); + SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), + MVT::f64); + SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias)); + Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or); + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); + Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); + return; + } + case ISD::FP_ROUND: { + SDValue V = 
DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); + Results.push_back(V); + return; + } case ISD::READCYCLECOUNTER: { SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); SDValue TheChain = N->getOperand(0); @@ -11332,6 +11847,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_LOAD_OR: case ISD::ATOMIC_LOAD_SUB: case ISD::ATOMIC_LOAD_XOR: + case ISD::ATOMIC_LOAD_MAX: + case ISD::ATOMIC_LOAD_MIN: + case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_SWAP: { unsigned Opc; switch (N->getOpcode()) { @@ -11354,6 +11873,18 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, case ISD::ATOMIC_LOAD_XOR: Opc = X86ISD::ATOMXOR64_DAG; break; + case ISD::ATOMIC_LOAD_MAX: + Opc = X86ISD::ATOMMAX64_DAG; + break; + case ISD::ATOMIC_LOAD_MIN: + Opc = X86ISD::ATOMMIN64_DAG; + break; + case ISD::ATOMIC_LOAD_UMAX: + Opc = X86ISD::ATOMUMAX64_DAG; + break; + case ISD::ATOMIC_LOAD_UMIN: + Opc = X86ISD::ATOMUMIN64_DAG; + break; case ISD::ATOMIC_SWAP: Opc = X86ISD::ATOMSWAP64_DAG; break; @@ -11420,11 +11951,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FHSUB: return "X86ISD::FHSUB"; case X86ISD::FMAX: return "X86ISD::FMAX"; case X86ISD::FMIN: return "X86ISD::FMIN"; + case X86ISD::FMAXC: return "X86ISD::FMAXC"; + case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; + case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; + case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN"; case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN"; case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m"; @@ -11440,7 +11975,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VZEXT: return "X86ISD::VZEXT"; + case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; + case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ"; case X86ISD::VSHL: return "X86ISD::VSHL"; @@ -11507,6 +12045,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB"; case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD"; + case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; + case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; } } @@ -11655,430 +12195,724 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask, // X86 Scheduler Hooks //===----------------------------------------------------------------------===// -// private utility function -MachineBasicBlock * -X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr, - MachineBasicBlock *MBB, - unsigned regOpc, - unsigned immOpc, - unsigned LoadOpc, - unsigned CXchgOpc, - unsigned notOpc, - unsigned EAXreg, - const TargetRegisterClass *RC, - bool Invert) const { - // For the atomic bitwise operator, we generate - // thisMBB: - // newMBB: - // ld t1 = [bitinstr.addr] - // op t2 = t1, [bitinstr.val] - // not t3 = t2 (if Invert) - // mov EAX = t1 
- // lcs dest = [bitinstr.addr], t3 [EAX is implicit] - // bz newMBB - // fallthrough -->nextMBB - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; +/// Utility function to emit xbegin specifying the start of an RTM region. +static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB, + const TargetInstrInfo *TII) { + DebugLoc DL = MI->getDebugLoc(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + // For the v = xbegin(), we generate + // + // thisMBB: + // xbegin sinkMBB + // + // mainMBB: + // eax = -1 + // + // sinkMBB: + // v = eax - /// First build the CFG - MachineFunction *F = MBB->getParent(); MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, newMBB); - F->insert(MBBIter, nextMBB); - - // Transfer the remainder of thisMBB and its successor edges to nextMBB. - nextMBB->splice(nextMBB->begin(), thisMBB, - llvm::next(MachineBasicBlock::iterator(bInstr)), - thisMBB->end()); - nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); - - // Update thisMBB to fall through to newMBB - thisMBB->addSuccessor(newMBB); - - // newMBB jumps to itself and fall through to nextMBB - newMBB->addSuccessor(nextMBB); - newMBB->addSuccessor(newMBB); - - // Insert instructions into newMBB based on incoming instruction - assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 && - "unexpected number of operands"); - DebugLoc dl = bInstr->getDebugLoc(); - MachineOperand& destOper = bInstr->getOperand(0); - MachineOperand* argOpers[2 + X86::AddrNumOperands]; - int numArgs = bInstr->getNumOperands() - 1; - for (int i=0; i < numArgs; ++i) - argOpers[i] = &bInstr->getOperand(i+1); - - // x86 address has 4 operands: base, index, scale, and displacement - int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] - int valArgIndx = lastAddrIndx + 1; - - unsigned t1 = F->getRegInfo().createVirtualRegister(RC); - MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - - unsigned t2 = F->getRegInfo().createVirtualRegister(RC); - assert((argOpers[valArgIndx]->isReg() || - argOpers[valArgIndx]->isImm()) && - "invalid operand"); - if (argOpers[valArgIndx]->isReg()) - MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2); - else - MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2); - MIB.addReg(t1); - (*MIB).addOperand(*argOpers[valArgIndx]); + MachineFunction *MF = MBB->getParent(); + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); - unsigned t3 = F->getRegInfo().createVirtualRegister(RC); - if (Invert) { - MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2); - } - else - t3 = t2; + // Transfer the remainder of BB and its successor edges to sinkMBB. 
+ sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + // xbegin sinkMBB + // # fallthrough to mainMBB + // # abortion to sinkMBB + BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB); + thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(sinkMBB); + + // mainMBB: + // EAX = -1 + BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + // EAX is live into the sinkMBB + sinkMBB->addLiveIn(X86::EAX); + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::EAX); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg); - MIB.addReg(t1); + MI->eraseFromParent(); + return sinkMBB; +} - MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc)); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - MIB.addReg(t3); - assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); - (*MIB).setMemRefs(bInstr->memoperands_begin(), - bInstr->memoperands_end()); +// Get CMPXCHG opcode for the specified data type. +static unsigned getCmpXChgOpcode(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i8: return X86::LCMPXCHG8; + case MVT::i16: return X86::LCMPXCHG16; + case MVT::i32: return X86::LCMPXCHG32; + case MVT::i64: return X86::LCMPXCHG64; + default: + break; + } + llvm_unreachable("Invalid operand size!"); +} - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); - MIB.addReg(EAXreg); +// Get LOAD opcode for the specified data type. +static unsigned getLoadOpcode(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i8: return X86::MOV8rm; + case MVT::i16: return X86::MOV16rm; + case MVT::i32: return X86::MOV32rm; + case MVT::i64: return X86::MOV64rm; + default: + break; + } + llvm_unreachable("Invalid operand size!"); +} - // insert branch - BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); +// Get opcode of the non-atomic one from the specified atomic instruction. +static unsigned getNonAtomicOpcode(unsigned Opc) { + switch (Opc) { + case X86::ATOMAND8: return X86::AND8rr; + case X86::ATOMAND16: return X86::AND16rr; + case X86::ATOMAND32: return X86::AND32rr; + case X86::ATOMAND64: return X86::AND64rr; + case X86::ATOMOR8: return X86::OR8rr; + case X86::ATOMOR16: return X86::OR16rr; + case X86::ATOMOR32: return X86::OR32rr; + case X86::ATOMOR64: return X86::OR64rr; + case X86::ATOMXOR8: return X86::XOR8rr; + case X86::ATOMXOR16: return X86::XOR16rr; + case X86::ATOMXOR32: return X86::XOR32rr; + case X86::ATOMXOR64: return X86::XOR64rr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction with +// extra opcode. 
+static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc, + unsigned &ExtraOpc) { + switch (Opc) { + case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr; + case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr; + case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr; + case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr; + case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr; + case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr; + case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr; + case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr; + case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr; + case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr; + case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr; + case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr; + case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr; + case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr; + case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr; + case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr; + case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr; + case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr; + case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr; + case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction for +// 64-bit data type on 32-bit target. +static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) { + switch (Opc) { + case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr; + case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr; + case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr; + case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr; + case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr; + case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr; + case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr; + case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr; + case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr; + case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} + +// Get opcode of the non-atomic one from the specified atomic instruction for +// 64-bit data type on 32-bit target with extra opcode. +static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc, + unsigned &HiOpc, + unsigned &ExtraOpc) { + switch (Opc) { + case X86::ATOMNAND6432: + ExtraOpc = X86::NOT32r; + HiOpc = X86::AND32rr; + return X86::AND32rr; + } + llvm_unreachable("Unhandled atomic-load-op opcode!"); +} - bInstr->eraseFromParent(); // The pseudo instruction is gone now. - return nextMBB; +// Get pseudo CMOV opcode from the specified data type. +static unsigned getPseudoCMOVOpc(EVT VT) { + switch (VT.getSimpleVT().SimpleTy) { + case MVT::i8: return X86::CMOV_GR8; + case MVT::i16: return X86::CMOV_GR16; + case MVT::i32: return X86::CMOV_GR32; + default: + break; + } + llvm_unreachable("Unknown CMOV opcode!"); } -// private utility function: 64 bit atomics on 32 bit host. +// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions. 
+// They will be translated into a spin-loop or compare-exchange loop from +// +// ... +// dst = atomic-fetch-op MI.addr, MI.val +// ... +// +// to +// +// ... +// EAX = LOAD MI.addr +// loop: +// t1 = OP MI.val, EAX +// LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] +// JNE loop +// sink: +// dst = EAX +// ... MachineBasicBlock * -X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr, - MachineBasicBlock *MBB, - unsigned regOpcL, - unsigned regOpcH, - unsigned immOpcL, - unsigned immOpcH, - bool Invert) const { - // For the atomic bitwise operator, we generate - // thisMBB (instructions are in pairs, except cmpxchg8b) - // ld t1,t2 = [bitinstr.addr] - // newMBB: - // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4) - // op t5, t6 <- out1, out2, [bitinstr.val] - // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val]) - // neg t7, t8 < t5, t6 (if Invert) - // mov ECX, EBX <- t5, t6 - // mov EAX, EDX <- t1, t2 - // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit] - // mov t3, t4 <- EAX, EDX - // bz newMBB - // result in out1, out2 - // fallthrough -->nextMBB - - const TargetRegisterClass *RC = &X86::GR32RegClass; - const unsigned LoadOpc = X86::MOV32rm; - const unsigned NotOpc = X86::NOT32r; +X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI, + MachineBasicBlock *MBB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; + DebugLoc DL = MI->getDebugLoc(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 && + "Unexpected number of operands"); + + assert(MI->hasOneMemOperand() && + "Expected atomic-load-op to have one memoperand"); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstReg, SrcReg; + unsigned MemOpndSlot; + + unsigned CurOp = 0; + + DstReg = MI->getOperand(CurOp++).getReg(); + MemOpndSlot = CurOp; + CurOp += X86::AddrNumOperands; + SrcReg = MI->getOperand(CurOp++).getReg(); + + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + MVT::SimpleValueType VT = *RC->vt_begin(); + unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT); + + unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT); + unsigned LOADOpc = getLoadOpcode(VT); + + // For the atomic load-arith operator, we generate + // + // thisMBB: + // EAX = LOAD [MI.addr] + // mainMBB: + // t1 = OP MI.val, EAX + // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined] + // JNE mainMBB + // sinkMBB: - /// First build the CFG - MachineFunction *F = MBB->getParent(); MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, newMBB); - F->insert(MBBIter, nextMBB); - - // Transfer the remainder of thisMBB and its successor edges to nextMBB. 
- nextMBB->splice(nextMBB->begin(), thisMBB, - llvm::next(MachineBasicBlock::iterator(bInstr)), - thisMBB->end()); - nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); - - // Update thisMBB to fall through to newMBB - thisMBB->addSuccessor(newMBB); - - // newMBB jumps to itself and fall through to nextMBB - newMBB->addSuccessor(nextMBB); - newMBB->addSuccessor(newMBB); - - DebugLoc dl = bInstr->getDebugLoc(); - // Insert instructions into newMBB based on incoming instruction - // There are 8 "real" operands plus 9 implicit def/uses, ignored here. - assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 && - "unexpected number of operands"); - MachineOperand& dest1Oper = bInstr->getOperand(0); - MachineOperand& dest2Oper = bInstr->getOperand(1); - MachineOperand* argOpers[2 + X86::AddrNumOperands]; - for (int i=0; i < 2 + X86::AddrNumOperands; ++i) { - argOpers[i] = &bInstr->getOperand(i+2); - - // We use some of the operands multiple times, so conservatively just - // clear any kill flags that might be present. - if (argOpers[i]->isReg() && argOpers[i]->isUse()) - argOpers[i]->setIsKill(false); - } - - // x86 address has 5 operands: base, index, scale, displacement, and segment. - int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] - - unsigned t1 = F->getRegInfo().createVirtualRegister(RC); - MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - unsigned t2 = F->getRegInfo().createVirtualRegister(RC); - MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2); - // add 4 to displacement. - for (int i=0; i <= lastAddrIndx-2; ++i) - (*MIB).addOperand(*argOpers[i]); - MachineOperand newOp3 = *(argOpers[3]); - if (newOp3.isImm()) - newOp3.setImm(newOp3.getImm()+4); - else - newOp3.setOffset(newOp3.getOffset()+4); - (*MIB).addOperand(newOp3); - (*MIB).addOperand(*argOpers[lastAddrIndx]); - - // t3/4 are defined later, at the bottom of the loop - unsigned t3 = F->getRegInfo().createVirtualRegister(RC); - unsigned t4 = F->getRegInfo().createVirtualRegister(RC); - BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg()) - .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB); - BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg()) - .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB); - - // The subsequent operations should be using the destination registers of - // the PHI instructions. 
- t1 = dest1Oper.getReg(); - t2 = dest2Oper.getReg(); - - int valArgIndx = lastAddrIndx + 1; - assert((argOpers[valArgIndx]->isReg() || - argOpers[valArgIndx]->isImm()) && - "invalid operand"); - unsigned t5 = F->getRegInfo().createVirtualRegister(RC); - unsigned t6 = F->getRegInfo().createVirtualRegister(RC); - if (argOpers[valArgIndx]->isReg()) - MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5); - else - MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5); - if (regOpcL != X86::MOV32rr) - MIB.addReg(t1); - (*MIB).addOperand(*argOpers[valArgIndx]); - assert(argOpers[valArgIndx + 1]->isReg() == - argOpers[valArgIndx]->isReg()); - assert(argOpers[valArgIndx + 1]->isImm() == - argOpers[valArgIndx]->isImm()); - if (argOpers[valArgIndx + 1]->isReg()) - MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6); - else - MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6); - if (regOpcH != X86::MOV32rr) - MIB.addReg(t2); - (*MIB).addOperand(*argOpers[valArgIndx + 1]); - - unsigned t7, t8; - if (Invert) { - t7 = F->getRegInfo().createVirtualRegister(RC); - t8 = F->getRegInfo().createVirtualRegister(RC); - MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5); - MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6); - } else { - t7 = t5; - t8 = t6; + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + + MachineInstrBuilder MIB; + + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + MIB.setMemRefs(MMOBegin, MMOEnd); + + thisMBB->addSuccessor(mainMBB); + + // mainMBB: + MachineBasicBlock *origMainMBB = mainMBB; + mainMBB->addLiveIn(AccPhyReg); + + // Copy AccPhyReg as it is used more than once. 
+ unsigned AccReg = MRI.createVirtualRegister(RC); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg) + .addReg(AccPhyReg); + + unsigned t1 = MRI.createVirtualRegister(RC); + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + llvm_unreachable("Unhandled atomic-load-op opcode!"); + case X86::ATOMAND8: + case X86::ATOMAND16: + case X86::ATOMAND32: + case X86::ATOMAND64: + case X86::ATOMOR8: + case X86::ATOMOR16: + case X86::ATOMOR32: + case X86::ATOMOR64: + case X86::ATOMXOR8: + case X86::ATOMXOR16: + case X86::ATOMXOR32: + case X86::ATOMXOR64: { + unsigned ARITHOpc = getNonAtomicOpcode(Opc); + BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg) + .addReg(AccReg); + break; + } + case X86::ATOMNAND8: + case X86::ATOMNAND16: + case X86::ATOMNAND32: + case X86::ATOMNAND64: { + unsigned t2 = MRI.createVirtualRegister(RC); + unsigned NOTOpc; + unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc); + BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg) + .addReg(AccReg); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2); + break; + } + case X86::ATOMMAX8: + case X86::ATOMMAX16: + case X86::ATOMMAX32: + case X86::ATOMMAX64: + case X86::ATOMMIN8: + case X86::ATOMMIN16: + case X86::ATOMMIN32: + case X86::ATOMMIN64: + case X86::ATOMUMAX8: + case X86::ATOMUMAX16: + case X86::ATOMUMAX32: + case X86::ATOMUMAX64: + case X86::ATOMUMIN8: + case X86::ATOMUMIN16: + case X86::ATOMUMIN32: + case X86::ATOMUMIN64: { + unsigned CMPOpc; + unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc); + + BuildMI(mainMBB, DL, TII->get(CMPOpc)) + .addReg(SrcReg) + .addReg(AccReg); + + if (Subtarget->hasCMov()) { + if (VT != MVT::i8) { + // Native support + BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1) + .addReg(SrcReg) + .addReg(AccReg); + } else { + // Promote i8 to i32 to use CMOV32 + const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32); + unsigned SrcReg32 = MRI.createVirtualRegister(RC32); + unsigned AccReg32 = MRI.createVirtualRegister(RC32); + unsigned t2 = MRI.createVirtualRegister(RC32); + + unsigned Undef = MRI.createVirtualRegister(RC32); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef); + + BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32) + .addReg(Undef) + .addReg(SrcReg) + .addImm(X86::sub_8bit); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32) + .addReg(Undef) + .addReg(AccReg) + .addImm(X86::sub_8bit); + + BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2) + .addReg(SrcReg32) + .addReg(AccReg32); + + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1) + .addReg(t2, 0, X86::sub_8bit); + } + } else { + // Use pseudo select and lower them. + assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) && + "Invalid atomic-load-op transformation!"); + unsigned SelOpc = getPseudoCMOVOpc(VT); + X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc); + assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!"); + MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1) + .addReg(SrcReg).addReg(AccReg) + .addImm(CC); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + } + break; + } } - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); - MIB.addReg(t1); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX); - MIB.addReg(t2); + // Copy AccPhyReg back from virtual register. 
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg) + .addReg(AccReg); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX); - MIB.addReg(t7); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX); - MIB.addReg(t8); + MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + MIB.addReg(t1); + MIB.setMemRefs(MMOBegin, MMOEnd); - MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B)); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); + BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand"); - (*MIB).setMemRefs(bInstr->memoperands_begin(), - bInstr->memoperands_end()); + mainMBB->addSuccessor(origMainMBB); + mainMBB->addSuccessor(sinkMBB); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3); - MIB.addReg(X86::EAX); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4); - MIB.addReg(X86::EDX); + // sinkMBB: + sinkMBB->addLiveIn(AccPhyReg); - // insert branch - BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), DstReg) + .addReg(AccPhyReg); - bInstr->eraseFromParent(); // The pseudo instruction is gone now. - return nextMBB; + MI->eraseFromParent(); + return sinkMBB; } -// private utility function +// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic +// instructions. They will be translated into a spin-loop or compare-exchange +// loop from +// +// ... +// dst = atomic-fetch-op MI.addr, MI.val +// ... +// +// to +// +// ... +// EAX = LOAD [MI.addr + 0] +// EDX = LOAD [MI.addr + 4] +// loop: +// EBX = OP MI.val.lo, EAX +// ECX = OP MI.val.hi, EDX +// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] +// JNE loop +// sink: +// dst = EDX:EAX +// ... 
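For readers following the new expansion, here is a rough C++ rendering of the loop shape described in the comment above. This is an illustrative sketch only, not code from the patch: the function name and the use of std::atomic are mine, and the fetch-add flavour stands in for any of the supported operations.

#include <atomic>
#include <cstdint>

// Sketch of what a 64-bit atomic fetch-add becomes on a 32-bit target:
// one initial wide load, then a CMPXCHG8B retry loop.
uint64_t AtomicFetchAdd64(std::atomic<uint64_t> &Mem, uint64_t Val) {
  uint64_t Old = Mem.load(std::memory_order_relaxed); // EDX:EAX = LOAD [addr]
  uint64_t New;
  do {
    New = Old + Val;                                  // ECX:EBX = OP val, EDX:EAX
  } while (!Mem.compare_exchange_weak(Old, New));     // LCMPXCHG8B + JNE loop
  return Old;                                         // dst = EDX:EAX
}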
MachineBasicBlock * -X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, - MachineBasicBlock *MBB, - unsigned cmovOpc) const { - // For the atomic min/max operator, we generate - // thisMBB: - // newMBB: - // ld t1 = [min/max.addr] - // mov t2 = [min/max.val] - // cmp t1, t2 - // cmov[cond] t2 = t1 - // mov EAX = t1 - // lcs dest = [bitinstr.addr], t2 [EAX is implicit] - // bz newMBB - // fallthrough -->nextMBB - // +X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI, + MachineBasicBlock *MBB) const { const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - const BasicBlock *LLVM_BB = MBB->getBasicBlock(); - MachineFunction::iterator MBBIter = MBB; - ++MBBIter; + DebugLoc DL = MI->getDebugLoc(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 && + "Unexpected number of operands"); + + assert(MI->hasOneMemOperand() && + "Expected atomic-load-op32 to have one memoperand"); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstLoReg, DstHiReg; + unsigned SrcLoReg, SrcHiReg; + unsigned MemOpndSlot; + + unsigned CurOp = 0; + + DstLoReg = MI->getOperand(CurOp++).getReg(); + DstHiReg = MI->getOperand(CurOp++).getReg(); + MemOpndSlot = CurOp; + CurOp += X86::AddrNumOperands; + SrcLoReg = MI->getOperand(CurOp++).getReg(); + SrcHiReg = MI->getOperand(CurOp++).getReg(); + + const TargetRegisterClass *RC = &X86::GR32RegClass; + const TargetRegisterClass *RC8 = &X86::GR8RegClass; + + unsigned LCMPXCHGOpc = X86::LCMPXCHG8B; + unsigned LOADOpc = X86::MOV32rm; + + // For the atomic load-arith operator, we generate + // + // thisMBB: + // EAX = LOAD [MI.addr + 0] + // EDX = LOAD [MI.addr + 4] + // mainMBB: + // EBX = OP MI.vallo, EAX + // ECX = OP MI.valhi, EDX + // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined] + // JNE mainMBB + // sinkMBB: - /// First build the CFG - MachineFunction *F = MBB->getParent(); MachineBasicBlock *thisMBB = MBB; - MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(MBBIter, newMBB); - F->insert(MBBIter, nextMBB); - - // Transfer the remainder of thisMBB and its successor edges to nextMBB. 
- nextMBB->splice(nextMBB->begin(), thisMBB, - llvm::next(MachineBasicBlock::iterator(mInstr)), - thisMBB->end()); - nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB); - - // Update thisMBB to fall through to newMBB - thisMBB->addSuccessor(newMBB); - - // newMBB jumps to newMBB and fall through to nextMBB - newMBB->addSuccessor(nextMBB); - newMBB->addSuccessor(newMBB); - - DebugLoc dl = mInstr->getDebugLoc(); - // Insert instructions into newMBB based on incoming instruction - assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 && - "unexpected number of operands"); - MachineOperand& destOper = mInstr->getOperand(0); - MachineOperand* argOpers[2 + X86::AddrNumOperands]; - int numArgs = mInstr->getNumOperands() - 1; - for (int i=0; i < numArgs; ++i) - argOpers[i] = &mInstr->getOperand(i+1); - - // x86 address has 4 operands: base, index, scale, and displacement - int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3] - int valArgIndx = lastAddrIndx + 1; - - unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - - // We only support register and immediate values - assert((argOpers[valArgIndx]->isReg() || - argOpers[valArgIndx]->isImm()) && - "invalid operand"); - - unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - if (argOpers[valArgIndx]->isReg()) - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2); - else - MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2); - (*MIB).addOperand(*argOpers[valArgIndx]); + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX); - MIB.addReg(t1); + MachineInstrBuilder MIB; - MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr)); - MIB.addReg(t1); - MIB.addReg(t2); + // Transfer the remainder of BB and its successor edges to sinkMBB. + sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + // Lo + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Hi + MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32) + else + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); - // Generate movc - unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass); - MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3); - MIB.addReg(t2); - MIB.addReg(t1); + thisMBB->addSuccessor(mainMBB); + + // mainMBB: + MachineBasicBlock *origMainMBB = mainMBB; + mainMBB->addLiveIn(X86::EAX); + mainMBB->addLiveIn(X86::EDX); + + // Copy EDX:EAX as they are used more than once. 
+ unsigned LoReg = MRI.createVirtualRegister(RC); + unsigned HiReg = MRI.createVirtualRegister(RC); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX); + + unsigned t1L = MRI.createVirtualRegister(RC); + unsigned t1H = MRI.createVirtualRegister(RC); + + unsigned Opc = MI->getOpcode(); + switch (Opc) { + default: + llvm_unreachable("Unhandled atomic-load-op6432 opcode!"); + case X86::ATOMAND6432: + case X86::ATOMOR6432: + case X86::ATOMXOR6432: + case X86::ATOMADD6432: + case X86::ATOMSUB6432: { + unsigned HiOpc; + unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); + BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(LoReg).addReg(SrcLoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(HiReg).addReg(SrcHiReg); + break; + } + case X86::ATOMNAND6432: { + unsigned HiOpc, NOTOpc; + unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc); + unsigned t2L = MRI.createVirtualRegister(RC); + unsigned t2H = MRI.createVirtualRegister(RC); + BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L); + BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H); + break; + } + case X86::ATOMMAX6432: + case X86::ATOMMIN6432: + case X86::ATOMUMAX6432: + case X86::ATOMUMIN6432: { + unsigned HiOpc; + unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); + unsigned cL = MRI.createVirtualRegister(RC8); + unsigned cH = MRI.createVirtualRegister(RC8); + unsigned cL32 = MRI.createVirtualRegister(RC); + unsigned cH32 = MRI.createVirtualRegister(RC); + unsigned cc = MRI.createVirtualRegister(RC); + // cl := cmp src_lo, lo + BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) + .addReg(SrcLoReg).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(LoOpc), cL); + BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL); + // ch := cmp src_hi, hi + BuildMI(mainMBB, DL, TII->get(X86::CMP32rr)) + .addReg(SrcHiReg).addReg(HiReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), cH); + BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH); + // cc := if (src_hi == hi) ? 
cl : ch; + if (Subtarget->hasCMov()) { + BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc) + .addReg(cH32).addReg(cL32); + } else { + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc) + .addReg(cH32).addReg(cL32) + .addImm(X86::COND_E); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + } + BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc); + if (Subtarget->hasCMov()) { + BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L) + .addReg(SrcLoReg).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H) + .addReg(SrcHiReg).addReg(HiReg); + } else { + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L) + .addReg(SrcLoReg).addReg(LoReg) + .addImm(X86::COND_NE); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H) + .addReg(SrcHiReg).addReg(HiReg) + .addImm(X86::COND_NE); + mainMBB = EmitLoweredSelect(MIB, mainMBB); + } + break; + } + case X86::ATOMSWAP6432: { + unsigned HiOpc; + unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc); + BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg); + BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg); + break; + } + } + + // Copy EDX:EAX back from HiReg:LoReg + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg); + // Copy ECX:EBX from t1H:t1L + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L); + BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H); + + MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + MIB.setMemRefs(MMOBegin, MMOEnd); - // Cmp and exchange if none has modified the memory location - MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32)); - for (int i=0; i <= lastAddrIndx; ++i) - (*MIB).addOperand(*argOpers[i]); - MIB.addReg(t3); - assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand"); - (*MIB).setMemRefs(mInstr->memoperands_begin(), - mInstr->memoperands_end()); + BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); - MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg()); - MIB.addReg(X86::EAX); + mainMBB->addSuccessor(origMainMBB); + mainMBB->addSuccessor(sinkMBB); - // insert branch - BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB); + // sinkMBB: + sinkMBB->addLiveIn(X86::EAX); + sinkMBB->addLiveIn(X86::EDX); + + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), DstLoReg) + .addReg(X86::EAX); + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(TargetOpcode::COPY), DstHiReg) + .addReg(X86::EDX); - mInstr->eraseFromParent(); // The pseudo instruction is gone now. - return nextMBB; + MI->eraseFromParent(); + return sinkMBB; } // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8 // or XMM0_V32I8 in AVX all of this code can be replaced with that // in the .td file. 
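The EmitAtomicMinMaxWithCustomInserter deleted above is folded into the same unified expansion. As a hedged sketch (the names and the std::atomic phrasing are mine, not from the patch), the signed-max pseudos now amount to a compare plus conditional move inside the usual compare-exchange retry loop:

#include <algorithm>
#include <atomic>
#include <cstdint>

// Sketch of the ATOMMAX32 expansion: CMP + CMOVL pick the larger value,
// LCMPXCHG retries until the location is unchanged by other threads.
int32_t AtomicFetchMax32(std::atomic<int32_t> &Mem, int32_t Val) {
  int32_t Old = Mem.load(std::memory_order_relaxed); // EAX = LOAD [addr]
  int32_t New;
  do {
    New = std::max(Val, Old);                         // CMP SrcReg, AccReg; CMOVL
  } while (!Mem.compare_exchange_weak(Old, New));     // LCMPXCHG + JNE
  return Old;                                         // dst = EAX
}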
-MachineBasicBlock * -X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, - unsigned numArgs, bool memArg) const { - assert(Subtarget->hasSSE42() && - "Target must have SSE4.2 or AVX features enabled"); +static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII) { + unsigned Opc; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break; + case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break; + case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break; + case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break; + case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break; + case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break; + case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break; + case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break; + } DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); + + unsigned NumArgs = MI->getNumOperands(); + for (unsigned i = 1; i < NumArgs; ++i) { + MachineOperand &Op = MI->getOperand(i); + if (!(Op.isReg() && Op.isImplicit())) + MIB.addOperand(Op); + } + if (MI->hasOneMemOperand()) + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + + BuildMI(*BB, MI, dl, + TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) + .addReg(X86::XMM0); + + MI->eraseFromParent(); + return BB; +} + +// FIXME: Custom handling because TableGen doesn't support multiple implicit +// defs in an instruction pattern +static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII) { unsigned Opc; - if (!Subtarget->hasAVX()) { - if (memArg) - Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm; - else - Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr; - } else { - if (memArg) - Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm; - else - Opc = numArgs == 3 ? 
X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr; + switch (MI->getOpcode()) { + default: llvm_unreachable("illegal opcode!"); + case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break; + case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break; + case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break; + case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break; + case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break; + case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break; + case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break; + case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break; } + DebugLoc dl = MI->getDebugLoc(); MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc)); - for (unsigned i = 0; i < numArgs; ++i) { - MachineOperand &Op = MI->getOperand(i+1); + + unsigned NumArgs = MI->getNumOperands(); // remove the results + for (unsigned i = 1; i < NumArgs; ++i) { + MachineOperand &Op = MI->getOperand(i); if (!(Op.isReg() && Op.isImplicit())) MIB.addOperand(Op); } + if (MI->hasOneMemOperand()) + MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg()) - .addReg(X86::XMM0); + .addReg(X86::ECX); MI->eraseFromParent(); return BB; } -MachineBasicBlock * -X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const { +static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB, + const TargetInstrInfo *TII, + const X86Subtarget* Subtarget) { DebugLoc dl = MI->getDebugLoc(); - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); // Address into RAX/EAX, other two args into ECX, EDX. unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r; @@ -12767,6 +13601,203 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, } MachineBasicBlock * +X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + const BasicBlock *BB = MBB->getBasicBlock(); + MachineFunction::iterator I = MBB; + ++I; + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + unsigned DstReg; + unsigned MemOpndSlot = 0; + + unsigned CurOp = 0; + + DstReg = MI->getOperand(CurOp++).getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(DstReg); + assert(RC->hasType(MVT::i32) && "Invalid destination!"); + unsigned mainDstReg = MRI.createVirtualRegister(RC); + unsigned restoreDstReg = MRI.createVirtualRegister(RC); + + MemOpndSlot = CurOp; + + MVT PVT = getPointerTy(); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + // For v = setjmp(buf), we generate + // + // thisMBB: + // buf[LabelOffset] = restoreMBB + // SjLjSetup restoreMBB + // + // mainMBB: + // v_main = 0 + // + // sinkMBB: + // v = phi(main, restore) + // + // restoreMBB: + // v_restore = 1 + + MachineBasicBlock *thisMBB = MBB; + MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); + MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); + MF->insert(I, mainMBB); + MF->insert(I, sinkMBB); + MF->push_back(restoreMBB); + + MachineInstrBuilder MIB; + + // Transfer the remainder of BB and its successor edges to sinkMBB. 
+ sinkMBB->splice(sinkMBB->begin(), MBB, + llvm::next(MachineBasicBlock::iterator(MI)), MBB->end()); + sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // thisMBB: + unsigned PtrStoreOpc = 0; + unsigned LabelReg = 0; + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + Reloc::Model RM = getTargetMachine().getRelocationModel(); + bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) && + (RM == Reloc::Static || RM == Reloc::DynamicNoPIC); + + // Prepare IP either in reg or imm. + if (!UseImmLabel) { + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; + const TargetRegisterClass *PtrRC = getRegClassFor(PVT); + LabelReg = MRI.createVirtualRegister(PtrRC); + if (Subtarget->is64Bit()) { + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB) + .addReg(0); + } else { + const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) + .addReg(XII->getGlobalBaseReg(MF)) + .addImm(0) + .addReg(0) + .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference()) + .addReg(0); + } + } else + PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; + // Store IP + MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset); + else + MIB.addOperand(MI->getOperand(MemOpndSlot + i)); + } + if (!UseImmLabel) + MIB.addReg(LabelReg); + else + MIB.addMBB(restoreMBB); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Setup + MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) + .addMBB(restoreMBB); + MIB.addRegMask(RegInfo->getNoPreservedMask()); + thisMBB->addSuccessor(mainMBB); + thisMBB->addSuccessor(restoreMBB); + + // mainMBB: + // EAX = 0 + BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); + mainMBB->addSuccessor(sinkMBB); + + // sinkMBB: + BuildMI(*sinkMBB, sinkMBB->begin(), DL, + TII->get(X86::PHI), DstReg) + .addReg(mainDstReg).addMBB(mainMBB) + .addReg(restoreDstReg).addMBB(restoreMBB); + + // restoreMBB: + BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); + BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB); + restoreMBB->addSuccessor(sinkMBB); + + MI->eraseFromParent(); + return sinkMBB; +} + +MachineBasicBlock * +X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI, + MachineBasicBlock *MBB) const { + DebugLoc DL = MI->getDebugLoc(); + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + + MachineFunction *MF = MBB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // Memory Reference + MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin(); + MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); + + MVT PVT = getPointerTy(); + assert((PVT == MVT::i64 || PVT == MVT::i32) && + "Invalid Pointer Size!"); + + const TargetRegisterClass *RC = + (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; + unsigned Tmp = MRI.createVirtualRegister(RC); + // Since FP is only updated here but NOT referenced, it's treated as GPR. + unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; + unsigned SP = RegInfo->getStackRegister(); + + MachineInstrBuilder MIB; + + const int64_t LabelOffset = 1 * PVT.getStoreSize(); + const int64_t SPOffset = 2 * PVT.getStoreSize(); + + unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; + unsigned IJmpOpc = (PVT == MVT::i64) ? 
X86::JMP64r : X86::JMP32r; + + // Reload FP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) + MIB.addOperand(MI->getOperand(i)); + MIB.setMemRefs(MMOBegin, MMOEnd); + // Reload IP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(i), LabelOffset); + else + MIB.addOperand(MI->getOperand(i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + // Reload SP + MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP); + for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { + if (i == X86::AddrDisp) + MIB.addDisp(MI->getOperand(i), SPOffset); + else + MIB.addOperand(MI->getOperand(i)); + } + MIB.setMemRefs(MMOBegin, MMOEnd); + // Jump + BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); + + MI->eraseFromParent(); + return MBB; +} + +MachineBasicBlock * X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { switch (MI->getOpcode()) { @@ -12895,198 +13926,101 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, case X86::PCMPESTRM128REG: case X86::VPCMPESTRM128REG: case X86::PCMPESTRM128MEM: - case X86::VPCMPESTRM128MEM: { - unsigned NumArgs; - bool MemArg; - switch (MI->getOpcode()) { - default: llvm_unreachable("illegal opcode!"); - case X86::PCMPISTRM128REG: - case X86::VPCMPISTRM128REG: - NumArgs = 3; MemArg = false; break; - case X86::PCMPISTRM128MEM: - case X86::VPCMPISTRM128MEM: - NumArgs = 3; MemArg = true; break; - case X86::PCMPESTRM128REG: - case X86::VPCMPESTRM128REG: - NumArgs = 5; MemArg = false; break; - case X86::PCMPESTRM128MEM: - case X86::VPCMPESTRM128MEM: - NumArgs = 5; MemArg = true; break; - } - return EmitPCMP(MI, BB, NumArgs, MemArg); - } - - // Thread synchronization. + case X86::VPCMPESTRM128MEM: + assert(Subtarget->hasSSE42() && + "Target must have SSE4.2 or AVX features enabled"); + return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo()); + + // String/text processing lowering. + case X86::PCMPISTRIREG: + case X86::VPCMPISTRIREG: + case X86::PCMPISTRIMEM: + case X86::VPCMPISTRIMEM: + case X86::PCMPESTRIREG: + case X86::VPCMPESTRIREG: + case X86::PCMPESTRIMEM: + case X86::VPCMPESTRIMEM: + assert(Subtarget->hasSSE42() && + "Target must have SSE4.2 or AVX features enabled"); + return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo()); + + // Thread synchronization. case X86::MONITOR: - return EmitMonitor(MI, BB); + return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget); - // Atomic Lowering. 
- case X86::ATOMAND32: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, - X86::AND32ri, X86::MOV32rm, - X86::LCMPXCHG32, - X86::NOT32r, X86::EAX, - &X86::GR32RegClass); - case X86::ATOMOR32: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, - X86::OR32ri, X86::MOV32rm, - X86::LCMPXCHG32, - X86::NOT32r, X86::EAX, - &X86::GR32RegClass); - case X86::ATOMXOR32: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr, - X86::XOR32ri, X86::MOV32rm, - X86::LCMPXCHG32, - X86::NOT32r, X86::EAX, - &X86::GR32RegClass); - case X86::ATOMNAND32: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr, - X86::AND32ri, X86::MOV32rm, - X86::LCMPXCHG32, - X86::NOT32r, X86::EAX, - &X86::GR32RegClass, true); - case X86::ATOMMIN32: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr); - case X86::ATOMMAX32: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr); - case X86::ATOMUMIN32: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr); - case X86::ATOMUMAX32: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr); + // xbegin + case X86::XBEGIN: + return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo()); + // Atomic Lowering. + case X86::ATOMAND8: case X86::ATOMAND16: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, - X86::AND16ri, X86::MOV16rm, - X86::LCMPXCHG16, - X86::NOT16r, X86::AX, - &X86::GR16RegClass); + case X86::ATOMAND32: + case X86::ATOMAND64: + // Fall through + case X86::ATOMOR8: case X86::ATOMOR16: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr, - X86::OR16ri, X86::MOV16rm, - X86::LCMPXCHG16, - X86::NOT16r, X86::AX, - &X86::GR16RegClass); + case X86::ATOMOR32: + case X86::ATOMOR64: + // Fall through case X86::ATOMXOR16: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr, - X86::XOR16ri, X86::MOV16rm, - X86::LCMPXCHG16, - X86::NOT16r, X86::AX, - &X86::GR16RegClass); - case X86::ATOMNAND16: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr, - X86::AND16ri, X86::MOV16rm, - X86::LCMPXCHG16, - X86::NOT16r, X86::AX, - &X86::GR16RegClass, true); - case X86::ATOMMIN16: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr); - case X86::ATOMMAX16: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr); - case X86::ATOMUMIN16: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr); - case X86::ATOMUMAX16: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr); - - case X86::ATOMAND8: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, - X86::AND8ri, X86::MOV8rm, - X86::LCMPXCHG8, - X86::NOT8r, X86::AL, - &X86::GR8RegClass); - case X86::ATOMOR8: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr, - X86::OR8ri, X86::MOV8rm, - X86::LCMPXCHG8, - X86::NOT8r, X86::AL, - &X86::GR8RegClass); case X86::ATOMXOR8: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr, - X86::XOR8ri, X86::MOV8rm, - X86::LCMPXCHG8, - X86::NOT8r, X86::AL, - &X86::GR8RegClass); - case X86::ATOMNAND8: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr, - X86::AND8ri, X86::MOV8rm, - X86::LCMPXCHG8, - X86::NOT8r, X86::AL, - &X86::GR8RegClass, true); - // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way. - // This group is for 64-bit host. 
- case X86::ATOMAND64: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, - X86::AND64ri32, X86::MOV64rm, - X86::LCMPXCHG64, - X86::NOT64r, X86::RAX, - &X86::GR64RegClass); - case X86::ATOMOR64: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr, - X86::OR64ri32, X86::MOV64rm, - X86::LCMPXCHG64, - X86::NOT64r, X86::RAX, - &X86::GR64RegClass); + case X86::ATOMXOR32: case X86::ATOMXOR64: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr, - X86::XOR64ri32, X86::MOV64rm, - X86::LCMPXCHG64, - X86::NOT64r, X86::RAX, - &X86::GR64RegClass); + // Fall through + case X86::ATOMNAND8: + case X86::ATOMNAND16: + case X86::ATOMNAND32: case X86::ATOMNAND64: - return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr, - X86::AND64ri32, X86::MOV64rm, - X86::LCMPXCHG64, - X86::NOT64r, X86::RAX, - &X86::GR64RegClass, true); - case X86::ATOMMIN64: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr); + // Fall through + case X86::ATOMMAX8: + case X86::ATOMMAX16: + case X86::ATOMMAX32: case X86::ATOMMAX64: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr); - case X86::ATOMUMIN64: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr); + // Fall through + case X86::ATOMMIN8: + case X86::ATOMMIN16: + case X86::ATOMMIN32: + case X86::ATOMMIN64: + // Fall through + case X86::ATOMUMAX8: + case X86::ATOMUMAX16: + case X86::ATOMUMAX32: case X86::ATOMUMAX64: - return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr); + // Fall through + case X86::ATOMUMIN8: + case X86::ATOMUMIN16: + case X86::ATOMUMIN32: + case X86::ATOMUMIN64: + return EmitAtomicLoadArith(MI, BB); // This group does 64-bit operations on a 32-bit host. case X86::ATOMAND6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::AND32rr, X86::AND32rr, - X86::AND32ri, X86::AND32ri, - false); case X86::ATOMOR6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::OR32rr, X86::OR32rr, - X86::OR32ri, X86::OR32ri, - false); case X86::ATOMXOR6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::XOR32rr, X86::XOR32rr, - X86::XOR32ri, X86::XOR32ri, - false); case X86::ATOMNAND6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::AND32rr, X86::AND32rr, - X86::AND32ri, X86::AND32ri, - true); case X86::ATOMADD6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::ADD32rr, X86::ADC32rr, - X86::ADD32ri, X86::ADC32ri, - false); case X86::ATOMSUB6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::SUB32rr, X86::SBB32rr, - X86::SUB32ri, X86::SBB32ri, - false); + case X86::ATOMMAX6432: + case X86::ATOMMIN6432: + case X86::ATOMUMAX6432: + case X86::ATOMUMIN6432: case X86::ATOMSWAP6432: - return EmitAtomicBit6432WithCustomInserter(MI, BB, - X86::MOV32rr, X86::MOV32rr, - X86::MOV32ri, X86::MOV32ri, - false); + return EmitAtomicLoadArith6432(MI, BB); + case X86::VASTART_SAVE_XMM_REGS: return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); case X86::VAARG_64: return EmitVAARG64WithCustomInserter(MI, BB); + + case X86::EH_SjLj_SetJmp32: + case X86::EH_SjLj_SetJmp64: + return emitEHSjLjSetJmp(MI, BB); + + case X86::EH_SjLj_LongJmp32: + case X86::EH_SjLj_LongJmp64: + return emitEHSjLjLongJmp(MI, BB); } } @@ -13333,12 +14267,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG, } -/// DCI, PerformTruncateCombine - Converts truncate operation to +/// PerformTruncateCombine - Converts truncate operation to /// a sequence of vector shuffle operations. 
/// It is possible when we truncate 256-bit vector to 128-bit vector - -SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, - DAGCombinerInfo &DCI) const { +static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { if (!DCI.isBeforeLegalizeOps()) return SDValue(); @@ -13530,7 +14464,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG, // alignment is valid. unsigned Align = LN0->getAlignment(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned NewAlign = TLI.getTargetData()-> + unsigned NewAlign = TLI.getDataLayout()-> getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT)) @@ -13561,6 +14495,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG, return NewOp; SDValue InputVector = N->getOperand(0); + // Detect whether we are trying to convert from mmx to i32 and the bitcast + // from mmx to v2i32 has a single usage. + if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST && + InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx && + InputVector.hasOneUse() && N->getValueType(0) == MVT::i32) + return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(), + N->getValueType(0), + InputVector.getNode()->getOperand(0)); // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. @@ -13961,7 +14903,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, // // where Op could be BRCOND or CMOV. // -static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { +static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { // Quit if not CMP and SUB with its value result used. if (Cmp.getOpcode() != X86ISD::CMP && (Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0))) @@ -13997,40 +14939,55 @@ static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { if (SetCC.getOpcode() == ISD::ZERO_EXTEND) SetCC = SetCC.getOperand(0); - // Quit if not SETCC. - // FIXME: So far we only handle the boolean value generated from SETCC. If - // there is other ways to generate boolean values, we need handle them here - // as well. - if (SetCC.getOpcode() != X86ISD::SETCC) - return SDValue(); - - // Set the condition code or opposite one if necessary. - CC = X86::CondCode(SetCC.getConstantOperandVal(0)); - if (needOppositeCond) - CC = X86::GetOppositeBranchCondition(CC); - - return SetCC.getOperand(1); -} - -static bool IsValidFCMOVCondition(X86::CondCode CC) { - switch (CC) { - default: - return false; - case X86::COND_B: - case X86::COND_BE: - case X86::COND_E: - case X86::COND_P: - case X86::COND_AE: - case X86::COND_A: - case X86::COND_NE: - case X86::COND_NP: - return true; + switch (SetCC.getOpcode()) { + case X86ISD::SETCC: + // Set the condition code or opposite one if necessary. + CC = X86::CondCode(SetCC.getConstantOperandVal(0)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(1); + case X86ISD::CMOV: { + // Check whether false/true value has canonical one, i.e. 0 or 1. + ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); + ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); + // Quit if true value is not a constant. + if (!TVal) + return SDValue(); + // Quit if false value is not a constant. 
+ if (!FVal) { + // A special case for rdrand, where 0 is set if false cond is found. + SDValue Op = SetCC.getOperand(0); + if (Op.getOpcode() != X86ISD::RDRAND) + return SDValue(); + } + // Quit if false value is not the constant 0 or 1. + bool FValIsFalse = true; + if (FVal && FVal->getZExtValue() != 0) { + if (FVal->getZExtValue() != 1) + return SDValue(); + // If FVal is 1, opposite cond is needed. + needOppositeCond = !needOppositeCond; + FValIsFalse = false; + } + // Quit if TVal is not the constant opposite of FVal. + if (FValIsFalse && TVal->getZExtValue() != 1) + return SDValue(); + if (!FValIsFalse && TVal->getZExtValue() != 0) + return SDValue(); + CC = X86::CondCode(SetCC.getConstantOperandVal(2)); + if (needOppositeCond) + CC = X86::GetOppositeBranchCondition(CC); + return SetCC.getOperand(3); + } } + + return SDValue(); } /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { DebugLoc DL = N->getDebugLoc(); // If the flag operand isn't dead, don't touch this CMOV. @@ -14055,10 +15012,10 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, SDValue Flags; - Flags = BoolTestSetCCCombine(Cond, CC); + Flags = checkBoolTestSetCCCombine(Cond, CC); if (Flags.getNode() && // Extra check as FCMOV only supports a subset of X86 cond. - (FalseOp.getValueType() != MVT::f80 || IsValidFCMOVCondition(CC))) { + (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) { SDValue Ops[] = { FalseOp, TrueOp, DAG.getConstant(CC, MVT::i8), Flags }; return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), @@ -14075,6 +15032,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { CC = X86::GetOppositeBranchCondition(CC); std::swap(TrueC, FalseC); + std::swap(TrueOp, FalseOp); } // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. @@ -14157,6 +15115,46 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG, } } } + + // Handle these cases: + // (select (x != c), e, c) -> select (x != c), e, x), + // (select (x == c), c, e) -> select (x == c), x, e) + // where the c is an integer constant, and the "select" is the combination + // of CMOV and CMP. + // + // The rationale for this change is that the conditional-move from a constant + // needs two instructions, however, conditional-move from a register needs + // only one instruction. + // + // CAVEAT: By replacing a constant with a symbolic value, it may obscure + // some instruction-combining opportunities. This opt needs to be + // postponed as late as possible. + // + if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { + // the DCI.xxxx conditions are provided to postpone the optimization as + // late as possible. 
+ + ConstantSDNode *CmpAgainst = 0; + if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && + (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && + dyn_cast<ConstantSDNode>(Cond.getOperand(0)) == 0) { + + if (CC == X86::COND_NE && + CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { + CC = X86::GetOppositeBranchCondition(CC); + std::swap(TrueOp, FalseOp); + } + + if (CC == X86::COND_E && + CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { + SDValue Ops[] = { FalseOp, Cond.getOperand(0), + DAG.getConstant(CC, MVT::i8), Cond }; + return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops, + array_lengthof(Ops)); + } + } + } + return SDValue(); } @@ -14813,11 +15811,11 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, ISD::LoadExtType Ext = Ld->getExtensionType(); // If this is a vector EXT Load then attempt to optimize it using a - // shuffle. We need SSE4 for the shuffles. + // shuffle. We need SSSE3 shuffles. // TODO: It is possible to support ZExt by zeroing the undef values // during the shuffle phase or after the shuffle. if (RegVT.isVector() && RegVT.isInteger() && - Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) { + Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) { assert(MemVT != RegVT && "Cannot extend to the same type"); assert(MemVT.isVector() && "Must load a vector from memory"); @@ -15043,7 +16041,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, return SDValue(); const Function *F = DAG.getMachineFunction().getFunction(); - bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat); + bool NoImplicitFloatOps = F->getFnAttributes(). + hasAttribute(Attributes::NoImplicitFloat); bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps && Subtarget->hasSSE2(); if ((VT.isVector() || @@ -15315,6 +16314,29 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and +/// X86ISD::FMAX nodes. +static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) { + assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); + + // Only perform optimizations if UnsafeMath is used. + if (!DAG.getTarget().Options.UnsafeFPMath) + return SDValue(); + + // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes + // into FMINC and FMAXC, which are Commutative operations. + unsigned NewOp = 0; + switch (N->getOpcode()) { + default: llvm_unreachable("unknown opcode"); + case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; + case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; + } + + return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0), + N->getOperand(0), N->getOperand(1)); +} + + /// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes. static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) { // FAND(0.0, x) -> 0.0 @@ -15420,8 +16442,13 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, DebugLoc dl = N->getDebugLoc(); EVT VT = N->getValueType(0); + // Let legalize expand this if it isn't a legal type yet. 
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) + return SDValue(); + EVT ScalarVT = VT.getScalarType(); - if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA()) + if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || + (!Subtarget->hasFMA() && !Subtarget->hasFMA4())) return SDValue(); SDValue A = N->getOperand(0); @@ -15443,9 +16470,10 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode; if (!NegMul) - Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB; + Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; else - Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB; + Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; + return DAG.getNode(Opcode, dl, VT, A, B, C); } @@ -15542,24 +16570,51 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +// Helper function of PerformSETCCCombine. It is to materialize "setb reg" +// as "sbb reg,reg", since it can be extended without zext and produces +// an all-ones bit which is more useful than 0/1 in some cases. +static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) { + return DAG.getNode(ISD::AND, DL, MVT::i8, + DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, + DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS), + DAG.getConstant(1, MVT::i8)); +} + // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT -static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { DebugLoc DL = N->getDebugLoc(); X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); SDValue EFLAGS = N->getOperand(1); + if (CC == X86::COND_A) { + // Try to convert COND_A into COND_B in an attempt to facilitate + // materializing "setb reg". + // + // Do not flip "e > c", where "c" is a constant, because Cmp instruction + // cannot take an immediate as its first operand. + // + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { + SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(), + EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); + return MaterializeSETB(DL, NewEFLAGS, DAG); + } + } + // Materialize "setb reg" as "sbb reg,reg", since it can be extended without // a zext and produces an all-ones bit which is more useful than 0/1 in some // cases. 
if (CC == X86::COND_B) - return DAG.getNode(ISD::AND, DL, MVT::i8, - DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, - DAG.getConstant(CC, MVT::i8), EFLAGS), - DAG.getConstant(1, MVT::i8)); + return MaterializeSETB(DL, EFLAGS, DAG); SDValue Flags; - Flags = BoolTestSetCCCombine(EFLAGS, CC); + Flags = checkBoolTestSetCCCombine(EFLAGS, CC); if (Flags.getNode()) { SDValue Cond = DAG.getConstant(CC, MVT::i8); return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags); @@ -15581,7 +16636,7 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, SDValue Flags; - Flags = BoolTestSetCCCombine(EFLAGS, CC); + Flags = checkBoolTestSetCCCombine(EFLAGS, CC); if (Flags.getNode()) { SDValue Cond = DAG.getConstant(CC, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond, @@ -15591,23 +16646,6 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) { - SDValue Op0 = N->getOperand(0); - EVT InVT = Op0->getValueType(0); - - // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32)) - if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { - DebugLoc dl = N->getDebugLoc(); - MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; - SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); - // Notice that we use SINT_TO_FP because we know that the high bits - // are zero and SINT_TO_FP is better supported by the hardware. - return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); - } - - return SDValue(); -} - static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) { SDValue Op0 = N->getOperand(0); @@ -15639,20 +16677,6 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT() - if (VT == MVT::v8i8 || VT == MVT::v4i8) { - DebugLoc dl = N->getDebugLoc(); - MVT DstVT = VT == MVT::v4i8 ? 
MVT::v4i32 : MVT::v8i32; - SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, I); - } - - return SDValue(); -} - // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG, X86TargetLowering::DAGCombinerInfo &DCI) { @@ -15767,6 +16791,21 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, return OptimizeConditionalInDecrement(N, DAG); } +/// performVZEXTCombine - Performs build vector combines +static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + // (vzext (bitcast (vzext (x)) -> (vzext x) + SDValue In = N->getOperand(0); + while (In.getOpcode() == ISD::BITCAST) + In = In.getOperand(0); + + if (In.getOpcode() != X86ISD::VZEXT) + return SDValue(); + + return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), In.getOperand(0)); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -15776,7 +16815,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI); case ISD::VSELECT: case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget); - case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); + case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget); case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); @@ -15789,23 +16828,24 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); - case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); - case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); case X86ISD::FXOR: case X86ISD::FOR: return PerformFORCombine(N, DAG); + case X86ISD::FMIN: + case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG); case X86ISD::FAND: return PerformFANDCombine(N, DAG); case X86ISD::BT: return PerformBTCombine(N, DAG, DCI); case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget); case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget); - case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI); + case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget); case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); - case X86ISD::SETCC: return PerformSETCCCombine(N, DAG); + case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); + case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGN: case X86ISD::UNPCKH: @@ -16233,7 +17273,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, return; case 'K': if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { - if ((int8_t)C->getSExtValue() == C->getSExtValue()) { + if 
(isInt<8>(C->getSExtValue())) { Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType()); break; } @@ -16558,3 +17598,207 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, return Res; } + +//===----------------------------------------------------------------------===// +// +// X86 cost model. +// +//===----------------------------------------------------------------------===// + +struct X86CostTblEntry { + int ISD; + MVT Type; + unsigned Cost; +}; + +static int +FindInTable(const X86CostTblEntry *Tbl, unsigned len, int ISD, MVT Ty) { + for (unsigned int i = 0; i < len; ++i) + if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty) + return i; + + // Could not find an entry. + return -1; +} + +struct X86TypeConversionCostTblEntry { + int ISD; + MVT Dst; + MVT Src; + unsigned Cost; +}; + +static int +FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len, + int ISD, MVT Dst, MVT Src) { + for (unsigned int i = 0; i < len; ++i) + if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst) + return i; + + // Could not find an entry. + return -1; +} + +unsigned +X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode, + Type *Ty) const { + // Legalize the type. + std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Ty); + + int ISD = InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>(); + + static const X86CostTblEntry AVX1CostTable[] = { + // We don't have to scalarize unsupported ops. We can issue two half-sized + // operations and we only need to extract the upper YMM half. + // Two ops + 1 extract + 1 insert = 4. + { ISD::MUL, MVT::v8i32, 4 }, + { ISD::SUB, MVT::v8i32, 4 }, + { ISD::ADD, MVT::v8i32, 4 }, + { ISD::MUL, MVT::v4i64, 4 }, + { ISD::SUB, MVT::v4i64, 4 }, + { ISD::ADD, MVT::v4i64, 4 }, + }; + + // Look for AVX1 lowering tricks. + if (ST.hasAVX()) { + int Idx = FindInTable(AVX1CostTable, array_lengthof(AVX1CostTable), ISD, + LT.second); + if (Idx != -1) + return LT.first * AVX1CostTable[Idx].Cost; + } + // Fallback to the default implementation. + return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty); +} + +unsigned +X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const { + assert(Val->isVectorTy() && "This must be a vector type"); + + if (Index != -1U) { + // Legalize the type. + std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Val); + + // This type is legalized to a scalar type. + if (!LT.second.isVector()) + return 0; + + // The type may be split. Normalize the index to the new type. + unsigned Width = LT.second.getVectorNumElements(); + Index = Index % Width; + + // Floating point scalars are already located in index #0. + if (Val->getScalarType()->isFloatingPointTy() && Index == 0) + return 0; + } + + return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index); +} + +unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, + Type *ValTy, + Type *CondTy) const { + // Legalize the type. 
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(ValTy); + + MVT MTy = LT.second; + + int ISD = InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + const X86Subtarget &ST = + TLI->getTargetMachine().getSubtarget<X86Subtarget>(); + + static const X86CostTblEntry SSE42CostTbl[] = { + { ISD::SETCC, MVT::v2f64, 1 }, + { ISD::SETCC, MVT::v4f32, 1 }, + { ISD::SETCC, MVT::v2i64, 1 }, + { ISD::SETCC, MVT::v4i32, 1 }, + { ISD::SETCC, MVT::v8i16, 1 }, + { ISD::SETCC, MVT::v16i8, 1 }, + }; + + static const X86CostTblEntry AVX1CostTbl[] = { + { ISD::SETCC, MVT::v4f64, 1 }, + { ISD::SETCC, MVT::v8f32, 1 }, + // AVX1 does not support 8-wide integer compare. + { ISD::SETCC, MVT::v4i64, 4 }, + { ISD::SETCC, MVT::v8i32, 4 }, + { ISD::SETCC, MVT::v16i16, 4 }, + { ISD::SETCC, MVT::v32i8, 4 }, + }; + + static const X86CostTblEntry AVX2CostTbl[] = { + { ISD::SETCC, MVT::v4i64, 1 }, + { ISD::SETCC, MVT::v8i32, 1 }, + { ISD::SETCC, MVT::v16i16, 1 }, + { ISD::SETCC, MVT::v32i8, 1 }, + }; + + if (ST.hasSSE42()) { + int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy); + if (Idx != -1) + return LT.first * SSE42CostTbl[Idx].Cost; + } + + if (ST.hasAVX()) { + int Idx = FindInTable(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy); + if (Idx != -1) + return LT.first * AVX1CostTbl[Idx].Cost; + } + + if (ST.hasAVX2()) { + int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy); + if (Idx != -1) + return LT.first * AVX2CostTbl[Idx].Cost; + } + + return VectorTargetTransformImpl::getCmpSelInstrCost(Opcode, ValTy, CondTy); +} + +unsigned X86VectorTargetTransformInfo::getCastInstrCost(unsigned Opcode, + Type *Dst, + Type *Src) const { + int ISD = InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + EVT SrcTy = TLI->getValueType(Src); + EVT DstTy = TLI->getValueType(Dst); + + if (!SrcTy.isSimple() || !DstTy.isSimple()) + return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src); + + const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>(); + + static const X86TypeConversionCostTblEntry AVXConversionTbl[] = { + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 }, + { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, + { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 }, + { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, + { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 }, + { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 }, + { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 }, + { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 }, + { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 }, + { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 }, + { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 }, + { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 }, + { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 }, + { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 }, + }; + + if (ST.hasAVX()) { + int Idx = FindInConvertTable(AVXConversionTbl, + array_lengthof(AVXConversionTbl), + ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()); + if (Idx != -1) + return AVXConversionTbl[Idx].Cost; + } + + return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src); +} + |
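Editor's note: the new cost-model hooks at the end of this diff (getArithmeticInstrCost, getCmpSelInstrCost, getCastInstrCost) all follow the same pattern: legalize the type, then linearly scan a small static table keyed by (ISD opcode, MVT) and scale the hit by the legalization split factor, falling back to the generic implementation on a miss. Below is a minimal, self-contained sketch of that lookup pattern. The enums, table contents, and names here are illustrative stand-ins, not the real LLVM ISD/MVT values or API; the structure loosely mirrors X86CostTblEntry and FindInTable from the hunk above.

```cpp
#include <cstdio>

// Stand-ins for LLVM's ISD opcode and MVT enumerations (illustrative only).
enum OpKind { OP_MUL, OP_ADD, OP_SETCC };
enum TyKind { TY_V8I32, TY_V4I64, TY_V4F32 };

struct CostTblEntry {
    OpKind Op;
    TyKind Ty;
    unsigned Cost;
};

// Mirrors FindInTable in the diff: linear scan, -1 if no entry matches.
static int findInTable(const CostTblEntry *Tbl, unsigned Len,
                       OpKind Op, TyKind Ty) {
    for (unsigned i = 0; i < Len; ++i)
        if (Tbl[i].Op == Op && Tbl[i].Ty == Ty)
            return static_cast<int>(i);
    return -1;
}

int main() {
    // Placeholder costs; the exact numbers are not the point here.
    static const CostTblEntry AVX1Tbl[] = {
        { OP_MUL, TY_V8I32, 4 },
        { OP_ADD, TY_V8I32, 4 },
        { OP_ADD, TY_V4I64, 4 },
    };
    const unsigned Len = sizeof(AVX1Tbl) / sizeof(AVX1Tbl[0]);

    // "LT.first" in the real code is the number of pieces legalization splits
    // the type into; pretend it is 1 for this example.
    unsigned SplitFactor = 1;

    int Idx = findInTable(AVX1Tbl, Len, OP_MUL, TY_V8I32);
    if (Idx != -1)
        std::printf("cost = %u\n", SplitFactor * AVX1Tbl[Idx].Cost);
    else
        std::printf("fall back to the default implementation\n");
    return 0;
}
```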
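Editor's note: the PerformCMOVCombine hunk earlier in the diff adds the (select (x != c), e, c) -> (select (x != c), e, x) rewrite described in its own comment, trading a constant operand for a register operand so the backend can use a plain register-register CMOV. A tiny standalone illustration of why the two forms are equivalent; plain C++, nothing LLVM-specific:

```cpp
#include <cassert>

// When the condition is (x != c) and the false arm is the same constant c,
// the false arm can be replaced by x itself: if the condition is false,
// x == c, so returning x returns c anyway.
int selectWithConstant(int x, int e, int c) { return (x != c) ? e : c; }
int selectWithRegister(int x, int e, int c) { return (x != c) ? e : x; }

int main() {
    for (int x = -3; x <= 3; ++x)
        for (int e = -3; e <= 3; ++e)
            assert(selectWithConstant(x, e, 1) == selectWithRegister(x, e, 1));
    return 0;
}
```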
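Editor's note: the emitEHSjLjSetJmp/emitEHSjLjLongJmp inserters earlier in the diff read and write a lightweight jump buffer whose layout, as far as the offsets in that lowering indicate, is: frame pointer in slot 0, the restoreMBB address in slot 1 (LabelOffset = 1 * PtrSize), and the stack pointer in slot 2 (SPOffset = 2 * PtrSize). The sketch below only restates that layout; the struct and function names are hypothetical and not part of the LLVM API.

```cpp
#include <cstddef>

// Illustrative layout of the setjmp buffer assumed by the custom inserters;
// each slot is one pointer wide.
struct SjLjBuffer {
    void *FramePointer;   // slot 0: reloaded into RBP/EBP on the longjmp path
    void *RestoreAddress; // slot 1: address of restoreMBB, stored by setjmp
    void *StackPointer;   // slot 2: reloaded into RSP/ESP on the longjmp path
};

// Byte offsets as computed in the lowering (PtrSize is 4 or 8).
constexpr std::size_t labelOffset(std::size_t PtrSize) { return 1 * PtrSize; }
constexpr std::size_t spOffset(std::size_t PtrSize)    { return 2 * PtrSize; }

static_assert(offsetof(SjLjBuffer, RestoreAddress) == labelOffset(sizeof(void *)),
              "slot 1 holds the restore IP");
static_assert(offsetof(SjLjBuffer, StackPointer) == spOffset(sizeof(void *)),
              "slot 2 holds the saved SP");

int main() { return 0; }
```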