aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--contrib/llvm/lib/Target/X86/X86ISelLowering.cpp3024
1 files changed, 2134 insertions, 890 deletions
diff --git a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4af12e44fd40..b35fb514bf94 100644
--- a/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -85,7 +85,7 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
* ElemsPerChunk);
- SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
VecIdx);
@@ -118,7 +118,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
* ElemsPerChunk);
- SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
VecIdx);
}
@@ -158,10 +158,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
- X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
RegInfo = TM.getRegisterInfo();
- TD = getTargetData();
+ TD = getDataLayout();
// Set up the TargetLowering object.
static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
@@ -180,7 +179,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
- setStackPointerRegisterToSaveRestore(X86StackPtr);
+ setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+
+ // Bypass i32 with i8 on Atom when compiling with O2
+ if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
+ addBypassSlowDiv(32, 8);
if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
// Setup Windows compiler runtime calls.
@@ -453,6 +456,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SETCC , MVT::i64 , Custom);
}
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
+ // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
+ // SjLj exception handling but a light-weight setjmp/longjmp replacement to
+ // support continuation, user-level threading, etc. As a result, no
+ // other SjLj exception interfaces are implemented and please don't build
+ // your own exception handling based on them.
+ // LLVM/Clang supports zero-cost DWARF exception handling.
+ setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+ setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
// Darwin ABI issue.
setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
@@ -510,6 +521,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
}
if (Subtarget->hasCmpxchg16b()) {
@@ -541,6 +556,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
@@ -737,6 +753,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FFLOOR, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
@@ -826,6 +843,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f32, Custom);
setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
@@ -859,6 +877,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
@@ -927,6 +946,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+ // As there is no 64-bit GPR available, we need to build a special custom
+ // sequence to convert from v2i32 to v2f32.
+ if (!Subtarget->is64Bit())
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
}
if (Subtarget->hasSSE41()) {
@@ -941,6 +972,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FRINT, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
@@ -1018,19 +1052,33 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v8f32, Custom);
setOperationAction(ISD::FADD, MVT::v4f64, Legal);
setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f64, Custom);
+
+ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal);
+
setOperationAction(ISD::SRL, MVT::v16i16, Custom);
setOperationAction(ISD::SRL, MVT::v32i8, Custom);
@@ -1054,7 +1102,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
- if (Subtarget->hasFMA()) {
+ if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
setOperationAction(ISD::FMA, MVT::v8f32, Custom);
setOperationAction(ISD::FMA, MVT::v4f64, Custom);
setOperationAction(ISD::FMA, MVT::v4f32, Custom);
@@ -1219,10 +1267,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::TRUNCATE);
- setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
- setTargetDAGCombine(ISD::FP_TO_SINT);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
@@ -1320,7 +1366,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
// cases like PR2962. This should be removed when PR2962 is fixed.
const Function *F = MF.getFunction();
if (IsZeroVal &&
- !F->hasFnAttr(Attribute::NoImplicitFloat)) {
+ !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
if (Size >= 16 &&
(Subtarget->isUnalignedMemAccessFast() ||
((DstAlign == 0 || DstAlign >= 16) &&
@@ -1988,7 +2034,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
TotalNumIntRegs);
- bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
+ bool NoImplicitFloatOps = Fn->getFnAttributes().
+ hasAttribute(Attributes::NoImplicitFloat);
assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
@@ -2136,16 +2183,14 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
/// optimization is performed and it is required (FPDiff!=0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
- SDValue Chain, SDValue RetAddrFrIdx,
- bool Is64Bit, int FPDiff, DebugLoc dl) {
+ SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
+ unsigned SlotSize, int FPDiff, DebugLoc dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
- int SlotSize = Is64Bit ? 8 : 4;
int NewReturnAddrFI =
MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
- EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
- SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
+ SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(NewReturnAddrFI),
false, false, 0);
@@ -2180,7 +2225,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
- MF.getFunction()->hasStructRetAttr(),
+ MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
@@ -2220,14 +2265,15 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
int FPDiff = 0;
if (isTailCall && !IsSibcall) {
// Lower arguments at fp - stackoffset + fpdiff.
- unsigned NumBytesCallerPushed =
- MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
+ X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
+
FPDiff = NumBytesCallerPushed - NumBytes;
// Set the delta of movement of the returnaddr stackslot.
// But only set if delta is greater than previous delta.
- if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
- MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
+ if (FPDiff < X86Info->getTCReturnAddrDelta())
+ X86Info->setTCReturnAddrDelta(FPDiff);
}
if (!IsSibcall)
@@ -2304,7 +2350,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
} else if (!IsSibcall && (!isTailCall || isByVal)) {
assert(VA.isMemLoc());
if (StackPtr.getNode() == 0)
- StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy());
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags));
}
@@ -2392,7 +2439,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
if (StackPtr.getNode() == 0)
- StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
+ StackPtr = DAG.getCopyFromReg(Chain, dl,
+ RegInfo->getStackRegister(),
getPointerTy());
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
@@ -2414,7 +2462,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
&MemOpChains2[0], MemOpChains2.size());
// Store the return address to the appropriate stack slot.
- Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
+ Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
+ getPointerTy(), RegInfo->getSlotSize(),
FPDiff, dl);
}
@@ -2464,7 +2513,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
OpFlags = X86II::MO_DARWIN_STUB;
} else if (Subtarget->isPICStyleRIPRel() &&
isa<Function>(GV) &&
- cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) {
+ cast<Function>(GV)->getFnAttributes().
+ hasAttribute(Attributes::NonLazyBind)) {
// If the function is marked as non-lazy, generate an indirect call
// which loads from the GOT directly. This avoids runtime overhead
// at the cost of eager binding (and one extra byte of encoding).
@@ -2625,7 +2675,7 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
- uint64_t SlotSize = TD->getPointerSize();
+ unsigned SlotSize = RegInfo->getSlotSize();
if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
// Number smaller than 12 so just add the difference.
Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
@@ -2700,6 +2750,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
bool isVarArg,
bool isCalleeStructRet,
bool isCallerStructRet,
+ Type *RetTy,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -2711,6 +2762,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// If -tailcallopt is specified, make fastcc functions tail-callable.
const MachineFunction &MF = DAG.getMachineFunction();
const Function *CallerF = DAG.getMachineFunction().getFunction();
+
+ // If the function return type is x86_fp80 and the callee return type is not,
+ // then the FP_EXTEND of the call result is not a nop. It's not safe to
+ // perform a tailcall optimization here.
+ if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
+ return false;
+
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
@@ -2834,7 +2892,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII =
- ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
+ ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
@@ -2985,7 +3043,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
- uint64_t SlotSize = TD->getPointerSize();
+ unsigned SlotSize = RegInfo->getSlotSize();
ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
false);
FuncInfo->setRAIndex(ReturnAddrIndex);
@@ -3508,25 +3566,26 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
MatchOddMask = false;
}
- static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1};
- static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1};
- const int *CompactionMask;
- if (MatchEvenMask)
- CompactionMask = CompactionMaskEven;
- else if (MatchOddMask)
- CompactionMask = CompactionMaskOdd;
- else
+ if (!MatchEvenMask && !MatchOddMask)
return SDValue();
SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
- SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0),
- UndefNode, CompactionMask);
- SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1),
- UndefNode, CompactionMask);
- static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13};
- return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask);
+ SDValue Op0 = SVOp->getOperand(0);
+ SDValue Op1 = SVOp->getOperand(1);
+
+ if (MatchEvenMask) {
+ // Shift the second operand right by 32 bits.
+ static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
+ Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
+ } else {
+ // Shift the first operand left by 32 bits.
+ static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
+ Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
+ }
+ static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
+ return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
}
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
@@ -4577,7 +4636,6 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
MVT ShufVT = V.getValueType().getSimpleVT();
unsigned NumElems = ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
- SDValue ImmN;
bool IsUnary;
if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
@@ -4979,6 +5037,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
LDBase->getAlignment(),
false/*isVolatile*/, true/*ReadMem*/,
false/*WriteMem*/);
+
+ // Make sure the newly-created LOAD is in the same position as LDBase in
+ // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
+ // update uses of LDBase's output chain to use the TokenFactor.
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(ResNode.getNode(), 1));
+ }
+
return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
}
return SDValue();
@@ -4992,7 +5062,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
SDValue
-X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
+X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget->hasAVX())
return SDValue();
@@ -5116,80 +5186,78 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
return SDValue();
}
-// LowerVectorFpExtend - Recognize the scalarized FP_EXTEND from v2f32 to v2f64
-// and convert it into X86ISD::VFPEXT due to the current ISD::FP_EXTEND has the
-// constraint of matching input/output vector elements.
SDValue
-X86TargetLowering::LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
- SDNode *N = Op.getNode();
+X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- unsigned NumElts = Op.getNumOperands();
- // Check supported types and sub-targets.
- //
- // Only v2f32 -> v2f64 needs special handling.
- if (VT != MVT::v2f64 || !Subtarget->hasSSE2())
+ // Skip if insert_vec_elt is not supported.
+ if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
return SDValue();
- SDValue VecIn;
- EVT VecInVT;
- SmallVector<int, 8> Mask;
- EVT SrcVT = MVT::Other;
+ DebugLoc DL = Op.getDebugLoc();
+ unsigned NumElems = Op.getNumOperands();
+
+ SDValue VecIn1;
+ SDValue VecIn2;
+ SmallVector<unsigned, 4> InsertIndices;
+ SmallVector<int, 8> Mask(NumElems, -1);
- // Check the patterns could be translated into X86vfpext.
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue In = N->getOperand(i);
- unsigned Opcode = In.getOpcode();
+ for (unsigned i = 0; i != NumElems; ++i) {
+ unsigned Opc = Op.getOperand(i).getOpcode();
- // Skip if the element is undefined.
- if (Opcode == ISD::UNDEF) {
- Mask.push_back(-1);
+ if (Opc == ISD::UNDEF)
continue;
- }
- // Quit if one of the elements is not defined from 'fpext'.
- if (Opcode != ISD::FP_EXTEND)
- return SDValue();
+ if (Opc != ISD::EXTRACT_VECTOR_ELT) {
+ // Quit if more than 2 elements need inserting.
+ if (InsertIndices.size() > 1)
+ return SDValue();
+
+ InsertIndices.push_back(i);
+ continue;
+ }
- // Check how the source of 'fpext' is defined.
- SDValue L2In = In.getOperand(0);
- EVT L2InVT = L2In.getValueType();
+ SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
+ SDValue ExtIdx = Op.getOperand(i).getOperand(1);
- // Check the original type
- if (SrcVT == MVT::Other)
- SrcVT = L2InVT;
- else if (SrcVT != L2InVT) // Quit if non-homogenous typed.
+ // Quit if extracted from vector of different type.
+ if (ExtractedFromVec.getValueType() != VT)
return SDValue();
- // Check whether the value being 'fpext'ed is extracted from the same
- // source.
- Opcode = L2In.getOpcode();
-
- // Quit if it's not extracted with a constant index.
- if (Opcode != ISD::EXTRACT_VECTOR_ELT ||
- !isa<ConstantSDNode>(L2In.getOperand(1)))
+ // Quit if non-constant index.
+ if (!isa<ConstantSDNode>(ExtIdx))
return SDValue();
- SDValue ExtractedFromVec = L2In.getOperand(0);
+ if (VecIn1.getNode() == 0)
+ VecIn1 = ExtractedFromVec;
+ else if (VecIn1 != ExtractedFromVec) {
+ if (VecIn2.getNode() == 0)
+ VecIn2 = ExtractedFromVec;
+ else if (VecIn2 != ExtractedFromVec)
+ // Quit if more than 2 vectors to shuffle
+ return SDValue();
+ }
- if (VecIn.getNode() == 0) {
- VecIn = ExtractedFromVec;
- VecInVT = ExtractedFromVec.getValueType();
- } else if (VecIn != ExtractedFromVec) // Quit if built from more than 1 vec.
- return SDValue();
+ unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
- Mask.push_back(cast<ConstantSDNode>(L2In.getOperand(1))->getZExtValue());
+ if (ExtractedFromVec == VecIn1)
+ Mask[i] = Idx;
+ else if (ExtractedFromVec == VecIn2)
+ Mask[i] = Idx + NumElems;
}
- // Fill the remaining mask as undef.
- for (unsigned i = NumElts; i < VecInVT.getVectorNumElements(); ++i)
- Mask.push_back(-1);
+ if (VecIn1.getNode() == 0)
+ return SDValue();
+
+ VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+ SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
+ for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
+ unsigned Idx = InsertIndices[i];
+ NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
+ DAG.getIntPtrConstant(Idx));
+ }
- return DAG.getNode(X86ISD::VFPEXT, DL, VT,
- DAG.getVectorShuffle(VecInVT, DL,
- VecIn, DAG.getUNDEF(VecInVT),
- &Mask[0]));
+ return NV;
}
SDValue
@@ -5224,10 +5292,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (Broadcast.getNode())
return Broadcast;
- SDValue FpExt = LowerVectorFpExtend(Op, DAG);
- if (FpExt.getNode())
- return FpExt;
-
unsigned EVTBits = ExtVT.getSizeInBits();
unsigned NumZero = 0;
@@ -5472,6 +5536,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (LD.getNode())
return LD;
+ // Check for a build vector that is mostly a shuffle plus a few insertions.
+ SDValue Sh = buildFromShuffleMostly(Op, DAG);
+ if (Sh.getNode())
+ return Sh;
+
// For SSE 4.1, use insertps to put the high elements into the low element.
if (getSubtarget()->hasSSE41()) {
SDValue Result;
@@ -5538,8 +5607,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
-SDValue
-X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 2);
// 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
@@ -5548,9 +5616,9 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
}
// Try to lower a shuffle node into a simple blend instruction.
-static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+static SDValue
+LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
@@ -5620,9 +5688,9 @@ static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
// 2. [ssse3] 1 x pshufb
// 3. [ssse3] 2 x pshufb + 1 x por
// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
-SDValue
-X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
- SelectionDAG &DAG) const {
+static SDValue
+LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
@@ -5879,8 +5947,6 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
DebugLoc dl = SVOp->getDebugLoc();
ArrayRef<int> MaskVals = SVOp->getMask();
- bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
-
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
// present, fall back to case 3.
@@ -5904,7 +5970,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
- if (V2IsUndef)
+
+ // As PSHUFB will zero elements with negative indices, it's safe to ignore
+ // the 2nd operand if it's undefined or zero.
+ if (V2.getOpcode() == ISD::UNDEF ||
+ ISD::isBuildVectorAllZeros(V2.getNode()))
return V1;
// Calculate the shuffle mask for the second input, shuffle it, and
@@ -5990,6 +6060,51 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
}
+// v32i8 shuffles - Translate to VPSHUFB if possible. Returns an X86ISD::PSHUFB node, or an empty SDValue() when the shuffle does not qualify.
+static
+SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ EVT VT = SVOp->getValueType(0);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); // Local copy of the mask; it may be commuted below.
+
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // VPSHUFB may be generated if
+ // (1) one of the input vectors is undefined or zeroinitializer.
+ // The mask value 0x80 puts 0 in the corresponding slot of the vector.
+ // And (2) the mask indexes don't cross the 128-bit lane.
+ if (VT != MVT::v32i8 || !Subtarget->hasAVX2() ||
+ (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
+ return SDValue();
+
+ if (V1IsAllZero && !V2IsAllZero) { // V1 is all zeros but V2 is not: commute the mask and shuffle V2 instead.
+ CommuteVectorShuffleMask(MaskVals, 32);
+ V1 = V2;
+ }
+ SmallVector<SDValue, 32> pshufbMask;
+ for (unsigned i = 0; i != 32; i++) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0 || EltIdx >= 32) // Undef (< 0) or taken from the second operand (>= 32).
+ EltIdx = 0x80; // 0x80 zeroes this byte (see the note above).
+ else {
+ if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
+ // Crossing the 128-bit lane is not allowed.
+ return SDValue();
+ EltIdx &= 0xf; // Keep only the index within the 16-byte lane.
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+ }
+ return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v32i8, &pshufbMask[0], 32));
+}
+
/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
@@ -6324,17 +6439,17 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
}
static bool MayFoldVectorLoad(SDValue V) {
- if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+ while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
V = V.getOperand(0);
+
if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
V = V.getOperand(0);
if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
// BUILD_VECTOR (load), undef
V = V.getOperand(0);
- if (MayFoldLoad(V))
- return true;
- return false;
+
+ return MayFoldLoad(V);
}
// FIXME: the version above should always be used. Since there's
@@ -6457,6 +6572,81 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
getShuffleSHUFImmediate(SVOp), DAG);
}
+// Reduce a vector shuffle to zext. Returns a bitcast of an X86ISD::VZEXT node, or an empty SDValue() when the shuffle does not match.
+SDValue
+X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
+ // PMOVZX is only available from SSE41.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+
+ // Only AVX2 supports 256-bit vector integer extending.
+ if (!Subtarget->hasAVX2() && VT.is256BitVector())
+ return SDValue();
+
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ DebugLoc DL = Op.getDebugLoc();
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Extending is a unary operation and the element type of the source vector
+ // won't be equal to or larger than i64.
+ if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
+ VT.getVectorElementType() == MVT::i64)
+ return SDValue();
+
+ // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
+ unsigned Shift = 1; // log2 of the ratio; start from a ratio of 2, i.e. 1 << 1.
+ while ((1U << Shift) < NumElems) {
+ if (SVOp->getMaskElt(1U << Shift) == 1)
+ break;
+ Shift += 1;
+ // The maximal ratio is 8, i.e. from i8 to i64.
+ if (Shift > 3)
+ return SDValue();
+ }
+
+ // Check the shuffle mask: slot k * (1 << Shift) must hold source element k, and every other slot must be undef (-1).
+ unsigned Mask = (1U << Shift) - 1;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int EltIdx = SVOp->getMaskElt(i);
+ if ((i & Mask) != 0 && EltIdx != -1)
+ return SDValue();
+ if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
+ return SDValue();
+ }
+
+ unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; // Element width after extension, in bits.
+ EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits);
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift);
+
+ if (!isTypeLegal(NVT))
+ return SDValue();
+
+ // Simplify the operand as it's prepared to be fed into shuffle.
+ unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
+ if (V1.getOpcode() == ISD::BITCAST &&
+ V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ V1.getOperand(0)
+ .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
+ // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+ SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
+ ConstantSDNode *CIdx =
+ dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
+ // If it's foldable, i.e. a normal load with a single use, we will let
+ // instruction selection fold it. Otherwise, we shorten the conversion sequence.
+ if (CIdx && CIdx->getZExtValue() == 0 &&
+ (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse()))
+ V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
+ }
+
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
+}
+
SDValue
X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
@@ -6487,6 +6677,11 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
return PromoteSplat(SVOp, DAG);
}
+ // Check integer expanding shuffles.
+ SDValue NewOp = lowerVectorIntExtend(Op, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+
// If the shuffle can be profitably rewritten as a narrower shuffle, then
// do it!
if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
@@ -6536,7 +6731,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
bool HasAVX = Subtarget->hasAVX();
bool HasAVX2 = Subtarget->hasAVX2();
MachineFunction &MF = DAG.getMachineFunction();
- bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
+ bool OptForSize = MF.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize);
assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
@@ -6805,7 +7001,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// Handle v8i16 specifically since SSE can do byte extraction and insertion.
if (VT == MVT::v8i16) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG);
+ SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
if (NewOp.getNode())
return NewOp;
}
@@ -6816,6 +7012,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return NewOp;
}
+ if (VT == MVT::v32i8) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
// Handle all 128-bit wide vectors with 4 elements, and match them with
// several different shuffle types.
if (NumElems == 4 && VT.is128BitVector())
@@ -6839,9 +7041,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
if (VT.getSizeInBits() == 8) {
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
+ Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
- DAG.getValueType(VT));
+ DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
@@ -6856,9 +7058,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
Op.getOperand(0)),
Op.getOperand(1)));
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
+ Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
- DAG.getValueType(VT));
+ DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
@@ -6942,9 +7144,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// Transform it so it match pextrw which produces a 32-bit result.
EVT EltVT = MVT::i32;
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
- Op.getOperand(0), Op.getOperand(1));
+ Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
- DAG.getValueType(VT));
+ DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
@@ -7087,8 +7289,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue
-X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
LLVMContext *Context = DAG.getContext();
DebugLoc dl = Op.getDebugLoc();
EVT OpVT = Op.getValueType();
@@ -7120,8 +7321,8 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
-SDValue
-X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
if (Subtarget->hasAVX()) {
DebugLoc dl = Op.getNode()->getDebugLoc();
SDValue Vec = Op.getNode()->getOperand(0);
@@ -7140,8 +7341,8 @@ X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
-SDValue
-X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
if (Subtarget->hasAVX()) {
DebugLoc dl = Op.getNode()->getDebugLoc();
SDValue Vec = Op.getNode()->getOperand(0);
@@ -7284,9 +7485,10 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
Subtarget->ClassifyBlockAddressReference();
CodeModel::Model M = getTargetMachine().getCodeModel();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
DebugLoc dl = Op.getDebugLoc();
- SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
- /*isTarget=*/true, OpFlags);
+ SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
+ OpFlags);
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
@@ -7395,8 +7597,8 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
SDValue InFlag;
DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
- DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), PtrVT), InFlag);
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
@@ -7897,11 +8099,29 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
return Sub;
}
+/// Custom-lower a vector UINT_TO_FP whose source is one of v4i8, v4i16, v8i8
+/// or v8i16. The source is zero-extended to a vector of i32 with the same
+/// element count and then converted with SINT_TO_FP.
+SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  DebugLoc Loc = Op.getDebugLoc();
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  assert((SrcVT == MVT::v4i8 || SrcVT == MVT::v4i16 ||
+          SrcVT == MVT::v8i8 || SrcVT == MVT::v8i16) &&
+         "Custom UINT_TO_FP is not supported!");
+
+  // After zero-extending an i8/i16 element to i32 the sign bit is clear, so
+  // the signed conversion yields the unsigned result.
+  unsigned NumElts = SrcVT.getVectorNumElements();
+  EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+  SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, Loc, WideVT, Src);
+  return DAG.getNode(ISD::SINT_TO_FP, Loc, Op.getValueType(), Widened);
+}
+
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
DebugLoc dl = Op.getDebugLoc();
+ if (Op.getValueType().isVector())
+ return lowerUINT_TO_FP_vec(Op, DAG);
+
// Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
// the optimization here.
@@ -8075,10 +8295,66 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co
}
}
+/// Custom-lower a vector ZERO_EXTEND from a 128-bit source to a 256-bit result
+/// with the same number of elements. Returns an empty SDValue when this
+/// routine does not handle the node.
+///
+/// With AVX2 a single X86ISD::VZEXT of the full result type is emitted.
+/// Without AVX2 only v8i16 -> v8i32 is handled: the low and high four source
+/// elements are each extended to v4i32 and the halves are concatenated.
+SDValue X86TargetLowering::lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  SDValue In = Op.getOperand(0);
+  EVT SVT = In.getValueType();
+
+  if (!VT.is256BitVector() || !SVT.is128BitVector() ||
+      VT.getVectorNumElements() != SVT.getVectorNumElements())
+    return SDValue();
+
+  assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!");
+
+  // AVX2 has better support of integer extending.
+  if (Subtarget->hasAVX2())
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+  // The split sequence below hard-codes v8i16/v4i32/v8i32. The guard above
+  // also admits other shapes (e.g. v4i32 -> v4i64), for which those types
+  // would be wrong, so decline them instead of building a mistyped node.
+  if (VT != MVT::v8i32 || SVT != MVT::v8i16)
+    return SDValue();
+
+  SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
+
+  // Move source elements 4..7 into the low half so that VZEXT extends them.
+  static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
+  SDValue HiSrc = DAG.getVectorShuffle(MVT::v8i16, DL, In,
+                                       DAG.getUNDEF(MVT::v8i16), &Mask[0]);
+  SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, HiSrc);
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
+}
+
+/// Custom-lower a vector TRUNCATE whose source is a 256-bit vector and whose
+/// result is a 128-bit vector with the same number of elements. Any other
+/// shape yields an empty SDValue.
+///
+/// The source is bitcast to a vector holding twice as many result-typed
+/// elements, the even-numbered elements are shuffled into the low half, and
+/// that low half is extracted as the result.
+SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+  EVT DstVT = Op.getValueType();
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  bool Handled = DstVT.is128BitVector() && SrcVT.is256BitVector() &&
+                 DstVT.getVectorNumElements() == SrcVT.getVectorNumElements();
+  if (!Handled)
+    return SDValue();
+
+  assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!");
+
+  DebugLoc Loc = Op.getDebugLoc();
+  unsigned DstElts = DstVT.getVectorNumElements();
+  unsigned WideElts = DstElts * 2;
+  EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
+                                DstVT.getVectorElementType(), WideElts);
+
+  // The low half picks every other element; the upper half stays undefined.
+  SmallVector<int, 16> ShufMask;
+  for (unsigned i = 0; i != WideElts; ++i)
+    ShufMask.push_back(i < DstElts ? static_cast<int>(i * 2) : -1);
+
+  SDValue Cast = DAG.getNode(ISD::BITCAST, Loc, WideVT, Src);
+  SDValue Shuf = DAG.getVectorShuffle(WideVT, Loc, Cast,
+                                      DAG.getUNDEF(WideVT), &ShufMask[0]);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, Loc, DstVT, Shuf,
+                     DAG.getIntPtrConstant(0));
+}
+
SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
SelectionDAG &DAG) const {
- if (Op.getValueType().isVector())
+ if (Op.getValueType().isVector()) {
+ if (Op.getValueType() == MVT::v8i16)
+ return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(),
+ DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(),
+ MVT::v8i32, Op.getOperand(0)));
return SDValue();
+ }
std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
/*IsSigned=*/ true, /*IsReplace=*/ false);
@@ -8113,26 +8389,49 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
return FIST;
}
-SDValue X86TargetLowering::LowerFABS(SDValue Op,
- SelectionDAG &DAG) const {
+/// Custom-lower FP_EXTEND of a v2f32 source: widen the input to v4f32 by
+/// concatenating it with an undefined v2f32 half, then emit X86ISD::VFPEXT
+/// producing the node's original result type.
+SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDValue Src = Op.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  assert(SrcVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+
+  DebugLoc Loc = Op.getDebugLoc();
+  SDValue UndefHalf = DAG.getUNDEF(SrcVT);
+  SDValue Widened =
+      DAG.getNode(ISD::CONCAT_VECTORS, Loc, MVT::v4f32, Src, UndefHalf);
+  return DAG.getNode(X86ISD::VFPEXT, Loc, Op.getValueType(), Widened);
+}
+
+SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
DebugLoc dl = Op.getDebugLoc();
EVT VT = Op.getValueType();
EVT EltVT = VT;
- if (VT.isVector())
+ unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+ if (VT.isVector()) {
EltVT = VT.getVectorElementType();
- Constant *C;
- if (EltVT == MVT::f64) {
- C = ConstantVector::getSplat(2,
- ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
- } else {
- C = ConstantVector::getSplat(4,
- ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
+ NumElts = VT.getVectorNumElements();
}
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ Constant *C;
+ if (EltVT == MVT::f64)
+ C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
+ else
+ C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
+ C = ConstantVector::getSplat(NumElts, C);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
+ false, false, false, Alignment);
+ if (VT.isVector()) {
+ MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(ISD::AND, dl, ANDVT,
+ DAG.getNode(ISD::BITCAST, dl, ANDVT,
+ Op.getOperand(0)),
+ DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
+ }
return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}
@@ -8152,10 +8451,11 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
else
C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
C = ConstantVector::getSplat(NumElts, C);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
+ false, false, false, Alignment);
if (VT.isVector()) {
MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
return DAG.getNode(ISD::BITCAST, dl, VT,
@@ -8241,7 +8541,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
}
-SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
DebugLoc dl = Op.getDebugLoc();
EVT VT = Op.getValueType();
@@ -8252,6 +8552,98 @@ SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
}
+/// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
+///
+/// \p Op must be an ISD::OR node. The tree of ORs rooted at it is walked; if
+/// every leaf is an EXTRACT_VECTOR_ELT with a constant index, all source
+/// vectors are 128- or 256-bit vectors of one common type, and every element
+/// of every source vector is extracted somewhere in the tree, the source
+/// vectors are bitcast to v2i64/v4i64, OR'd together and fed to a single
+/// X86ISD::PTEST whose i32 result is returned. In every other case, and when
+/// SSE4.1 is unavailable or \p Op has more than one use, an empty SDValue is
+/// returned.
+SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+  if (!Subtarget->hasSSE41())
+    return SDValue();
+
+  if (!Op->hasOneUse())
+    return SDValue();
+
+  SDNode *N = Op.getNode();
+  DebugLoc DL = N->getDebugLoc();
+
+  SmallVector<SDValue, 8> Opnds;
+  // Maps each source vector to a bitmask of the element indices extracted
+  // from it. The mask is 64 bits wide so that a 32-element vector (e.g.
+  // v32i8) can be represented without shifting by the full width of the type.
+  DenseMap<SDValue, uint64_t> VecInMap;
+  EVT VT = MVT::Other;
+
+  // Recognize a special case where a vector is cast into a wide integer to
+  // test all 0s.
+  Opnds.push_back(N->getOperand(0));
+  Opnds.push_back(N->getOperand(1));
+
+  // BFS traverse all OR'd operands. Opnds grows while it is being walked, so
+  // its size is re-read on every iteration.
+  for (unsigned Slot = 0; Slot < Opnds.size(); ++Slot) {
+    // Take a copy of the operand: the push_back calls below may reallocate
+    // Opnds, which would invalidate any iterator or reference into it.
+    SDValue Cur = Opnds[Slot];
+
+    if (Cur.getOpcode() == ISD::OR) {
+      Opnds.push_back(Cur.getOperand(0));
+      Opnds.push_back(Cur.getOperand(1));
+      continue;
+    }
+
+    // Quit if a non-EXTRACT_VECTOR_ELT
+    if (Cur.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // Quit if without a constant index.
+    ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Cur.getOperand(1));
+    if (!CIdx)
+      return SDValue();
+
+    SDValue ExtractedFromVec = Cur.getOperand(0);
+    DenseMap<SDValue, uint64_t>::iterator M = VecInMap.find(ExtractedFromVec);
+    if (M == VecInMap.end()) {
+      VT = ExtractedFromVec.getValueType();
+      // Quit if not 128/256-bit vector.
+      if (!VT.is128BitVector() && !VT.is256BitVector())
+        return SDValue();
+      // Quit if the vector has more lanes than the 64-bit mask can track.
+      if (VT.getVectorNumElements() >= 64)
+        return SDValue();
+      // Quit if not the same type.
+      if (VecInMap.begin() != VecInMap.end() &&
+          VT != VecInMap.begin()->first.getValueType())
+        return SDValue();
+      M = VecInMap.insert(std::make_pair(ExtractedFromVec,
+                                         uint64_t(0))).first;
+    }
+
+    // Quit on an out-of-range index rather than shifting by it. All recorded
+    // vectors share the type VT, so its element count applies here.
+    uint64_t EltIdx = CIdx->getZExtValue();
+    if (EltIdx >= VT.getVectorNumElements())
+      return SDValue();
+    M->second |= uint64_t(1) << EltIdx;
+  }
+
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Not extracted from 128-/256-bit vector.");
+
+  uint64_t FullMask = (uint64_t(1) << VT.getVectorNumElements()) - 1;
+  SmallVector<SDValue, 8> VecIns;
+
+  for (DenseMap<SDValue, uint64_t>::const_iterator
+         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+    // Quit if not all elements are used.
+    if (I->second != FullMask)
+      return SDValue();
+    VecIns.push_back(I->first);
+  }
+
+  EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+  // Cast all vectors into TestVT for PTEST.
+  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+    VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+
+  // If more than one full vectors are evaluated, OR them first before PTEST.
+  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+    // Each iteration will OR 2 nodes and append the result until there is only
+    // 1 node left, i.e. the final OR'd value of all vectors. Both inputs are
+    // copied out before the push_back so that growth of VecIns is harmless.
+    SDValue LHS = VecIns[Slot];
+    SDValue RHS = VecIns[Slot + 1];
+    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+  }
+
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+                     VecIns.back(), VecIns.back());
+}
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
@@ -8285,7 +8677,33 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
unsigned Opcode = 0;
unsigned NumOperands = 0;
- switch (Op.getNode()->getOpcode()) {
+
+ // Truncate operations may prevent the merge of the SETCC instruction
+ // and the arithmetic instruction before it. Attempt to truncate the operands
+ // of the arithmetic instruction and use a reduced bit-width instruction.
+ bool NeedTruncation = false;
+ SDValue ArithOp = Op;
+ if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+ SDValue Arith = Op->getOperand(0);
+ // Both the trunc and the arithmetic op need to have one user each.
+ if (Arith->hasOneUse())
+ switch (Arith.getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ NeedTruncation = true;
+ ArithOp = Arith;
+ }
+ }
+ }
+
+ // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
+ // which may be the result of a CAST. We use the variable 'Op', which is the
+ // non-casted variable when we check for possible users.
+ switch (ArithOp.getOpcode()) {
case ISD::ADD:
// Due to an isel shortcoming, be conservative if this add is likely to be
// selected as part of a load-modify-store instruction. When the root node
@@ -8305,7 +8723,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
goto default_case;
if (ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
+ dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
// An add of one will be selected as an INC.
if (C->getAPIntValue() == 1) {
Opcode = X86ISD::INC;
@@ -8341,7 +8759,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
if (User->getOpcode() != ISD::BRCOND &&
User->getOpcode() != ISD::SETCC &&
- (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
+ !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
NonFlagUse = true;
break;
}
@@ -8362,14 +8780,20 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
goto default_case;
// Otherwise use a regular EFLAGS-setting instruction.
- switch (Op.getNode()->getOpcode()) {
+ switch (ArithOp.getOpcode()) {
default: llvm_unreachable("unexpected operator!");
- case ISD::SUB:
- Opcode = X86ISD::SUB;
- break;
- case ISD::OR: Opcode = X86ISD::OR; break;
+ case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
+ case ISD::OR: {
+ if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
+ if (EFLAGS.getNode())
+ return EFLAGS;
+ }
+ Opcode = X86ISD::OR;
+ break;
+ }
}
NumOperands = 2;
@@ -8387,19 +8811,40 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
break;
}
+ // If we found that truncation is beneficial, perform the truncation and
+ // update 'Op'.
+ if (NeedTruncation) {
+ EVT VT = Op.getValueType();
+ SDValue WideVal = Op->getOperand(0);
+ EVT WideVT = WideVal.getValueType();
+ unsigned ConvertedOp = 0;
+ // Use a target machine opcode to prevent further DAGCombine
+ // optimizations that may separate the arithmetic operations
+ // from the setcc node.
+ switch (WideVal.getOpcode()) {
+ default: break;
+ case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
+ case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
+ case ISD::AND: ConvertedOp = X86ISD::AND; break;
+ case ISD::OR: ConvertedOp = X86ISD::OR; break;
+ case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
+ }
+
+ if (ConvertedOp) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
+ SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
+ SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
+ Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
+ }
+ }
+ }
+
if (Opcode == 0)
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, Op.getValueType()));
- if (Opcode == X86ISD::CMP) {
- SDValue New = DAG.getNode(Opcode, dl, MVT::i32, Op.getOperand(0),
- Op.getOperand(1));
- // We can't replace usage of SUB with CMP.
- // The SUB node will be removed later because there is no use of it.
- return SDValue(New.getNode(), 0);
- }
-
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops;
for (unsigned i = 0; i != NumOperands; ++i)
@@ -8958,6 +9403,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
}
+ // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
+ // widen the cmov and push the truncate through. This avoids introducing a new
+ // branch during isel and doesn't add any extensions.
+ if (Op.getValueType() == MVT::i8 &&
+ Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+ SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
+ if (T1.getValueType() == T2.getValueType() &&
+ // Blacklist CopyFromReg to avoid partial register stalls.
+ T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
+ SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
+ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+ }
+ }
+
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
@@ -9312,7 +9772,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
Flag = Chain.getValue(1);
- Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
+ Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ SPTy).getValue(1);
SDValue Ops1[2] = { Chain.getValue(0), Chain };
return DAG.getMergeValues(Ops1, 2, dl);
@@ -9395,7 +9856,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
+ uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
uint8_t ArgMode;
// Decide which area this value should be read from.
@@ -9415,7 +9876,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!getTargetMachine().Options.UseSoftFloat &&
!(DAG.getMachineFunction()
- .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
+ .getFunction()->getFnAttributes()
+ .hasAttribute(Attributes::NoImplicitFloat)) &&
Subtarget->hasSSE1());
}
@@ -9446,7 +9908,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
false, false, false, 0);
}
-SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
// X86-64 va_list is a struct { i32, i32, i8*, i8* }.
assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
SDValue Chain = Op.getOperand(0);
@@ -9507,8 +9970,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
-SDValue
-X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IntNo) {
@@ -9896,62 +10358,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
Op.getOperand(1), Op.getOperand(2), DAG);
}
- // Fix vector shift instructions where the last operand is a non-immediate
- // i32 value.
- case Intrinsic::x86_mmx_pslli_w:
- case Intrinsic::x86_mmx_pslli_d:
- case Intrinsic::x86_mmx_pslli_q:
- case Intrinsic::x86_mmx_psrli_w:
- case Intrinsic::x86_mmx_psrli_d:
- case Intrinsic::x86_mmx_psrli_q:
- case Intrinsic::x86_mmx_psrai_w:
- case Intrinsic::x86_mmx_psrai_d: {
- SDValue ShAmt = Op.getOperand(2);
- if (isa<ConstantSDNode>(ShAmt))
- return SDValue();
-
- unsigned NewIntNo;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_mmx_pslli_w:
- NewIntNo = Intrinsic::x86_mmx_psll_w;
- break;
- case Intrinsic::x86_mmx_pslli_d:
- NewIntNo = Intrinsic::x86_mmx_psll_d;
- break;
- case Intrinsic::x86_mmx_pslli_q:
- NewIntNo = Intrinsic::x86_mmx_psll_q;
- break;
- case Intrinsic::x86_mmx_psrli_w:
- NewIntNo = Intrinsic::x86_mmx_psrl_w;
- break;
- case Intrinsic::x86_mmx_psrli_d:
- NewIntNo = Intrinsic::x86_mmx_psrl_d;
- break;
- case Intrinsic::x86_mmx_psrli_q:
- NewIntNo = Intrinsic::x86_mmx_psrl_q;
- break;
- case Intrinsic::x86_mmx_psrai_w:
- NewIntNo = Intrinsic::x86_mmx_psra_w;
- break;
- case Intrinsic::x86_mmx_psrai_d:
- NewIntNo = Intrinsic::x86_mmx_psra_d;
- break;
- }
-
- // The vector shift intrinsics with scalars uses 32b shift amounts but
- // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
- // to be zero.
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt,
- DAG.getConstant(0, MVT::i32));
-// FIXME this must be lowered to get rid of the invalid type.
-
- EVT VT = Op.getValueType();
- ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(NewIntNo, MVT::i32),
- Op.getOperand(1), ShAmt);
- }
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
@@ -10030,11 +10436,78 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
}
+ case Intrinsic::x86_fma_vfmadd_ps:
+ case Intrinsic::x86_fma_vfmadd_pd:
+ case Intrinsic::x86_fma_vfmsub_ps:
+ case Intrinsic::x86_fma_vfmsub_pd:
+ case Intrinsic::x86_fma_vfnmadd_ps:
+ case Intrinsic::x86_fma_vfnmadd_pd:
+ case Intrinsic::x86_fma_vfnmsub_ps:
+ case Intrinsic::x86_fma_vfnmsub_pd:
+ case Intrinsic::x86_fma_vfmaddsub_ps:
+ case Intrinsic::x86_fma_vfmaddsub_pd:
+ case Intrinsic::x86_fma_vfmsubadd_ps:
+ case Intrinsic::x86_fma_vfmsubadd_pd:
+ case Intrinsic::x86_fma_vfmadd_ps_256:
+ case Intrinsic::x86_fma_vfmadd_pd_256:
+ case Intrinsic::x86_fma_vfmsub_ps_256:
+ case Intrinsic::x86_fma_vfmsub_pd_256:
+ case Intrinsic::x86_fma_vfnmadd_ps_256:
+ case Intrinsic::x86_fma_vfnmadd_pd_256:
+ case Intrinsic::x86_fma_vfnmsub_ps_256:
+ case Intrinsic::x86_fma_vfnmsub_pd_256:
+ case Intrinsic::x86_fma_vfmaddsub_ps_256:
+ case Intrinsic::x86_fma_vfmaddsub_pd_256:
+ case Intrinsic::x86_fma_vfmsubadd_ps_256:
+ case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_fma_vfmadd_ps:
+ case Intrinsic::x86_fma_vfmadd_pd:
+ case Intrinsic::x86_fma_vfmadd_ps_256:
+ case Intrinsic::x86_fma_vfmadd_pd_256:
+ Opc = X86ISD::FMADD;
+ break;
+ case Intrinsic::x86_fma_vfmsub_ps:
+ case Intrinsic::x86_fma_vfmsub_pd:
+ case Intrinsic::x86_fma_vfmsub_ps_256:
+ case Intrinsic::x86_fma_vfmsub_pd_256:
+ Opc = X86ISD::FMSUB;
+ break;
+ case Intrinsic::x86_fma_vfnmadd_ps:
+ case Intrinsic::x86_fma_vfnmadd_pd:
+ case Intrinsic::x86_fma_vfnmadd_ps_256:
+ case Intrinsic::x86_fma_vfnmadd_pd_256:
+ Opc = X86ISD::FNMADD;
+ break;
+ case Intrinsic::x86_fma_vfnmsub_ps:
+ case Intrinsic::x86_fma_vfnmsub_pd:
+ case Intrinsic::x86_fma_vfnmsub_ps_256:
+ case Intrinsic::x86_fma_vfnmsub_pd_256:
+ Opc = X86ISD::FNMSUB;
+ break;
+ case Intrinsic::x86_fma_vfmaddsub_ps:
+ case Intrinsic::x86_fma_vfmaddsub_pd:
+ case Intrinsic::x86_fma_vfmaddsub_ps_256:
+ case Intrinsic::x86_fma_vfmaddsub_pd_256:
+ Opc = X86ISD::FMADDSUB;
+ break;
+ case Intrinsic::x86_fma_vfmsubadd_ps:
+ case Intrinsic::x86_fma_vfmsubadd_pd:
+ case Intrinsic::x86_fma_vfmsubadd_ps_256:
+ case Intrinsic::x86_fma_vfmsubadd_pd_256:
+ Opc = X86ISD::FMSUBADD;
+ break;
+ }
+
+ return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
+ }
}
}
-SDValue
-X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntNo) {
@@ -10072,21 +10545,21 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
DebugLoc dl = Op.getDebugLoc();
+ EVT PtrVT = getPointerTy();
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset =
- DAG.getConstant(TD->getPointerSize(),
- Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
- return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
- DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, PtrVT,
FrameAddr, Offset),
MachinePointerInfo(), false, false, false, 0);
}
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
- return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
RetAddrFI, MachinePointerInfo(), false, false, false, 0);
}
@@ -10108,7 +10581,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
- return DAG.getIntPtrConstant(2*TD->getPointerSize());
+ return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
@@ -10123,7 +10596,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
- DAG.getIntPtrConstant(TD->getPointerSize()));
+ DAG.getIntPtrConstant(RegInfo->getSlotSize()));
StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
false, false, 0);
@@ -10134,8 +10607,22 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
}
-SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
- SelectionDAG &DAG) const {
+/// Rewrite the node as X86ISD::EH_SJLJ_SETJMP, forwarding its first two
+/// operands unchanged. The new node produces an i32 value and an MVT::Other
+/// result.
+SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
+  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, Op.getDebugLoc(), ResultTys,
+                     Op0, Op1);
+}
+
+/// Rewrite the node as X86ISD::EH_SJLJ_LONGJMP, forwarding its first two
+/// operands unchanged. The new node has a single MVT::Other result.
+SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, Op.getDebugLoc(), MVT::Other,
+                     Op0, Op1);
+}
+
+static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
return Op.getOperand(0);
}
@@ -10148,6 +10635,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
DebugLoc dl = Op.getDebugLoc();
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
if (Subtarget->is64Bit()) {
SDValue OutChains[6];
@@ -10156,8 +10644,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
- const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10);
- const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11);
+ const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
+ const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
@@ -10230,7 +10718,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
- if (Attrs.paramHasAttr(Idx, Attribute::InReg))
+ if (Attrs.getParamAttributes(Idx).hasAttribute(Attributes::InReg))
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
@@ -10259,7 +10747,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
- const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg);
+ const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
OutChains[0] = DAG.getStore(Root, dl,
DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr),
@@ -10358,7 +10846,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
-SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
@@ -10392,8 +10880,7 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
-SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op,
- SelectionDAG &DAG) const {
+static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
@@ -10418,7 +10905,7 @@ SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op,
return Op;
}
-SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
unsigned NumBits = VT.getSizeInBits();
DebugLoc dl = Op.getDebugLoc();
@@ -10467,21 +10954,22 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
-SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
assert(Op.getValueType().is256BitVector() &&
Op.getValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
}
-SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
assert(Op.getValueType().is256BitVector() &&
Op.getValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
}
-SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Decompose 256-bit ops into smaller 128-bit ops.
@@ -10756,7 +11244,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Lower the "add/sub/mul with overflow" instruction into a regular ins plus
// a "setcc" instruction that checks the overflow flag. The "brcond" lowering
// looks for this combo and may remove the "setcc" instruction if the "setcc"
@@ -10871,7 +11359,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
}
// fall through
case MVT::v4i32:
@@ -10884,7 +11372,8 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
}
-SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
+static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
// Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
@@ -10929,8 +11418,8 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
}
-SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
- SelectionDAG &DAG) const {
+static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
@@ -10968,7 +11457,8 @@ SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
}
-SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
EVT T = Op.getValueType();
DebugLoc DL = Op.getDebugLoc();
unsigned Reg = 0;
@@ -10999,8 +11489,8 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
return cpOut;
}
-SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
- SelectionDAG &DAG) const {
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget->is64Bit() && "Result not type legalized?");
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue TheChain = Op.getOperand(0);
@@ -11018,8 +11508,7 @@ SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
return DAG.getMergeValues(Ops, 2, dl);
}
-SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
EVT SrcVT = Op.getOperand(0).getValueType();
EVT DstVT = Op.getValueType();
assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
@@ -11039,7 +11528,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
return SDValue();
}
-SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
SDNode *Node = Op.getNode();
DebugLoc dl = Node->getDebugLoc();
EVT T = Node->getValueType(0);
@@ -11112,9 +11601,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
- case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG);
- case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG);
- case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG);
+ case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, Subtarget, DAG);
+ case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
+ case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
@@ -11122,8 +11611,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
- case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
- case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG);
+ case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
+ case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
@@ -11135,8 +11624,11 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+ case ISD::TRUNCATE: return lowerTRUNCATE(Op, DAG);
+ case ISD::ZERO_EXTEND: return lowerZERO_EXTEND(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG);
case ISD::FABS: return LowerFABS(Op, DAG);
case ISD::FNEG: return LowerFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
@@ -11147,7 +11639,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
- case ISD::VACOPY: return LowerVACOPY(Op, DAG);
+ case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
@@ -11156,13 +11648,15 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
+ case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
+ case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::CTLZ: return LowerCTLZ(Op, DAG);
case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
- case ISD::MUL: return LowerMUL(Op, DAG);
+ case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, DAG);
@@ -11172,7 +11666,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO: return LowerXALUO(Op, DAG);
- case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG);
+ case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
case ISD::BITCAST: return LowerBITCAST(Op, DAG);
case ISD::ADDC:
case ISD::ADDE:
@@ -11265,6 +11759,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
+ case ISD::UINT_TO_FP: {
+   // Only the v2i32 -> v2f32 conversion is custom-legalized here: the
+   // sequence below hard-codes a v2i64 zero-extension and a v4f32 VFPROUND
+   // result, so bail out unless BOTH the source and the result type match.
+   if (N->getOperand(0).getValueType() != MVT::v2i32 ||
+       N->getValueType(0) != MVT::v2f32)
+     return;
+   SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
+                                N->getOperand(0));
+   // 0x4330000000000000 is the bit pattern of the double 2^52. OR-ing the
+   // zero-extended 32-bit value into its low mantissa bits and then
+   // subtracting 2^52 again yields the integer's value as a double.
+   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+                                    MVT::f64);
+   SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
+   SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
+                            DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
+   Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
+   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+   // Round the two doubles to single precision; the node is created with a
+   // v4f32 result type.
+   Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+   return;
+ }
+ case ISD::FP_ROUND: {
+ SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+ Results.push_back(V);
+ return;
+ }
case ISD::READCYCLECOUNTER: {
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue TheChain = N->getOperand(0);
@@ -11332,6 +11847,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_SWAP: {
unsigned Opc;
switch (N->getOpcode()) {
@@ -11354,6 +11873,18 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_LOAD_XOR:
Opc = X86ISD::ATOMXOR64_DAG;
break;
+ case ISD::ATOMIC_LOAD_MAX:
+ Opc = X86ISD::ATOMMAX64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_MIN:
+ Opc = X86ISD::ATOMMIN64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_UMAX:
+ Opc = X86ISD::ATOMUMAX64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_UMIN:
+ Opc = X86ISD::ATOMUMIN64_DAG;
+ break;
case ISD::ATOMIC_SWAP:
Opc = X86ISD::ATOMSWAP64_DAG;
break;
@@ -11420,11 +11951,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FHSUB: return "X86ISD::FHSUB";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FMAXC: return "X86ISD::FMAXC";
+ case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRCP: return "X86ISD::FRCP";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
+ case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
+ case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
@@ -11440,7 +11975,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VZEXT: return "X86ISD::VZEXT";
+ case X86ISD::VSEXT: return "X86ISD::VSEXT";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
+ case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
@@ -11507,6 +12045,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
+ case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
+ case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
}
}
@@ -11655,430 +12195,724 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
-// private utility function
-MachineBasicBlock *
-X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
- MachineBasicBlock *MBB,
- unsigned regOpc,
- unsigned immOpc,
- unsigned LoadOpc,
- unsigned CXchgOpc,
- unsigned notOpc,
- unsigned EAXreg,
- const TargetRegisterClass *RC,
- bool Invert) const {
- // For the atomic bitwise operator, we generate
- // thisMBB:
- // newMBB:
- // ld t1 = [bitinstr.addr]
- // op t2 = t1, [bitinstr.val]
- // not t3 = t2 (if Invert)
- // mov EAX = t1
- // lcs dest = [bitinstr.addr], t3 [EAX is implicit]
- // bz newMBB
- // fallthrough -->nextMBB
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+/// Utility function to emit xbegin specifying the start of an RTM region.
+static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
+ const TargetInstrInfo *TII) {
+ DebugLoc DL = MI->getDebugLoc();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ // For the v = xbegin(), we generate
+ //
+ // thisMBB:
+ // xbegin sinkMBB
+ //
+ // mainMBB:
+ // eax = -1
+ //
+ // sinkMBB:
+ // v = eax
- /// First build the CFG
- MachineFunction *F = MBB->getParent();
MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(MBBIter, newMBB);
- F->insert(MBBIter, nextMBB);
-
- // Transfer the remainder of thisMBB and its successor edges to nextMBB.
- nextMBB->splice(nextMBB->begin(), thisMBB,
- llvm::next(MachineBasicBlock::iterator(bInstr)),
- thisMBB->end());
- nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
-
- // Update thisMBB to fall through to newMBB
- thisMBB->addSuccessor(newMBB);
-
- // newMBB jumps to itself and fall through to nextMBB
- newMBB->addSuccessor(nextMBB);
- newMBB->addSuccessor(newMBB);
-
- // Insert instructions into newMBB based on incoming instruction
- assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
- "unexpected number of operands");
- DebugLoc dl = bInstr->getDebugLoc();
- MachineOperand& destOper = bInstr->getOperand(0);
- MachineOperand* argOpers[2 + X86::AddrNumOperands];
- int numArgs = bInstr->getNumOperands() - 1;
- for (int i=0; i < numArgs; ++i)
- argOpers[i] = &bInstr->getOperand(i+1);
-
- // x86 address has 4 operands: base, index, scale, and displacement
- int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
- int valArgIndx = lastAddrIndx + 1;
-
- unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
- MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
-
- unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
- assert((argOpers[valArgIndx]->isReg() ||
- argOpers[valArgIndx]->isImm()) &&
- "invalid operand");
- if (argOpers[valArgIndx]->isReg())
- MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
- else
- MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
- MIB.addReg(t1);
- (*MIB).addOperand(*argOpers[valArgIndx]);
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
- unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
- if (Invert) {
- MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2);
- }
- else
- t3 = t2;
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ // xbegin sinkMBB
+ // # fallthrough to mainMBB
+ // # abortion to sinkMBB
+ BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(sinkMBB);
+
+ // mainMBB:
+ // EAX = -1
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // EAX is live into the sinkMBB
+ sinkMBB->addLiveIn(X86::EAX);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::EAX);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
- MIB.addReg(t1);
+ MI->eraseFromParent();
+ return sinkMBB;
+}
- MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
- MIB.addReg(t3);
- assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
- (*MIB).setMemRefs(bInstr->memoperands_begin(),
- bInstr->memoperands_end());
+// Map an integer value type (i8, i16, i32 or i64) to the LCMPXCHG opcode of
+// the same width. Any other type is rejected via llvm_unreachable.
+static unsigned getCmpXChgOpcode(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("Invalid operand size!");
+  case MVT::i8:
+    return X86::LCMPXCHG8;
+  case MVT::i16:
+    return X86::LCMPXCHG16;
+  case MVT::i32:
+    return X86::LCMPXCHG32;
+  case MVT::i64:
+    return X86::LCMPXCHG64;
+  }
+}
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
- MIB.addReg(EAXreg);
+// Map an integer value type (i8, i16, i32 or i64) to the register-from-memory
+// MOV opcode of the same width. Any other type is rejected via
+// llvm_unreachable.
+static unsigned getLoadOpcode(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("Invalid operand size!");
+  case MVT::i8:
+    return X86::MOV8rm;
+  case MVT::i16:
+    return X86::MOV16rm;
+  case MVT::i32:
+    return X86::MOV32rm;
+  case MVT::i64:
+    return X86::MOV64rm;
+  }
+}
- // insert branch
- BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+// Translate an ATOMAND/ATOMOR/ATOMXOR pseudo opcode (8/16/32/64-bit) into the
+// plain register-register AND/OR/XOR opcode of the same width. Every other
+// opcode is rejected via llvm_unreachable.
+static unsigned getNonAtomicOpcode(unsigned Opc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unhandled atomic-load-op opcode!");
+  // Bitwise AND.
+  case X86::ATOMAND8:
+    return X86::AND8rr;
+  case X86::ATOMAND16:
+    return X86::AND16rr;
+  case X86::ATOMAND32:
+    return X86::AND32rr;
+  case X86::ATOMAND64:
+    return X86::AND64rr;
+  // Bitwise OR.
+  case X86::ATOMOR8:
+    return X86::OR8rr;
+  case X86::ATOMOR16:
+    return X86::OR16rr;
+  case X86::ATOMOR32:
+    return X86::OR32rr;
+  case X86::ATOMOR64:
+    return X86::OR64rr;
+  // Bitwise XOR.
+  case X86::ATOMXOR8:
+    return X86::XOR8rr;
+  case X86::ATOMXOR16:
+    return X86::XOR16rr;
+  case X86::ATOMXOR32:
+    return X86::XOR32rr;
+  case X86::ATOMXOR64:
+    return X86::XOR64rr;
+  }
+}
+
+// Translate an ATOMNAND/ATOMMAX/ATOMMIN/ATOMUMAX/ATOMUMIN pseudo opcode into
+// a pair of real opcodes: the primary one is returned and the companion one
+// is written to ExtraOpc.
+//   NAND:             returns ANDrr,   ExtraOpc = NOTr
+//   MAX/MIN/UMAX/UMIN: returns CMOVcc, ExtraOpc = CMPrr
+// Note that the 8-bit min/max forms return the 32-bit CMOV opcode while still
+// pairing it with CMP8rr. Every other opcode is rejected via
+// llvm_unreachable, leaving ExtraOpc untouched.
+static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
+                                               unsigned &ExtraOpc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unhandled atomic-load-op opcode!");
+  // NAND: AND followed by NOT.
+  case X86::ATOMNAND8:
+    ExtraOpc = X86::NOT8r;
+    return X86::AND8rr;
+  case X86::ATOMNAND16:
+    ExtraOpc = X86::NOT16r;
+    return X86::AND16rr;
+  case X86::ATOMNAND32:
+    ExtraOpc = X86::NOT32r;
+    return X86::AND32rr;
+  case X86::ATOMNAND64:
+    ExtraOpc = X86::NOT64r;
+    return X86::AND64rr;
+  // Signed max: CMP + CMOVL.
+  case X86::ATOMMAX8:
+    ExtraOpc = X86::CMP8rr;
+    return X86::CMOVL32rr;
+  case X86::ATOMMAX16:
+    ExtraOpc = X86::CMP16rr;
+    return X86::CMOVL16rr;
+  case X86::ATOMMAX32:
+    ExtraOpc = X86::CMP32rr;
+    return X86::CMOVL32rr;
+  case X86::ATOMMAX64:
+    ExtraOpc = X86::CMP64rr;
+    return X86::CMOVL64rr;
+  // Signed min: CMP + CMOVG.
+  case X86::ATOMMIN8:
+    ExtraOpc = X86::CMP8rr;
+    return X86::CMOVG32rr;
+  case X86::ATOMMIN16:
+    ExtraOpc = X86::CMP16rr;
+    return X86::CMOVG16rr;
+  case X86::ATOMMIN32:
+    ExtraOpc = X86::CMP32rr;
+    return X86::CMOVG32rr;
+  case X86::ATOMMIN64:
+    ExtraOpc = X86::CMP64rr;
+    return X86::CMOVG64rr;
+  // Unsigned max: CMP + CMOVB.
+  case X86::ATOMUMAX8:
+    ExtraOpc = X86::CMP8rr;
+    return X86::CMOVB32rr;
+  case X86::ATOMUMAX16:
+    ExtraOpc = X86::CMP16rr;
+    return X86::CMOVB16rr;
+  case X86::ATOMUMAX32:
+    ExtraOpc = X86::CMP32rr;
+    return X86::CMOVB32rr;
+  case X86::ATOMUMAX64:
+    ExtraOpc = X86::CMP64rr;
+    return X86::CMOVB64rr;
+  // Unsigned min: CMP + CMOVA.
+  case X86::ATOMUMIN8:
+    ExtraOpc = X86::CMP8rr;
+    return X86::CMOVA32rr;
+  case X86::ATOMUMIN16:
+    ExtraOpc = X86::CMP16rr;
+    return X86::CMOVA16rr;
+  case X86::ATOMUMIN32:
+    ExtraOpc = X86::CMP32rr;
+    return X86::CMOVA32rr;
+  case X86::ATOMUMIN64:
+    ExtraOpc = X86::CMP64rr;
+    return X86::CMOVA64rr;
+  }
+}
+
+// Translate a *6432 atomic pseudo opcode (64-bit data on a 32-bit target)
+// into two 32-bit opcodes: one is returned and the other is written to HiOpc.
+// NOTE(review): the returned opcode is presumably the one applied to the low
+// half and HiOpc the one applied to the high half (ADD pairs with ADC, SUB
+// with SBB); the callers are outside this view, so confirm there.
+// Every other opcode is rejected via llvm_unreachable.
+static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unhandled atomic-load-op opcode!");
+  // Operations that use the same opcode for both results.
+  case X86::ATOMAND6432:
+    return HiOpc = X86::AND32rr;
+  case X86::ATOMOR6432:
+    return HiOpc = X86::OR32rr;
+  case X86::ATOMXOR6432:
+    return HiOpc = X86::XOR32rr;
+  case X86::ATOMSWAP6432:
+    return HiOpc = X86::MOV32rr;
+  case X86::ATOMMAX6432:
+    return HiOpc = X86::SETLr;
+  case X86::ATOMMIN6432:
+    return HiOpc = X86::SETGr;
+  case X86::ATOMUMAX6432:
+    return HiOpc = X86::SETBr;
+  case X86::ATOMUMIN6432:
+    return HiOpc = X86::SETAr;
+  // Operations whose two opcodes differ (carry/borrow variants in HiOpc).
+  case X86::ATOMADD6432:
+    HiOpc = X86::ADC32rr;
+    return X86::ADD32rr;
+  case X86::ATOMSUB6432:
+    HiOpc = X86::SBB32rr;
+    return X86::SUB32rr;
+  }
+}
+
+// Variant of the *6432 opcode translation for pseudos that need a third
+// opcode. Only X86::ATOMNAND6432 is handled: it returns AND32rr, sets HiOpc to
+// AND32rr and ExtraOpc to NOT32r. Every other opcode is rejected via
+// llvm_unreachable without touching the output parameters.
+static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
+                                                   unsigned &HiOpc,
+                                                   unsigned &ExtraOpc) {
+  if (Opc != X86::ATOMNAND6432)
+    llvm_unreachable("Unhandled atomic-load-op opcode!");
+
+  HiOpc = X86::AND32rr;
+  ExtraOpc = X86::NOT32r;
+  return X86::AND32rr;
+}
- bInstr->eraseFromParent(); // The pseudo instruction is gone now.
- return nextMBB;
+// Return the CMOV_GR* pseudo opcode for an i8, i16 or i32 value type. Any
+// other type is rejected via llvm_unreachable.
+static unsigned getPseudoCMOVOpc(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    llvm_unreachable("Unknown CMOV opcode!");
+  case MVT::i8:
+    return X86::CMOV_GR8;
+  case MVT::i16:
+    return X86::CMOV_GR16;
+  case MVT::i32:
+    return X86::CMOV_GR32;
+  }
}
-// private utility function: 64 bit atomics on 32 bit host.
+// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
+// They will be translated into a spin-loop or compare-exchange loop from
+//
+// ...
+// dst = atomic-fetch-op MI.addr, MI.val
+// ...
+//
+// to
+//
+// ...
+// EAX = LOAD MI.addr
+// loop:
+// t1 = OP MI.val, EAX
+// LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+// JNE loop
+// sink:
+// dst = EAX
+// ...
MachineBasicBlock *
-X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
- MachineBasicBlock *MBB,
- unsigned regOpcL,
- unsigned regOpcH,
- unsigned immOpcL,
- unsigned immOpcH,
- bool Invert) const {
- // For the atomic bitwise operator, we generate
- // thisMBB (instructions are in pairs, except cmpxchg8b)
- // ld t1,t2 = [bitinstr.addr]
- // newMBB:
- // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
- // op t5, t6 <- out1, out2, [bitinstr.val]
- // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val])
- // neg t7, t8 < t5, t6 (if Invert)
- // mov ECX, EBX <- t5, t6
- // mov EAX, EDX <- t1, t2
- // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit]
- // mov t3, t4 <- EAX, EDX
- // bz newMBB
- // result in out1, out2
- // fallthrough -->nextMBB
-
- const TargetRegisterClass *RC = &X86::GR32RegClass;
- const unsigned LoadOpc = X86::MOV32rm;
- const unsigned NotOpc = X86::NOT32r;
+X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ DebugLoc DL = MI->getDebugLoc();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 &&
+ "Unexpected number of operands");
+
+ assert(MI->hasOneMemOperand() &&
+ "Expected atomic-load-op to have one memoperand");
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ unsigned DstReg, SrcReg;
+ unsigned MemOpndSlot;
+
+ unsigned CurOp = 0;
+
+ DstReg = MI->getOperand(CurOp++).getReg();
+ MemOpndSlot = CurOp;
+ CurOp += X86::AddrNumOperands;
+ SrcReg = MI->getOperand(CurOp++).getReg();
+
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ MVT::SimpleValueType VT = *RC->vt_begin();
+ unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT);
+
+ unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
+ unsigned LOADOpc = getLoadOpcode(VT);
+
+ // For the atomic load-arith operator, we generate
+ //
+ // thisMBB:
+ // EAX = LOAD [MI.addr]
+ // mainMBB:
+ // t1 = OP MI.val, EAX
+ // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+ // JNE mainMBB
+ // sinkMBB:
- /// First build the CFG
- MachineFunction *F = MBB->getParent();
MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(MBBIter, newMBB);
- F->insert(MBBIter, nextMBB);
-
- // Transfer the remainder of thisMBB and its successor edges to nextMBB.
- nextMBB->splice(nextMBB->begin(), thisMBB,
- llvm::next(MachineBasicBlock::iterator(bInstr)),
- thisMBB->end());
- nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
-
- // Update thisMBB to fall through to newMBB
- thisMBB->addSuccessor(newMBB);
-
- // newMBB jumps to itself and fall through to nextMBB
- newMBB->addSuccessor(nextMBB);
- newMBB->addSuccessor(newMBB);
-
- DebugLoc dl = bInstr->getDebugLoc();
- // Insert instructions into newMBB based on incoming instruction
- // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
- assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
- "unexpected number of operands");
- MachineOperand& dest1Oper = bInstr->getOperand(0);
- MachineOperand& dest2Oper = bInstr->getOperand(1);
- MachineOperand* argOpers[2 + X86::AddrNumOperands];
- for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
- argOpers[i] = &bInstr->getOperand(i+2);
-
- // We use some of the operands multiple times, so conservatively just
- // clear any kill flags that might be present.
- if (argOpers[i]->isReg() && argOpers[i]->isUse())
- argOpers[i]->setIsKill(false);
- }
-
- // x86 address has 5 operands: base, index, scale, displacement, and segment.
- int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
-
- unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
- MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
- unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
- MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
- // add 4 to displacement.
- for (int i=0; i <= lastAddrIndx-2; ++i)
- (*MIB).addOperand(*argOpers[i]);
- MachineOperand newOp3 = *(argOpers[3]);
- if (newOp3.isImm())
- newOp3.setImm(newOp3.getImm()+4);
- else
- newOp3.setOffset(newOp3.getOffset()+4);
- (*MIB).addOperand(newOp3);
- (*MIB).addOperand(*argOpers[lastAddrIndx]);
-
- // t3/4 are defined later, at the bottom of the loop
- unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
- unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
- BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
- .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
- BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
- .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
-
- // The subsequent operations should be using the destination registers of
- // the PHI instructions.
- t1 = dest1Oper.getReg();
- t2 = dest2Oper.getReg();
-
- int valArgIndx = lastAddrIndx + 1;
- assert((argOpers[valArgIndx]->isReg() ||
- argOpers[valArgIndx]->isImm()) &&
- "invalid operand");
- unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
- unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
- if (argOpers[valArgIndx]->isReg())
- MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
- else
- MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
- if (regOpcL != X86::MOV32rr)
- MIB.addReg(t1);
- (*MIB).addOperand(*argOpers[valArgIndx]);
- assert(argOpers[valArgIndx + 1]->isReg() ==
- argOpers[valArgIndx]->isReg());
- assert(argOpers[valArgIndx + 1]->isImm() ==
- argOpers[valArgIndx]->isImm());
- if (argOpers[valArgIndx + 1]->isReg())
- MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
- else
- MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
- if (regOpcH != X86::MOV32rr)
- MIB.addReg(t2);
- (*MIB).addOperand(*argOpers[valArgIndx + 1]);
-
- unsigned t7, t8;
- if (Invert) {
- t7 = F->getRegInfo().createVirtualRegister(RC);
- t8 = F->getRegInfo().createVirtualRegister(RC);
- MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5);
- MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6);
- } else {
- t7 = t5;
- t8 = t6;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+
+ MachineInstrBuilder MIB;
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ thisMBB->addSuccessor(mainMBB);
+
+ // mainMBB:
+ MachineBasicBlock *origMainMBB = mainMBB;
+ mainMBB->addLiveIn(AccPhyReg);
+
+ // Copy AccPhyReg as it is used more than once.
+ unsigned AccReg = MRI.createVirtualRegister(RC);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg)
+ .addReg(AccPhyReg);
+
+ unsigned t1 = MRI.createVirtualRegister(RC);
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+ case X86::ATOMAND8:
+ case X86::ATOMAND16:
+ case X86::ATOMAND32:
+ case X86::ATOMAND64:
+ case X86::ATOMOR8:
+ case X86::ATOMOR16:
+ case X86::ATOMOR32:
+ case X86::ATOMOR64:
+ case X86::ATOMXOR8:
+ case X86::ATOMXOR16:
+ case X86::ATOMXOR32:
+ case X86::ATOMXOR64: {
+ unsigned ARITHOpc = getNonAtomicOpcode(Opc);
+ BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg)
+ .addReg(AccReg);
+ break;
+ }
+ case X86::ATOMNAND8:
+ case X86::ATOMNAND16:
+ case X86::ATOMNAND32:
+ case X86::ATOMNAND64: {
+ unsigned t2 = MRI.createVirtualRegister(RC);
+ unsigned NOTOpc;
+ unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
+ BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg)
+ .addReg(AccReg);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2);
+ break;
+ }
+ case X86::ATOMMAX8:
+ case X86::ATOMMAX16:
+ case X86::ATOMMAX32:
+ case X86::ATOMMAX64:
+ case X86::ATOMMIN8:
+ case X86::ATOMMIN16:
+ case X86::ATOMMIN32:
+ case X86::ATOMMIN64:
+ case X86::ATOMUMAX8:
+ case X86::ATOMUMAX16:
+ case X86::ATOMUMAX32:
+ case X86::ATOMUMAX64:
+ case X86::ATOMUMIN8:
+ case X86::ATOMUMIN16:
+ case X86::ATOMUMIN32:
+ case X86::ATOMUMIN64: {
+ unsigned CMPOpc;
+ unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
+
+ BuildMI(mainMBB, DL, TII->get(CMPOpc))
+ .addReg(SrcReg)
+ .addReg(AccReg);
+
+ if (Subtarget->hasCMov()) {
+ if (VT != MVT::i8) {
+ // Native support
+ BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1)
+ .addReg(SrcReg)
+ .addReg(AccReg);
+ } else {
+ // Promote i8 to i32 to use CMOV32
+ const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32);
+ unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
+ unsigned AccReg32 = MRI.createVirtualRegister(RC32);
+ unsigned t2 = MRI.createVirtualRegister(RC32);
+
+ unsigned Undef = MRI.createVirtualRegister(RC32);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
+
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
+ .addReg(Undef)
+ .addReg(SrcReg)
+ .addImm(X86::sub_8bit);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
+ .addReg(Undef)
+ .addReg(AccReg)
+ .addImm(X86::sub_8bit);
+
+ BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
+ .addReg(SrcReg32)
+ .addReg(AccReg32);
+
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1)
+ .addReg(t2, 0, X86::sub_8bit);
+ }
+ } else {
+ // Use pseudo select and lower them.
+ assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
+ "Invalid atomic-load-op transformation!");
+ unsigned SelOpc = getPseudoCMOVOpc(VT);
+ X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
+ assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
+ MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1)
+ .addReg(SrcReg).addReg(AccReg)
+ .addImm(CC);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ }
+ break;
+ }
}
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
- MIB.addReg(t1);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
- MIB.addReg(t2);
+ // Copy AccPhyReg back from virtual register.
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg)
+ .addReg(AccReg);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
- MIB.addReg(t7);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
- MIB.addReg(t8);
+ MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.addReg(t1);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
- MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
+ BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
- assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
- (*MIB).setMemRefs(bInstr->memoperands_begin(),
- bInstr->memoperands_end());
+ mainMBB->addSuccessor(origMainMBB);
+ mainMBB->addSuccessor(sinkMBB);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
- MIB.addReg(X86::EAX);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
- MIB.addReg(X86::EDX);
+ // sinkMBB:
+ sinkMBB->addLiveIn(AccPhyReg);
- // insert branch
- BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), DstReg)
+ .addReg(AccPhyReg);
- bInstr->eraseFromParent(); // The pseudo instruction is gone now.
- return nextMBB;
+ MI->eraseFromParent();
+ return sinkMBB;
}
-// private utility function
+// EmitAtomicLoadArith6432 - emit the code sequence for the *6432 pseudo atomic
+// instructions (64-bit operations on a 32-bit host). Each one is translated
+// into a compare-exchange loop built around LCMPXCHG8B, from
+//
+// ...
+// dst = atomic-fetch-op MI.addr, MI.val
+// ...
+//
+// to
+//
+// ...
+// EAX = LOAD [MI.addr + 0]
+// EDX = LOAD [MI.addr + 4]
+// loop:
+// EBX = OP MI.val.lo, EAX
+// ECX = OP MI.val.hi, EDX
+// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
+// JNE loop
+// sink:
+// dst = EDX:EAX
+// ...
MachineBasicBlock *
-X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
- MachineBasicBlock *MBB,
- unsigned cmovOpc) const {
- // For the atomic min/max operator, we generate
- // thisMBB:
- // newMBB:
- // ld t1 = [min/max.addr]
- // mov t2 = [min/max.val]
- // cmp t1, t2
- // cmov[cond] t2 = t1
- // mov EAX = t1
- // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
- // bz newMBB
- // fallthrough -->nextMBB
- //
+X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ DebugLoc DL = MI->getDebugLoc();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
+ "Unexpected number of operands"); // 2 dst regs + address + 2 src regs
+
+ assert(MI->hasOneMemOperand() &&
+ "Expected atomic-load-op32 to have one memoperand");
+
+ // Memory references of MI; re-attached to every memory instruction emitted below.
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ unsigned DstLoReg, DstHiReg;
+ unsigned SrcLoReg, SrcHiReg;
+ unsigned MemOpndSlot;
+
+ unsigned CurOp = 0; // operand cursor: dst lo, dst hi, address, src lo, src hi
+
+ DstLoReg = MI->getOperand(CurOp++).getReg();
+ DstHiReg = MI->getOperand(CurOp++).getReg();
+ MemOpndSlot = CurOp;
+ CurOp += X86::AddrNumOperands;
+ SrcLoReg = MI->getOperand(CurOp++).getReg();
+ SrcHiReg = MI->getOperand(CurOp++).getReg();
+
+ const TargetRegisterClass *RC = &X86::GR32RegClass;
+ const TargetRegisterClass *RC8 = &X86::GR8RegClass;
+
+ unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
+ unsigned LOADOpc = X86::MOV32rm;
+
+ // For the atomic load-arith operator, we generate
+ //
+ // thisMBB:
+ // EAX = LOAD [MI.addr + 0]
+ // EDX = LOAD [MI.addr + 4]
+ // mainMBB:
+ // EBX = OP MI.vallo, EAX
+ // ECX = OP MI.valhi, EDX
+ // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
+ // JNE mainMBB
+ // sinkMBB:
- /// First build the CFG
- MachineFunction *F = MBB->getParent();
MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(MBBIter, newMBB);
- F->insert(MBBIter, nextMBB);
-
- // Transfer the remainder of thisMBB and its successor edges to nextMBB.
- nextMBB->splice(nextMBB->begin(), thisMBB,
- llvm::next(MachineBasicBlock::iterator(mInstr)),
- thisMBB->end());
- nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
-
- // Update thisMBB to fall through to newMBB
- thisMBB->addSuccessor(newMBB);
-
- // newMBB jumps to newMBB and fall through to nextMBB
- newMBB->addSuccessor(nextMBB);
- newMBB->addSuccessor(newMBB);
-
- DebugLoc dl = mInstr->getDebugLoc();
- // Insert instructions into newMBB based on incoming instruction
- assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
- "unexpected number of operands");
- MachineOperand& destOper = mInstr->getOperand(0);
- MachineOperand* argOpers[2 + X86::AddrNumOperands];
- int numArgs = mInstr->getNumOperands() - 1;
- for (int i=0; i < numArgs; ++i)
- argOpers[i] = &mInstr->getOperand(i+1);
-
- // x86 address has 4 operands: base, index, scale, and displacement
- int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
- int valArgIndx = lastAddrIndx + 1;
-
- unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
- MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
-
- // We only support register and immediate values
- assert((argOpers[valArgIndx]->isReg() ||
- argOpers[valArgIndx]->isImm()) &&
- "invalid operand");
-
- unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
- if (argOpers[valArgIndx]->isReg())
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
- else
- MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
- (*MIB).addOperand(*argOpers[valArgIndx]);
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
- MIB.addReg(t1);
+ MachineInstrBuilder MIB;
- MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
- MIB.addReg(t1);
- MIB.addReg(t2);
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ // Lo half: EAX = LOAD [MI.addr + 0]
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Hi half: EDX = LOAD [MI.addr + 4] (same address, displacement bumped by 4)
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
+ else
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
- // Generate movc
- unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
- MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
- MIB.addReg(t2);
- MIB.addReg(t1);
+ thisMBB->addSuccessor(mainMBB);
+
+ // mainMBB:
+ MachineBasicBlock *origMainMBB = mainMBB; // loop head; mainMBB itself may be replaced by EmitLoweredSelect below
+ mainMBB->addLiveIn(X86::EAX);
+ mainMBB->addLiveIn(X86::EDX);
+
+ // Copy EDX:EAX as they are used more than once.
+ unsigned LoReg = MRI.createVirtualRegister(RC);
+ unsigned HiReg = MRI.createVirtualRegister(RC);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX);
+
+ unsigned t1L = MRI.createVirtualRegister(RC); // low half of the new value, later copied to EBX
+ unsigned t1H = MRI.createVirtualRegister(RC); // high half of the new value, later copied to ECX
+
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
+ case X86::ATOMAND6432:
+ case X86::ATOMOR6432:
+ case X86::ATOMXOR6432:
+ case X86::ATOMADD6432:
+ case X86::ATOMSUB6432: {
+ unsigned HiOpc;
+ unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(LoReg).addReg(SrcLoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(HiReg).addReg(SrcHiReg);
+ break;
+ }
+ case X86::ATOMNAND6432: {
+ unsigned HiOpc, NOTOpc;
+ unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
+ unsigned t2L = MRI.createVirtualRegister(RC);
+ unsigned t2H = MRI.createVirtualRegister(RC);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H);
+ break;
+ }
+ case X86::ATOMMAX6432:
+ case X86::ATOMMIN6432:
+ case X86::ATOMUMAX6432:
+ case X86::ATOMUMIN6432: {
+ unsigned HiOpc;
+ unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+ unsigned cL = MRI.createVirtualRegister(RC8);
+ unsigned cH = MRI.createVirtualRegister(RC8);
+ unsigned cL32 = MRI.createVirtualRegister(RC);
+ unsigned cH32 = MRI.createVirtualRegister(RC);
+ unsigned cc = MRI.createVirtualRegister(RC);
+ // cl := cmp src_lo, lo
+ BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
+ .addReg(SrcLoReg).addReg(LoReg);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
+ BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
+ // ch := cmp src_hi, hi
+ BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
+ .addReg(SrcHiReg).addReg(HiReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
+ BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
+ // cc := if (src_hi == hi) ? cl : ch;
+ if (Subtarget->hasCMov()) {
+ BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
+ .addReg(cH32).addReg(cL32);
+ } else {
+ MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
+ .addReg(cH32).addReg(cL32)
+ .addImm(X86::COND_E);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ }
+ BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
+ if (Subtarget->hasCMov()) {
+ BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L)
+ .addReg(SrcLoReg).addReg(LoReg);
+ BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H)
+ .addReg(SrcHiReg).addReg(HiReg);
+ } else {
+ MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L)
+ .addReg(SrcLoReg).addReg(LoReg)
+ .addImm(X86::COND_NE);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H)
+ .addReg(SrcHiReg).addReg(HiReg)
+ .addImm(X86::COND_NE);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ }
+ break;
+ }
+ case X86::ATOMSWAP6432: {
+ unsigned HiOpc;
+ unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg);
+ break;
+ }
+ }
+
+ // Copy EDX:EAX back from HiReg:LoReg (implicitly used by LCMPXCHG8B)
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg);
+ // Copy ECX:EBX from t1H:t1L (the new value, also implicitly used by LCMPXCHG8B)
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H);
+
+ MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
- // Cmp and exchange if none has modified the memory location
- MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
- MIB.addReg(t3);
- assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
- (*MIB).setMemRefs(mInstr->memoperands_begin(),
- mInstr->memoperands_end());
+ BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB); // loop back to the head of the main block
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
- MIB.addReg(X86::EAX);
+ mainMBB->addSuccessor(origMainMBB);
+ mainMBB->addSuccessor(sinkMBB);
- // insert branch
- BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+ // sinkMBB: dst = EDX:EAX
+ sinkMBB->addLiveIn(X86::EAX);
+ sinkMBB->addLiveIn(X86::EDX);
+
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), DstLoReg)
+ .addReg(X86::EAX);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL, // inserted at begin() as well, so it lands ahead of the lo copy
+ TII->get(TargetOpcode::COPY), DstHiReg)
+ .addReg(X86::EDX);
- mInstr->eraseFromParent(); // The pseudo instruction is gone now.
- return nextMBB;
+ MI->eraseFromParent();
+ return sinkMBB;
}
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
-MachineBasicBlock *
-X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned numArgs, bool memArg) const {
- assert(Subtarget->hasSSE42() &&
- "Target must have SSE4.2 or AVX features enabled");
+static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
+ unsigned Opc; // real PCMPxSTRM opcode that replaces the pseudo MI
+ switch (MI->getOpcode()) { // map each REG/MEM pseudo to its rr/rm instruction
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
+ case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
+ case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
+ case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
+ case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
+ case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
+ case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
+ case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
+ }
DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+ unsigned NumArgs = MI->getNumOperands();
+ for (unsigned i = 1; i < NumArgs; ++i) { // start at 1: operand 0 is the result register
+ MachineOperand &Op = MI->getOperand(i);
+ if (!(Op.isReg() && Op.isImplicit())) // implicit register operands are not copied over
+ MIB.addOperand(Op);
+ }
+ if (MI->hasOneMemOperand())
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ BuildMI(*BB, MI, dl, // copy the result out of XMM0 into MI's destination register
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::XMM0);
+
+ MI->eraseFromParent();
+ return BB;
+}
+
+// FIXME: Custom handling because TableGen doesn't support multiple implicit
+// defs in an instruction pattern. Replaces a PCMPxSTRI pseudo with the real
+static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
unsigned Opc;
- if (!Subtarget->hasAVX()) {
- if (memArg)
- Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
- else
- Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
- } else {
- if (memArg)
- Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
- else
- Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
+ switch (MI->getOpcode()) { // map each REG/MEM pseudo to its rr/rm instruction
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
+ case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
+ case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
+ case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
+ case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
+ case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
+ case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
+ case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
}
+ DebugLoc dl = MI->getDebugLoc();
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
- for (unsigned i = 0; i < numArgs; ++i) {
- MachineOperand &Op = MI->getOperand(i+1);
+
+ unsigned NumArgs = MI->getNumOperands(); // operand 0 (the result) is skipped by the loop below
+ for (unsigned i = 1; i < NumArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
MIB.addOperand(Op);
}
+ if (MI->hasOneMemOperand())
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
BuildMI(*BB, MI, dl,
TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addReg(X86::XMM0);
+ .addReg(X86::ECX); // the result is copied out of ECX
MI->eraseFromParent();
return BB;
}
-MachineBasicBlock *
-X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
+static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII,
+ const X86Subtarget* Subtarget) {
DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
// Address into RAX/EAX, other two args into ECX, EDX.
unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
@@ -12767,6 +13601,203 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
}
MachineBasicBlock *
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ // Memory references of MI; re-attached to the store of the resume address.
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ unsigned DstReg;
+ unsigned MemOpndSlot = 0;
+
+ unsigned CurOp = 0;
+
+ DstReg = MI->getOperand(CurOp++).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ assert(RC->hasType(MVT::i32) && "Invalid destination!");
+ unsigned mainDstReg = MRI.createVirtualRegister(RC); // v_main: value of v on the direct path
+ unsigned restoreDstReg = MRI.createVirtualRegister(RC); // v_restore: value of v on the restore path
+
+ MemOpndSlot = CurOp; // the address operands (buf) follow the destination
+
+ MVT PVT = getPointerTy();
+ assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+ "Invalid Pointer Size!");
+
+ // Lowers the EH_SjLj_SetJmp pseudo. For v = setjmp(buf), we generate
+ //
+ // thisMBB:
+ // buf[LabelOffset] = restoreMBB
+ // SjLjSetup restoreMBB
+ //
+ // mainMBB:
+ // v_main = 0
+ //
+ // sinkMBB:
+ // v = phi(main, restore)
+ //
+ // restoreMBB:
+ // v_restore = 1
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+ MF->push_back(restoreMBB); // restoreMBB is appended at the end of the function
+
+ MachineInstrBuilder MIB;
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ unsigned PtrStoreOpc = 0;
+ unsigned LabelReg = 0;
+ const int64_t LabelOffset = 1 * PVT.getStoreSize(); // pointer-sized slot 1 of buf holds the resume address
+ Reloc::Model RM = getTargetMachine().getRelocationModel();
+ bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
+ (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
+
+ // Prepare IP either in reg or imm: imm only when UseImmLabel, otherwise LEA the restoreMBB address into LabelReg.
+ if (!UseImmLabel) {
+ PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+ LabelReg = MRI.createVirtualRegister(PtrRC);
+ if (Subtarget->is64Bit()) {
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB)
+ .addReg(0);
+ } else {
+ const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
+ .addReg(XII->getGlobalBaseReg(MF))
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
+ .addReg(0);
+ }
+ } else
+ PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+ // Store IP: write the restoreMBB address to MI's address with displacement + LabelOffset.
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
+ else
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ }
+ if (!UseImmLabel)
+ MIB.addReg(LabelReg);
+ else
+ MIB.addMBB(restoreMBB);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Setup: EH_SjLj_Setup names restoreMBB and carries the no-preserved register mask.
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
+ .addMBB(restoreMBB);
+ MIB.addRegMask(RegInfo->getNoPreservedMask());
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(restoreMBB);
+
+ // mainMBB:
+ // v_main = 0 (MOV32r0 defines mainDstReg, a virtual register rather than EAX)
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB: v = phi(v_main from mainMBB, v_restore from restoreMBB)
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(X86::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(restoreDstReg).addMBB(restoreMBB);
+
+ // restoreMBB: v_restore = 1, then jump to sinkMBB
+ BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
+ BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
+ restoreMBB->addSuccessor(sinkMBB);
+
+ MI->eraseFromParent();
+ return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Memory references of the EH_SjLj_LongJmp pseudo being lowered; reused by every reload below.
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ MVT PVT = getPointerTy();
+ assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+ "Invalid Pointer Size!");
+
+ const TargetRegisterClass *RC =
+ (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+ unsigned Tmp = MRI.createVirtualRegister(RC); // receives the reloaded IP (jump target)
+ // Since FP is only updated here but NOT referenced, it's treated as GPR.
+ unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+ unsigned SP = RegInfo->getStackRegister();
+
+ MachineInstrBuilder MIB;
+
+ const int64_t LabelOffset = 1 * PVT.getStoreSize(); // pointer-sized slot 1: resume address
+ const int64_t SPOffset = 2 * PVT.getStoreSize(); // pointer-sized slot 2: stack pointer
+
+ unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+ unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
+
+ // Reload FP from MI's address operands as given (no extra displacement)
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Reload IP into Tmp from displacement + LabelOffset
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(i), LabelOffset);
+ else
+ MIB.addOperand(MI->getOperand(i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Reload SP from displacement + SPOffset
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(i), SPOffset);
+ else
+ MIB.addOperand(MI->getOperand(i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Jump indirectly through Tmp (the reloaded IP)
+ BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
+
+ MI->eraseFromParent();
+ return MBB;
+}
+
+MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
switch (MI->getOpcode()) {
@@ -12895,198 +13926,101 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::PCMPESTRM128REG:
case X86::VPCMPESTRM128REG:
case X86::PCMPESTRM128MEM:
- case X86::VPCMPESTRM128MEM: {
- unsigned NumArgs;
- bool MemArg;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::PCMPISTRM128REG:
- case X86::VPCMPISTRM128REG:
- NumArgs = 3; MemArg = false; break;
- case X86::PCMPISTRM128MEM:
- case X86::VPCMPISTRM128MEM:
- NumArgs = 3; MemArg = true; break;
- case X86::PCMPESTRM128REG:
- case X86::VPCMPESTRM128REG:
- NumArgs = 5; MemArg = false; break;
- case X86::PCMPESTRM128MEM:
- case X86::VPCMPESTRM128MEM:
- NumArgs = 5; MemArg = true; break;
- }
- return EmitPCMP(MI, BB, NumArgs, MemArg);
- }
-
- // Thread synchronization.
+ case X86::VPCMPESTRM128MEM:
+ assert(Subtarget->hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
+
+ // String/text processing lowering.
+ case X86::PCMPISTRIREG:
+ case X86::VPCMPISTRIREG:
+ case X86::PCMPISTRIMEM:
+ case X86::VPCMPISTRIMEM:
+ case X86::PCMPESTRIREG:
+ case X86::VPCMPESTRIREG:
+ case X86::PCMPESTRIMEM:
+ case X86::VPCMPESTRIMEM:
+ assert(Subtarget->hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
+
+ // Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB);
+ return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
- // Atomic Lowering.
- case X86::ATOMAND32:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
- X86::AND32ri, X86::MOV32rm,
- X86::LCMPXCHG32,
- X86::NOT32r, X86::EAX,
- &X86::GR32RegClass);
- case X86::ATOMOR32:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
- X86::OR32ri, X86::MOV32rm,
- X86::LCMPXCHG32,
- X86::NOT32r, X86::EAX,
- &X86::GR32RegClass);
- case X86::ATOMXOR32:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
- X86::XOR32ri, X86::MOV32rm,
- X86::LCMPXCHG32,
- X86::NOT32r, X86::EAX,
- &X86::GR32RegClass);
- case X86::ATOMNAND32:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
- X86::AND32ri, X86::MOV32rm,
- X86::LCMPXCHG32,
- X86::NOT32r, X86::EAX,
- &X86::GR32RegClass, true);
- case X86::ATOMMIN32:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
- case X86::ATOMMAX32:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
- case X86::ATOMUMIN32:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
- case X86::ATOMUMAX32:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
+ // xbegin
+ case X86::XBEGIN:
+ return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
+ // Atomic Lowering.
+ case X86::ATOMAND8:
case X86::ATOMAND16:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
- X86::AND16ri, X86::MOV16rm,
- X86::LCMPXCHG16,
- X86::NOT16r, X86::AX,
- &X86::GR16RegClass);
+ case X86::ATOMAND32:
+ case X86::ATOMAND64:
+ // Fall through
+ case X86::ATOMOR8:
case X86::ATOMOR16:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
- X86::OR16ri, X86::MOV16rm,
- X86::LCMPXCHG16,
- X86::NOT16r, X86::AX,
- &X86::GR16RegClass);
+ case X86::ATOMOR32:
+ case X86::ATOMOR64:
+ // Fall through
case X86::ATOMXOR16:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
- X86::XOR16ri, X86::MOV16rm,
- X86::LCMPXCHG16,
- X86::NOT16r, X86::AX,
- &X86::GR16RegClass);
- case X86::ATOMNAND16:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
- X86::AND16ri, X86::MOV16rm,
- X86::LCMPXCHG16,
- X86::NOT16r, X86::AX,
- &X86::GR16RegClass, true);
- case X86::ATOMMIN16:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
- case X86::ATOMMAX16:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
- case X86::ATOMUMIN16:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
- case X86::ATOMUMAX16:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
-
- case X86::ATOMAND8:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
- X86::AND8ri, X86::MOV8rm,
- X86::LCMPXCHG8,
- X86::NOT8r, X86::AL,
- &X86::GR8RegClass);
- case X86::ATOMOR8:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
- X86::OR8ri, X86::MOV8rm,
- X86::LCMPXCHG8,
- X86::NOT8r, X86::AL,
- &X86::GR8RegClass);
case X86::ATOMXOR8:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
- X86::XOR8ri, X86::MOV8rm,
- X86::LCMPXCHG8,
- X86::NOT8r, X86::AL,
- &X86::GR8RegClass);
- case X86::ATOMNAND8:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
- X86::AND8ri, X86::MOV8rm,
- X86::LCMPXCHG8,
- X86::NOT8r, X86::AL,
- &X86::GR8RegClass, true);
- // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
- // This group is for 64-bit host.
- case X86::ATOMAND64:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
- X86::AND64ri32, X86::MOV64rm,
- X86::LCMPXCHG64,
- X86::NOT64r, X86::RAX,
- &X86::GR64RegClass);
- case X86::ATOMOR64:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
- X86::OR64ri32, X86::MOV64rm,
- X86::LCMPXCHG64,
- X86::NOT64r, X86::RAX,
- &X86::GR64RegClass);
+ case X86::ATOMXOR32:
case X86::ATOMXOR64:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
- X86::XOR64ri32, X86::MOV64rm,
- X86::LCMPXCHG64,
- X86::NOT64r, X86::RAX,
- &X86::GR64RegClass);
+ // Fall through
+ case X86::ATOMNAND8:
+ case X86::ATOMNAND16:
+ case X86::ATOMNAND32:
case X86::ATOMNAND64:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
- X86::AND64ri32, X86::MOV64rm,
- X86::LCMPXCHG64,
- X86::NOT64r, X86::RAX,
- &X86::GR64RegClass, true);
- case X86::ATOMMIN64:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
+ // Fall through
+ case X86::ATOMMAX8:
+ case X86::ATOMMAX16:
+ case X86::ATOMMAX32:
case X86::ATOMMAX64:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
- case X86::ATOMUMIN64:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
+ // Fall through
+ case X86::ATOMMIN8:
+ case X86::ATOMMIN16:
+ case X86::ATOMMIN32:
+ case X86::ATOMMIN64:
+ // Fall through
+ case X86::ATOMUMAX8:
+ case X86::ATOMUMAX16:
+ case X86::ATOMUMAX32:
case X86::ATOMUMAX64:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
+ // Fall through
+ case X86::ATOMUMIN8:
+ case X86::ATOMUMIN16:
+ case X86::ATOMUMIN32:
+ case X86::ATOMUMIN64:
+ return EmitAtomicLoadArith(MI, BB);
// This group does 64-bit operations on a 32-bit host.
case X86::ATOMAND6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::AND32rr, X86::AND32rr,
- X86::AND32ri, X86::AND32ri,
- false);
case X86::ATOMOR6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::OR32rr, X86::OR32rr,
- X86::OR32ri, X86::OR32ri,
- false);
case X86::ATOMXOR6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::XOR32rr, X86::XOR32rr,
- X86::XOR32ri, X86::XOR32ri,
- false);
case X86::ATOMNAND6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::AND32rr, X86::AND32rr,
- X86::AND32ri, X86::AND32ri,
- true);
case X86::ATOMADD6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::ADD32rr, X86::ADC32rr,
- X86::ADD32ri, X86::ADC32ri,
- false);
case X86::ATOMSUB6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::SUB32rr, X86::SBB32rr,
- X86::SUB32ri, X86::SBB32ri,
- false);
+ case X86::ATOMMAX6432:
+ case X86::ATOMMIN6432:
+ case X86::ATOMUMAX6432:
+ case X86::ATOMUMIN6432:
case X86::ATOMSWAP6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::MOV32rr, X86::MOV32rr,
- X86::MOV32ri, X86::MOV32ri,
- false);
+ return EmitAtomicLoadArith6432(MI, BB);
+
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
case X86::VAARG_64:
return EmitVAARG64WithCustomInserter(MI, BB);
+
+ case X86::EH_SjLj_SetJmp32:
+ case X86::EH_SjLj_SetJmp64:
+ return emitEHSjLjSetJmp(MI, BB);
+
+ case X86::EH_SjLj_LongJmp32:
+ case X86::EH_SjLj_LongJmp64:
+ return emitEHSjLjLongJmp(MI, BB);
}
}
@@ -13333,12 +14267,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
}
-/// DCI, PerformTruncateCombine - Converts truncate operation to
+/// PerformTruncateCombine - Converts truncate operation to
/// a sequence of vector shuffle operations.
/// It is possible when we truncate 256-bit vector to 128-bit vector
-
-SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
- DAGCombinerInfo &DCI) const {
+static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
if (!DCI.isBeforeLegalizeOps())
return SDValue();
@@ -13530,7 +14464,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
// alignment is valid.
unsigned Align = LN0->getAlignment();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned NewAlign = TLI.getTargetData()->
+ unsigned NewAlign = TLI.getDataLayout()->
getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
@@ -13561,6 +14495,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return NewOp;
SDValue InputVector = N->getOperand(0);
+ // Detect whether we are trying to convert from mmx to i32 and the bitcast
+ // from mmx to v2i32 has a single usage.
+ if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
+ InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
+ InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(),
+ N->getValueType(0),
+ InputVector.getNode()->getOperand(0));
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
@@ -13961,7 +14903,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
//
// where Op could be BRCOND or CMOV.
//
-static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// Quit if not CMP and SUB with its value result used.
if (Cmp.getOpcode() != X86ISD::CMP &&
(Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
@@ -13997,40 +14939,55 @@ static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
SetCC = SetCC.getOperand(0);
- // Quit if not SETCC.
- // FIXME: So far we only handle the boolean value generated from SETCC. If
- // there is other ways to generate boolean values, we need handle them here
- // as well.
- if (SetCC.getOpcode() != X86ISD::SETCC)
- return SDValue();
-
- // Set the condition code or opposite one if necessary.
- CC = X86::CondCode(SetCC.getConstantOperandVal(0));
- if (needOppositeCond)
- CC = X86::GetOppositeBranchCondition(CC);
-
- return SetCC.getOperand(1);
-}
-
-static bool IsValidFCMOVCondition(X86::CondCode CC) {
- switch (CC) {
- default:
- return false;
- case X86::COND_B:
- case X86::COND_BE:
- case X86::COND_E:
- case X86::COND_P:
- case X86::COND_AE:
- case X86::COND_A:
- case X86::COND_NE:
- case X86::COND_NP:
- return true;
+ switch (SetCC.getOpcode()) {
+ case X86ISD::SETCC:
+ // Set the condition code or opposite one if necessary.
+ CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+ return SetCC.getOperand(1);
+ case X86ISD::CMOV: {
+ // Check whether false/true value has canonical one, i.e. 0 or 1.
+ ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
+ ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
+ // Quit if true value is not a constant.
+ if (!TVal)
+ return SDValue();
+ // Quit if false value is not a constant.
+ if (!FVal) {
+ // A special case for rdrand, where 0 is set if false cond is found.
+ SDValue Op = SetCC.getOperand(0);
+ if (Op.getOpcode() != X86ISD::RDRAND)
+ return SDValue();
+ }
+ // Quit if false value is not the constant 0 or 1.
+ bool FValIsFalse = true;
+ if (FVal && FVal->getZExtValue() != 0) {
+ if (FVal->getZExtValue() != 1)
+ return SDValue();
+ // If FVal is 1, opposite cond is needed.
+ needOppositeCond = !needOppositeCond;
+ FValIsFalse = false;
+ }
+ // Quit if TVal is not the constant opposite of FVal.
+ if (FValIsFalse && TVal->getZExtValue() != 1)
+ return SDValue();
+ if (!FValIsFalse && TVal->getZExtValue() != 0)
+ return SDValue();
+ CC = X86::CondCode(SetCC.getConstantOperandVal(2));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+ return SetCC.getOperand(3);
+ }
}
+
+ return SDValue();
}
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
// If the flag operand isn't dead, don't touch this CMOV.
@@ -14055,10 +15012,10 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
SDValue Flags;
- Flags = BoolTestSetCCCombine(Cond, CC);
+ Flags = checkBoolTestSetCCCombine(Cond, CC);
if (Flags.getNode() &&
// Extra check as FCMOV only supports a subset of X86 cond.
- (FalseOp.getValueType() != MVT::f80 || IsValidFCMOVCondition(CC))) {
+ (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
SDValue Ops[] = { FalseOp, TrueOp,
DAG.getConstant(CC, MVT::i8), Flags };
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
@@ -14075,6 +15032,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueC, FalseC);
+ std::swap(TrueOp, FalseOp);
}
// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
@@ -14157,6 +15115,46 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
}
}
}
+
+ // Handle these cases:
+ // (select (x != c), e, c) -> select (x != c), e, x),
+ // (select (x == c), c, e) -> select (x == c), x, e)
+ // where the c is an integer constant, and the "select" is the combination
+ // of CMOV and CMP.
+ //
+ // The rationale for this change is that the conditional-move from a constant
+ // needs two instructions, however, conditional-move from a register needs
+ // only one instruction.
+ //
+ // CAVEAT: By replacing a constant with a symbolic value, it may obscure
+ // some instruction-combining opportunities. This opt needs to be
+ // postponed as late as possible.
+ //
+ if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
+ // the DCI.xxxx conditions are provided to postpone the optimization as
+ // late as possible.
+
+ ConstantSDNode *CmpAgainst = 0;
+ if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
+ (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
+ dyn_cast<ConstantSDNode>(Cond.getOperand(0)) == 0) {
+
+ if (CC == X86::COND_NE &&
+ CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
+ CC = X86::GetOppositeBranchCondition(CC);
+ std::swap(TrueOp, FalseOp);
+ }
+
+ if (CC == X86::COND_E &&
+ CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
+ SDValue Ops[] = { FalseOp, Cond.getOperand(0),
+ DAG.getConstant(CC, MVT::i8), Cond };
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops,
+ array_lengthof(Ops));
+ }
+ }
+ }
+
return SDValue();
}
@@ -14813,11 +15811,11 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
ISD::LoadExtType Ext = Ld->getExtensionType();
// If this is a vector EXT Load then attempt to optimize it using a
- // shuffle. We need SSE4 for the shuffles.
+ // shuffle. We need SSSE3 shuffles.
// TODO: It is possible to support ZExt by zeroing the undef values
// during the shuffle phase or after the shuffle.
if (RegVT.isVector() && RegVT.isInteger() &&
- Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
+ Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) {
assert(MemVT != RegVT && "Cannot extend to the same type");
assert(MemVT.isVector() && "Must load a vector from memory");
@@ -15043,7 +16041,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
const Function *F = DAG.getMachineFunction().getFunction();
- bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
+ bool NoImplicitFloatOps = F->getFnAttributes().
+ hasAttribute(Attributes::NoImplicitFloat);
bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
&& Subtarget->hasSSE2();
if ((VT.isVector() ||
@@ -15315,6 +16314,29 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+/// PerformFMinFMaxCombine - Target-specific DAG combine for X86ISD::FMIN and
+/// X86ISD::FMAX nodes.  Under unsafe FP math the node is rebuilt with the
+/// commutative opcode (FMIN -> FMINC, FMAX -> FMAXC) on the same two operands;
+/// otherwise an empty SDValue is returned and the node is left untouched.
+static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
+
+  // The commutative forms are only substituted when UnsafeFPMath is set.
+  if (!DAG.getTarget().Options.UnsafeFPMath)
+    return SDValue();
+
+  // Map the opcode onto its commutative counterpart.
+  const unsigned OldOpc = N->getOpcode();
+  unsigned CommutativeOpc = 0;
+  if (OldOpc == X86ISD::FMIN)
+    CommutativeOpc = X86ISD::FMINC;
+  else if (OldOpc == X86ISD::FMAX)
+    CommutativeOpc = X86ISD::FMAXC;
+  else
+    llvm_unreachable("unknown opcode");
+
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  return DAG.getNode(CommutativeOpc, N->getDebugLoc(), N->getValueType(0),
+                     LHS, RHS);
+}
+
+
/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
// FAND(0.0, x) -> 0.0
@@ -15420,8 +16442,13 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
DebugLoc dl = N->getDebugLoc();
EVT VT = N->getValueType(0);
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
EVT ScalarVT = VT.getScalarType();
- if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA())
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+ (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
return SDValue();
SDValue A = N->getOperand(0);
@@ -15443,9 +16470,10 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
unsigned Opcode;
if (!NegMul)
- Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB;
+ Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
else
- Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB;
+ Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
return DAG.getNode(Opcode, dl, VT, A, B, C);
}
@@ -15542,24 +16570,51 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+// Helper for PerformSETCCCombine: materializes "setb reg" as "sbb reg,reg"
+// (X86ISD::SETCC_CARRY on COND_B) masked down to its low bit.  The
+// SETCC_CARRY form can be extended without a zext and yields an all-ones
+// value, which is more useful than 0/1 in some cases.
+//
+// DL is the debug location given to both new nodes; EFLAGS is the flag value
+// feeding the carry test.  Returns the i8 AND node.
+static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
+  SDValue CarryCond = DAG.getConstant(X86::COND_B, MVT::i8);
+  SDValue CarryValue =
+      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CarryCond, EFLAGS);
+  SDValue LowBit = DAG.getConstant(1, MVT::i8);
+  return DAG.getNode(ISD::AND, DL, MVT::i8, CarryValue, LowBit);
+}
+
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
-static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
+ if (CC == X86::COND_A) {
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(),
+ EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ return MaterializeSETB(DL, NewEFLAGS, DAG);
+ }
+ }
+
// Materialize "setb reg" as "sbb reg,reg", since it can be extended without
// a zext and produces an all-ones bit which is more useful than 0/1 in some
// cases.
if (CC == X86::COND_B)
- return DAG.getNode(ISD::AND, DL, MVT::i8,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(CC, MVT::i8), EFLAGS),
- DAG.getConstant(1, MVT::i8));
+ return MaterializeSETB(DL, EFLAGS, DAG);
SDValue Flags;
- Flags = BoolTestSetCCCombine(EFLAGS, CC);
+ Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
@@ -15581,7 +16636,7 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
SDValue Flags;
- Flags = BoolTestSetCCCombine(EFLAGS, CC);
+ Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
@@ -15591,23 +16646,6 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) {
- SDValue Op0 = N->getOperand(0);
- EVT InVT = Op0->getValueType(0);
-
- // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32))
- if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
- DebugLoc dl = N->getDebugLoc();
- MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
- SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
- // Notice that we use SINT_TO_FP because we know that the high bits
- // are zero and SINT_TO_FP is better supported by the hardware.
- return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
- }
-
- return SDValue();
-}
-
static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
const X86TargetLowering *XTLI) {
SDValue Op0 = N->getOperand(0);
@@ -15639,20 +16677,6 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
-
- // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT()
- if (VT == MVT::v8i8 || VT == MVT::v4i8) {
- DebugLoc dl = N->getDebugLoc();
- MVT DstVT = VT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
- SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0));
- return DAG.getNode(ISD::TRUNCATE, dl, VT, I);
- }
-
- return SDValue();
-}
-
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
X86TargetLowering::DAGCombinerInfo &DCI) {
@@ -15767,6 +16791,21 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
return OptimizeConditionalInDecrement(N, DAG);
}
+/// performVZEXTCombine - Folds a VZEXT whose operand is (possibly through a
+/// chain of bitcasts) another VZEXT:
+///   (vzext (bitcast (vzext x))) -> (vzext x)
+/// Returns an empty SDValue when the pattern does not match.  DCI and
+/// Subtarget are not consulted by this combine.
+static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget *Subtarget) {
+  // Look through any number of bitcasts on the operand.
+  SDValue Src = N->getOperand(0);
+  while (Src.getOpcode() == ISD::BITCAST)
+    Src = Src.getOperand(0);
+
+  // If what remains is itself a VZEXT, extend its input directly.
+  if (Src.getOpcode() == X86ISD::VZEXT)
+    return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0),
+                       Src.getOperand(0));
+
+  return SDValue();
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -15776,7 +16815,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
case ISD::VSELECT:
case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
- case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
+ case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
@@ -15789,23 +16828,24 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
- case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
- case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return PerformFORCombine(N, DAG);
+ case X86ISD::FMIN:
+ case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
case X86ISD::FAND: return PerformFANDCombine(N, DAG);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
- case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI);
+ case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
- case X86ISD::SETCC: return PerformSETCCCombine(N, DAG);
+ case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
+ case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::PALIGN:
case X86ISD::UNPCKH:
@@ -16233,7 +17273,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'K':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
- if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
+ if (isInt<8>(C->getSExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
break;
}
@@ -16558,3 +17598,207 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
return Res;
}
+
+//===----------------------------------------------------------------------===//
+//
+// X86 cost model.
+//
+//===----------------------------------------------------------------------===//
+
+struct X86CostTblEntry { // One row of a cost table searched by FindInTable.
+ int ISD; // Opcode key; matched against the InstructionOpcodeToISD() result.
+ MVT Type; // Type key; matched against the legalized value type.
+ unsigned Cost; // Cost per legalized op; callers scale it by LT.first.
+};
+
+/// FindInTable - Linear search of a cost table for the row whose opcode and
+/// type both match.  Tbl points at the first of len rows.  Returns the index
+/// of the first matching row, or -1 when no row matches.
+static int FindInTable(const X86CostTblEntry *Tbl, unsigned len, int ISD,
+                       MVT Ty) {
+  for (unsigned Row = 0; Row != len; ++Row) {
+    const X86CostTblEntry &Entry = Tbl[Row];
+    if (Entry.ISD == ISD && Entry.Type == Ty)
+      return Row;
+  }
+
+  // No row matched.
+  return -1;
+}
+
+struct X86TypeConversionCostTblEntry { // Row searched by FindInConvertTable.
+ int ISD; // Opcode key; matched against the InstructionOpcodeToISD() result.
+ MVT Dst; // Destination simple value type of the conversion.
+ MVT Src; // Source simple value type of the conversion.
+ unsigned Cost; // Cost returned unscaled by getCastInstrCost on a match.
+};
+
+/// FindInConvertTable - Linear search of a conversion cost table for the row
+/// whose opcode, destination type and source type all match.  Tbl points at
+/// the first of len rows.  Returns the index of the first matching row, or -1
+/// when no row matches.
+static int FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl,
+                              unsigned len, int ISD, MVT Dst, MVT Src) {
+  for (unsigned Row = 0; Row != len; ++Row) {
+    const X86TypeConversionCostTblEntry &Entry = Tbl[Row];
+    if (Entry.ISD == ISD && Entry.Src == Src && Entry.Dst == Dst)
+      return Row;
+  }
+
+  // No row matched.
+  return -1;
+}
+
+/// getArithmeticInstrCost - Returns the cost of the arithmetic instruction
+/// Opcode on type Ty.  On AVX1-only targets, 256-bit integer add/sub/mul are
+/// costed from AVX1CostTable, scaled by the number of legalized parts; every
+/// other case is delegated to VectorTargetTransformImpl.
+unsigned
+X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
+                                                     Type *Ty) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Ty);
+
+  int ISD = InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+  static const X86CostTblEntry AVX1CostTable[] = {
+    // We don't have to scalarize unsupported ops. We can issue two half-sized
+    // operations and we only need to extract the upper YMM half.
+    // Two ops + 1 extract + 1 insert = 4.
+    { ISD::MUL, MVT::v8i32, 4 },
+    { ISD::SUB, MVT::v8i32, 4 },
+    { ISD::ADD, MVT::v8i32, 4 },
+    { ISD::MUL, MVT::v4i64, 4 },
+    { ISD::SUB, MVT::v4i64, 4 },
+    { ISD::ADD, MVT::v4i64, 4 },
+  };
+
+  // Look for AVX1 lowering tricks.  hasAVX() also holds on AVX2 targets, which
+  // do not need the split-into-halves sequence costed above, so the table is
+  // restricted to AVX1-only subtargets.
+  if (ST.hasAVX() && !ST.hasAVX2()) {
+    int Idx = FindInTable(AVX1CostTable, array_lengthof(AVX1CostTable), ISD,
+                          LT.second);
+    if (Idx != -1)
+      return LT.first * AVX1CostTable[Idx].Cost;
+  }
+
+  // Fall back to the default implementation.
+  return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
+}
+
+/// getVectorInstrCost - Returns the cost of the vector element instruction
+/// Opcode on vector type Val at lane Index (-1U means no specific lane).
+/// Returns 0 when the type legalizes to a scalar, or when a floating point
+/// element is accessed at lane 0 of the legalized vector; otherwise the
+/// default implementation is consulted with the normalized lane index.
+unsigned
+X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                                 unsigned Index) const {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  // Without a concrete lane there is nothing to specialize.
+  if (Index == -1U)
+    return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
+
+  // Legalize the type.
+  MVT LegalVT = getTypeLegalizationCost(Val).second;
+
+  // This type is legalized to a scalar type.
+  if (!LegalVT.isVector())
+    return 0;
+
+  // The type may be split. Normalize the index to the new type.
+  Index %= LegalVT.getVectorNumElements();
+
+  // Floating point scalars are already located in index #0.
+  const bool IsFPLaneZero =
+      Index == 0 && Val->getScalarType()->isFloatingPointTy();
+  if (IsFPLaneZero)
+    return 0;
+
+  return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
+}
+
+/// getCmpSelInstrCost - Returns the cost of the compare/select instruction
+/// Opcode on value type ValTy with condition type CondTy.  The legalized
+/// value type is looked up in per-feature-level cost tables; a matching cost
+/// is scaled by the number of legalized parts.  Anything not covered by the
+/// tables is delegated to VectorTargetTransformImpl.
+unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode,
+                                                          Type *ValTy,
+                                                          Type *CondTy) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = getTypeLegalizationCost(ValTy);
+
+  MVT MTy = LT.second;
+
+  int ISD = InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  const X86Subtarget &ST =
+      TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+  static const X86CostTblEntry SSE42CostTbl[] = {
+    { ISD::SETCC, MVT::v2f64, 1 },
+    { ISD::SETCC, MVT::v4f32, 1 },
+    { ISD::SETCC, MVT::v2i64, 1 },
+    { ISD::SETCC, MVT::v4i32, 1 },
+    { ISD::SETCC, MVT::v8i16, 1 },
+    { ISD::SETCC, MVT::v16i8, 1 },
+  };
+
+  static const X86CostTblEntry AVX1CostTbl[] = {
+    { ISD::SETCC, MVT::v4f64, 1 },
+    { ISD::SETCC, MVT::v8f32, 1 },
+    // AVX1 does not support 8-wide integer compare.
+    { ISD::SETCC, MVT::v4i64, 4 },
+    { ISD::SETCC, MVT::v8i32, 4 },
+    { ISD::SETCC, MVT::v16i16, 4 },
+    { ISD::SETCC, MVT::v32i8, 4 },
+  };
+
+  static const X86CostTblEntry AVX2CostTbl[] = {
+    { ISD::SETCC, MVT::v4i64, 1 },
+    { ISD::SETCC, MVT::v8i32, 1 },
+    { ISD::SETCC, MVT::v16i16, 1 },
+    { ISD::SETCC, MVT::v32i8, 1 },
+  };
+
+  // Probe the tables from the most capable feature level downwards.  Each
+  // level implies the ones below it, and every AVX2CostTbl key also appears
+  // in AVX1CostTbl, so probing AVX1 first would always shadow the cheaper
+  // AVX2 costs for 256-bit integer compares.
+  if (ST.hasAVX2()) {
+    int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+    if (Idx != -1)
+      return LT.first * AVX2CostTbl[Idx].Cost;
+  }
+
+  if (ST.hasAVX()) {
+    int Idx = FindInTable(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
+    if (Idx != -1)
+      return LT.first * AVX1CostTbl[Idx].Cost;
+  }
+
+  if (ST.hasSSE42()) {
+    int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+    if (Idx != -1)
+      return LT.first * SSE42CostTbl[Idx].Cost;
+  }
+
+  return VectorTargetTransformImpl::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+/// getCastInstrCost - Returns the cost of the cast instruction Opcode from
+/// type Src to type Dst.  When both types map to simple value types and the
+/// subtarget has AVX, the (opcode, Dst, Src) triple is looked up in
+/// AVXConversionTbl and a matching cost is returned as-is.  Every other case
+/// is delegated to VectorTargetTransformImpl.
+unsigned X86VectorTargetTransformInfo::getCastInstrCost(unsigned Opcode,
+                                                        Type *Dst,
+                                                        Type *Src) const {
+  // Rows are { ISD opcode, destination type, source type, cost }.
+  static const X86TypeConversionCostTblEntry AVXConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+    { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+    { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+    { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
+    { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
+    { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
+    { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
+    { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
+    { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
+    { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
+    { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
+    { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
+  };
+
+  int ISD = InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  EVT SrcTy = TLI->getValueType(Src);
+  EVT DstTy = TLI->getValueType(Dst);
+
+  // The table is keyed by simple value types, so only consult it when both
+  // ends of the conversion are simple.
+  if (SrcTy.isSimple() && DstTy.isSimple()) {
+    const X86Subtarget &ST =
+        TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+    if (ST.hasAVX()) {
+      int Row = FindInConvertTable(AVXConversionTbl,
+                                   array_lengthof(AVXConversionTbl), ISD,
+                                   DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+      if (Row != -1)
+        return AVXConversionTbl[Row].Cost;
+    }
+  }
+
+  return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
+}
+