diff options
Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 537 |
1 files changed, 314 insertions, 223 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b89914f8893e..65486cf7f529 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -4217,6 +4217,8 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::INSERTPS: + case X86ISD::EXTRQI: + case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: @@ -5554,6 +5556,24 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; + case X86ISD::EXTRQI: + if (isa<ConstantSDNode>(N->getOperand(1)) && + isa<ConstantSDNode>(N->getOperand(2))) { + int BitLen = N->getConstantOperandVal(1); + int BitIdx = N->getConstantOperandVal(2); + DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask); + IsUnary = true; + } + break; + case X86ISD::INSERTQI: + if (isa<ConstantSDNode>(N->getOperand(2)) && + isa<ConstantSDNode>(N->getOperand(3))) { + int BitLen = N->getConstantOperandVal(2); + int BitIdx = N->getConstantOperandVal(3); + DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + } + break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); @@ -9317,11 +9337,11 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, return DAG.getBitcast(VT, V); } -/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. -static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - SelectionDAG &DAG) { +// EXTRQ: Extract Len elements from lower half of source, starting at Idx. +// Remainder of lower half result is zero and upper half is all undef. 
+static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask, uint64_t &BitLen, + uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); @@ -9329,120 +9349,133 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) - return SDValue(); + return false; - // EXTRQ: Extract Len elements from lower half of source, starting at Idx. - // Remainder of lower half result is zero and upper half is all undef. - auto LowerAsEXTRQ = [&]() { - // Determine the extraction length from the part of the - // lower half that isn't zeroable. - int Len = HalfSize; - for (; Len > 0; --Len) - if (!Zeroable[Len - 1]) - break; - assert(Len > 0 && "Zeroable shuffle mask"); + // Determine the extraction length from the part of the + // lower half that isn't zeroable. + int Len = HalfSize; + for (; Len > 0; --Len) + if (!Zeroable[Len - 1]) + break; + assert(Len > 0 && "Zeroable shuffle mask"); - // Attempt to match first Len sequential elements from the lower half. - SDValue Src; - int Idx = -1; - for (int i = 0; i != Len; ++i) { - int M = Mask[i]; - if (M < 0) - continue; - SDValue &V = (M < Size ? V1 : V2); - M = M % Size; + // Attempt to match first Len sequential elements from the lower half. + SDValue Src; + int Idx = -1; + for (int i = 0; i != Len; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + SDValue &V = (M < Size ? V1 : V2); + M = M % Size; - // The extracted elements must start at a valid index and all mask - // elements must be in the lower half. - if (i > M || M >= HalfSize) - return SDValue(); + // The extracted elements must start at a valid index and all mask + // elements must be in the lower half. 
+ if (i > M || M >= HalfSize) + return false; - if (Idx < 0 || (Src == V && Idx == (M - i))) { - Src = V; - Idx = M - i; - continue; - } - return SDValue(); + if (Idx < 0 || (Src == V && Idx == (M - i))) { + Src = V; + Idx = M - i; + continue; } + return false; + } - if (Idx < 0) - return SDValue(); + if (!Src || Idx < 0) + return false; - assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); - int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; - int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; - return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); - }; + assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); + BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; + BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; + V1 = Src; + return true; +} + +// INSERTQ: Extract lowest Len elements from lower half of second source and +// insert over first source, starting at Idx. +// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } +static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask, uint64_t &BitLen, + uint64_t &BitIdx) { + int Size = Mask.size(); + int HalfSize = Size / 2; + assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); + + // Upper half must be undefined. + if (!isUndefInRange(Mask, HalfSize, HalfSize)) + return false; + + for (int Idx = 0; Idx != HalfSize; ++Idx) { + SDValue Base; + + // Attempt to match first source from mask before insertion point. + if (isUndefInRange(Mask, 0, Idx)) { + /* EMPTY */ + } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { + Base = V1; + } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { + Base = V2; + } else { + continue; + } - if (SDValue ExtrQ = LowerAsEXTRQ()) - return ExtrQ; + // Extend the extraction length looking to match both the insertion of + // the second source and the remaining elements of the first. 
+ for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { + SDValue Insert; + int Len = Hi - Idx; - // INSERTQ: Extract lowest Len elements from lower half of second source and - // insert over first source, starting at Idx. - // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } - auto LowerAsInsertQ = [&]() { - for (int Idx = 0; Idx != HalfSize; ++Idx) { - SDValue Base; + // Match insertion. + if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { + Insert = V1; + } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { + Insert = V2; + } else { + continue; + } - // Attempt to match first source from mask before insertion point. - if (isUndefInRange(Mask, 0, Idx)) { + // Match the remaining elements of the lower half. + if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { /* EMPTY */ - } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { + } else if ((!Base || (Base == V1)) && + isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { Base = V1; - } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { + } else if ((!Base || (Base == V2)) && + isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, + Size + Hi)) { Base = V2; } else { continue; } - // Extend the extraction length looking to match both the insertion of - // the second source and the remaining elements of the first. - for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { - SDValue Insert; - int Len = Hi - Idx; - - // Match insertion. - if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { - Insert = V1; - } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { - Insert = V2; - } else { - continue; - } - - // Match the remaining elements of the lower half. 
- if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { - /* EMPTY */ - } else if ((!Base || (Base == V1)) && - isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { - Base = V1; - } else if ((!Base || (Base == V2)) && - isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, - Size + Hi)) { - Base = V2; - } else { - continue; - } - - // We may not have a base (first source) - this can safely be undefined. - if (!Base) - Base = DAG.getUNDEF(VT); - - int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; - int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; - return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); - } + BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; + BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; + V1 = Base; + V2 = Insert; + return true; } + } - return SDValue(); - }; + return false; +} + +/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. +static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + SelectionDAG &DAG) { + uint64_t BitLen, BitIdx; + if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) + return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); - if (SDValue InsertQ = LowerAsInsertQ()) - return InsertQ; + if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) + return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), + V2 ? 
V2 : DAG.getUNDEF(VT), + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); return SDValue(); } @@ -22817,7 +22850,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { auto Builder = IRBuilder<>(AI); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - auto SynchScope = AI->getSynchScope(); + auto SSID = AI->getSyncScopeID(); // We must restrict the ordering to avoid generating loads with Release or // ReleaseAcquire orderings. auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); @@ -22839,7 +22872,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. - if (SynchScope == SingleThread) + if (SSID == SyncScope::SingleThread) // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at // the IR level, so we must wrap it in an intrinsic. return nullptr; @@ -22858,7 +22891,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // Finally we can emit the atomic load. LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, AI->getType()->getPrimitiveSizeInBits()); - Loaded->setAtomic(Order, SynchScope); + Loaded->setAtomic(Order, SSID); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); return Loaded; @@ -22869,13 +22902,13 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); - SynchronizationScope FenceScope = static_cast<SynchronizationScope>( + SyncScope::ID FenceSSID = static_cast<SyncScope::ID>( cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. 
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && - FenceScope == CrossThread) { + FenceSSID == SyncScope::System) { if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); @@ -23203,6 +23236,20 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, SDLoc DL(Op.getNode()); SDValue Op0 = Op.getOperand(0); + // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions. + if (Subtarget.hasVPOPCNTDQ()) { + if (VT == MVT::v8i16) { + Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0); + Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op); + } + if (VT == MVT::v16i8 || VT == MVT::v16i16) { + Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0); + Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op); + } + } + if (!Subtarget.hasSSSE3()) { // We can't use the fast LUT approach, so fall back on vectorized bitmath. assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); @@ -27101,6 +27148,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. 
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, + const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const X86Subtarget &Subtarget, @@ -27111,38 +27159,67 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); - bool ContainsZeros = false; - APInt Zeroable(NumMaskElts, false); - for (unsigned i = 0; i != NumMaskElts; ++i) { - int M = Mask[i]; - if (isUndefOrZero(M)) - Zeroable.setBit(i); - ContainsZeros |= (M == SM_SentinelZero); - } + bool ContainsZeros = + llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); - // Attempt to match against byte/bit shifts. - // FIXME: Add 512-bit support. - if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { - int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, - MaskScalarSizeInBits, Mask, - 0, Zeroable, Subtarget); - if (0 < ShiftAmt) { - PermuteImm = (unsigned)ShiftAmt; + // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns. + if (!ContainsZeros && MaskScalarSizeInBits == 64) { + // Check for lane crossing permutes. + if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { + // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). + if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) { + Shuffle = X86ISD::VPERMI; + ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); + PermuteImm = getV4X86ShuffleImm(Mask); + return true; + } + if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) { + SmallVector<int, 4> RepeatedMask; + if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { + Shuffle = X86ISD::VPERMI; + ShuffleVT = (AllowFloatDomain ? 
MVT::v8f64 : MVT::v8i64); + PermuteImm = getV4X86ShuffleImm(RepeatedMask); + return true; + } + } + } else if (AllowFloatDomain && Subtarget.hasAVX()) { + // VPERMILPD can permute with a non-repeating shuffle. + Shuffle = X86ISD::VPERMILPI; + ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); + PermuteImm = 0; + for (int i = 0, e = Mask.size(); i != e; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); + PermuteImm |= (M & 1) << i; + } return true; } } - // Ensure we don't contain any zero elements. - if (ContainsZeros) - return false; - - assert(llvm::all_of(Mask, [&](int M) { - return SM_SentinelUndef <= M && M < (int)NumMaskElts; - }) && "Expected unary shuffle"); + // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns. + // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we + // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). + if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) && + !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) { + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { + // Narrow the repeated mask to create 32-bit element permutes. + SmallVector<int, 4> WordMask = RepeatedMask; + if (MaskScalarSizeInBits == 64) + scaleShuffleMask(2, RepeatedMask, WordMask); + + Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); + ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); + ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); + PermuteImm = getV4X86ShuffleImm(WordMask); + return true; + } + } - // Handle PSHUFLW/PSHUFHW repeated patterns. - if (MaskScalarSizeInBits == 16) { + // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns. 
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) { SmallVector<int, 4> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { ArrayRef<int> LoMask(Mask.data() + 0, 4); @@ -27170,78 +27247,23 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, PermuteImm = getV4X86ShuffleImm(OffsetHiMask); return true; } - - return false; } - return false; - } - - // We only support permutation of 32/64 bit elements after this. - if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64) - return false; - - // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we - // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). - if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX()) - return false; - - // Pre-AVX2 we must use float shuffles on 256-bit vectors. - if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) { - AllowFloatDomain = true; - AllowIntDomain = false; } - // Check for lane crossing permutes. - if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { - // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). - if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) { - Shuffle = X86ISD::VPERMI; - ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); - PermuteImm = getV4X86ShuffleImm(Mask); + // Attempt to match against byte/bit shifts. + // FIXME: Add 512-bit support. + if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, + MaskScalarSizeInBits, Mask, + 0, Zeroable, Subtarget); + if (0 < ShiftAmt) { + PermuteImm = (unsigned)ShiftAmt; return true; } - if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) { - SmallVector<int, 4> RepeatedMask; - if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { - Shuffle = X86ISD::VPERMI; - ShuffleVT = (AllowFloatDomain ? 
MVT::v8f64 : MVT::v8i64); - PermuteImm = getV4X86ShuffleImm(RepeatedMask); - return true; - } - } - return false; } - // VPERMILPD can permute with a non-repeating shuffle. - if (AllowFloatDomain && MaskScalarSizeInBits == 64) { - Shuffle = X86ISD::VPERMILPI; - ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); - PermuteImm = 0; - for (int i = 0, e = Mask.size(); i != e; ++i) { - int M = Mask[i]; - if (M == SM_SentinelUndef) - continue; - assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); - PermuteImm |= (M & 1) << i; - } - return true; - } - - // We need a repeating shuffle mask for VPERMILPS/PSHUFD. - SmallVector<int, 4> RepeatedMask; - if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) - return false; - - // Narrow the repeated mask for 32-bit element permutes. - SmallVector<int, 4> WordMask = RepeatedMask; - if (MaskScalarSizeInBits == 64) - scaleShuffleMask(2, RepeatedMask, WordMask); - - Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD); - ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32); - ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); - PermuteImm = getV4X86ShuffleImm(WordMask); - return true; + return false; } // Attempt to match a combined unary shuffle mask against supported binary @@ -27303,6 +27325,7 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, + const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, SDLoc &DL, @@ -27388,11 +27411,6 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // Attempt to combine to INSERTPS. 
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && MaskVT.is128BitVector()) { - APInt Zeroable(4, 0); - for (unsigned i = 0; i != NumMaskElts; ++i) - if (Mask[i] < 0) - Zeroable.setBit(i); - if (Zeroable.getBoolValue() && matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; @@ -27578,7 +27596,14 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. bool AllowFloatDomain = FloatDomain || (Depth > 3); - bool AllowIntDomain = !FloatDomain || (Depth > 3); + bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && + (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); + + // Determine zeroable mask elements. + APInt Zeroable(NumMaskElts, 0); + for (unsigned i = 0; i != NumMaskElts; ++i) + if (isUndefOrZero(Mask[i])) + Zeroable.setBit(i); if (UnaryShuffle) { // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load @@ -27612,7 +27637,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } - if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain, + if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) @@ -27648,7 +27673,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } - if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain, + if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm)) { @@ -27668,6 +27693,45 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } + // Typically from here on, we need an integer version of MaskVT. 
+ MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits); + IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts); + + // Annoyingly, SSE4A instructions don't map into the above match helpers. + if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) { + uint64_t BitLen, BitIdx; + if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, + Zeroable)) { + if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) + return false; // Nothing to do! + V1 = DAG.getBitcast(IntMaskVT, V1); + DCI.AddToWorklist(V1.getNode()); + Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + + if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { + if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) + return false; // Nothing to do! + V1 = DAG.getBitcast(IntMaskVT, V1); + DCI.AddToWorklist(V1.getNode()); + V2 = DAG.getBitcast(IntMaskVT, V2); + DCI.AddToWorklist(V2.getNode()); + Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + } + // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. 
if (Depth < 2) @@ -27688,9 +27752,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { - MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); - MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); - SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); @@ -27719,9 +27781,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (Mask[i] == SM_SentinelZero) Mask[i] = NumMaskElts + i; - MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); - MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); - SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); @@ -27746,9 +27806,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { - MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); - MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); - SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); V1 = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(V1.getNode()); @@ -27807,8 +27865,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, M < 0 ? 
DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32); VPermIdx.push_back(Idx); } - MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts); - SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx); + SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); @@ -27831,8 +27888,6 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, unsigned NumLanes = MaskVT.getSizeInBits() / 128; unsigned NumEltsPerLane = NumMaskElts / NumLanes; SmallVector<int, 8> VPerm2Idx; - MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits()); - MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts); unsigned M2ZImm = 0; for (int M : Mask) { if (M == SM_SentinelUndef) { @@ -27852,7 +27907,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(MaskVT, V2); DCI.AddToWorklist(V2.getNode()); - SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true); + SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPerm2MaskOp.getNode()); Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, DAG.getConstant(M2ZImm, DL, MVT::i8)); @@ -29163,9 +29218,9 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // v8i16 and v16i16. // For these two cases, we can shuffle the upper element bytes to a // consecutive sequence at the start of the vector and treat the results as - // v16i8 or v32i8, and for v61i8 this is the prefferable solution. However, + // v16i8 or v32i8, and for v16i8 this is the preferable solution. However, // for v16i16 this is not the case, because the shuffle is expensive, so we - // avoid sign-exteding to this type entirely. + // avoid sign-extending to this type entirely. 
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; @@ -29207,7 +29262,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, SExtVT = MVT::v16i8; // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)), // it is not profitable to sign-extend to 256-bit because this will - // require an extra cross-lane shuffle which is more exprensive than + // require an extra cross-lane shuffle which is more expensive than // truncating the result of the compare to 128-bits. break; case MVT::v32i1: @@ -29580,8 +29635,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // (extends the sign bit which is zero). // So it is correct to skip the sign/zero extend instruction. if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || - Root.getOpcode() == ISD::ZERO_EXTEND || - Root.getOpcode() == ISD::ANY_EXTEND)) + Root.getOpcode() == ISD::ZERO_EXTEND || + Root.getOpcode() == ISD::ANY_EXTEND)) Root = Root.getOperand(0); // If there was a match, we want Root to be a select that is the root of an @@ -34950,6 +35005,40 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0); + // If X is -1 or 0, then we have an opportunity to avoid constants required in + // the general case below. + auto *ConstantX = dyn_cast<ConstantSDNode>(X); + if (ConstantX) { + if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) || + (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) { + // This is a complicated way to get -1 or 0 from the carry flag: + // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax + // 0 - SETB --> 0 - (CF) --> CF ? 
-1 : 0 --> SBB %eax, %eax + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B, DL, MVT::i8), + Y.getOperand(1)); + } + + if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) || + (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) { + SDValue EFLAGS = Y->getOperand(1); + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { + // Swap the operands of a SUB, and we have the same pattern as above. + // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB + // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB + SDValue NewSub = DAG.getNode( + X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B, DL, MVT::i8), + NewEFLAGS); + } + } + } + if (CC == X86::COND_B) { // X + SETB Z --> X + (mask SBB Z, Z) // X - SETB Z --> X - (mask SBB Z, Z) @@ -34996,7 +35085,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { // If X is -1 or 0, then we have an opportunity to avoid constants required in // the general case below. - if (auto *ConstantX = dyn_cast<ConstantSDNode>(X)) { + if (ConstantX) { // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with // fake operands: // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) @@ -35549,6 +35638,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::INSERTPS: + case X86ISD::EXTRQI: + case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: |