aboutsummaryrefslogtreecommitdiff
path: root/lib/Target/X86/X86ISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp537
1 files changed, 314 insertions, 223 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b89914f8893e..65486cf7f529 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4217,6 +4217,8 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::PSHUFLW:
case X86ISD::SHUFP:
case X86ISD::INSERTPS:
+ case X86ISD::EXTRQI:
+ case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -5554,6 +5556,24 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
+ case X86ISD::EXTRQI:
+ if (isa<ConstantSDNode>(N->getOperand(1)) &&
+ isa<ConstantSDNode>(N->getOperand(2))) {
+ int BitLen = N->getConstantOperandVal(1);
+ int BitIdx = N->getConstantOperandVal(2);
+ DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
+ IsUnary = true;
+ }
+ break;
+ case X86ISD::INSERTQI:
+ if (isa<ConstantSDNode>(N->getOperand(2)) &&
+ isa<ConstantSDNode>(N->getOperand(3))) {
+ int BitLen = N->getConstantOperandVal(2);
+ int BitIdx = N->getConstantOperandVal(3);
+ DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ }
+ break;
case X86ISD::UNPCKH:
DecodeUNPCKHMask(VT, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
@@ -9317,11 +9337,11 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getBitcast(VT, V);
}
-/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
-static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
- SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable,
- SelectionDAG &DAG) {
+// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
+// Remainder of lower half result is zero and upper half is all undef.
+static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx, const APInt &Zeroable) {
int Size = Mask.size();
int HalfSize = Size / 2;
assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
@@ -9329,120 +9349,133 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
// Upper half must be undefined.
if (!isUndefInRange(Mask, HalfSize, HalfSize))
- return SDValue();
+ return false;
- // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
- // Remainder of lower half result is zero and upper half is all undef.
- auto LowerAsEXTRQ = [&]() {
- // Determine the extraction length from the part of the
- // lower half that isn't zeroable.
- int Len = HalfSize;
- for (; Len > 0; --Len)
- if (!Zeroable[Len - 1])
- break;
- assert(Len > 0 && "Zeroable shuffle mask");
+ // Determine the extraction length from the part of the
+ // lower half that isn't zeroable.
+ int Len = HalfSize;
+ for (; Len > 0; --Len)
+ if (!Zeroable[Len - 1])
+ break;
+ assert(Len > 0 && "Zeroable shuffle mask");
- // Attempt to match first Len sequential elements from the lower half.
- SDValue Src;
- int Idx = -1;
- for (int i = 0; i != Len; ++i) {
- int M = Mask[i];
- if (M < 0)
- continue;
- SDValue &V = (M < Size ? V1 : V2);
- M = M % Size;
+ // Attempt to match first Len sequential elements from the lower half.
+ SDValue Src;
+ int Idx = -1;
+ for (int i = 0; i != Len; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ SDValue &V = (M < Size ? V1 : V2);
+ M = M % Size;
- // The extracted elements must start at a valid index and all mask
- // elements must be in the lower half.
- if (i > M || M >= HalfSize)
- return SDValue();
+ // The extracted elements must start at a valid index and all mask
+ // elements must be in the lower half.
+ if (i > M || M >= HalfSize)
+ return false;
- if (Idx < 0 || (Src == V && Idx == (M - i))) {
- Src = V;
- Idx = M - i;
- continue;
- }
- return SDValue();
+ if (Idx < 0 || (Src == V && Idx == (M - i))) {
+ Src = V;
+ Idx = M - i;
+ continue;
}
+ return false;
+ }
- if (Idx < 0)
- return SDValue();
+ if (!Src || Idx < 0)
+ return false;
- assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
- int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
- int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
- return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
- };
+ assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
+ BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ V1 = Src;
+ return true;
+}
+
+// INSERTQ: Extract lowest Len elements from lower half of second source and
+// insert over first source, starting at Idx.
+// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
+static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask, uint64_t &BitLen,
+ uint64_t &BitIdx) {
+ int Size = Mask.size();
+ int HalfSize = Size / 2;
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ // Upper half must be undefined.
+ if (!isUndefInRange(Mask, HalfSize, HalfSize))
+ return false;
+
+ for (int Idx = 0; Idx != HalfSize; ++Idx) {
+ SDValue Base;
+
+ // Attempt to match first source from mask before insertion point.
+ if (isUndefInRange(Mask, 0, Idx)) {
+ /* EMPTY */
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ Base = V1;
+ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ Base = V2;
+ } else {
+ continue;
+ }
- if (SDValue ExtrQ = LowerAsEXTRQ())
- return ExtrQ;
+ // Extend the extraction length looking to match both the insertion of
+ // the second source and the remaining elements of the first.
+ for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
+ SDValue Insert;
+ int Len = Hi - Idx;
- // INSERTQ: Extract lowest Len elements from lower half of second source and
- // insert over first source, starting at Idx.
- // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
- auto LowerAsInsertQ = [&]() {
- for (int Idx = 0; Idx != HalfSize; ++Idx) {
- SDValue Base;
+ // Match insertion.
+ if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
+ Insert = V1;
+ } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
+ Insert = V2;
+ } else {
+ continue;
+ }
- // Attempt to match first source from mask before insertion point.
- if (isUndefInRange(Mask, 0, Idx)) {
+ // Match the remaining elements of the lower half.
+ if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
/* EMPTY */
- } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
+ } else if ((!Base || (Base == V1)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
Base = V1;
- } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
+ } else if ((!Base || (Base == V2)) &&
+ isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
+ Size + Hi)) {
Base = V2;
} else {
continue;
}
- // Extend the extraction length looking to match both the insertion of
- // the second source and the remaining elements of the first.
- for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
- SDValue Insert;
- int Len = Hi - Idx;
-
- // Match insertion.
- if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
- Insert = V1;
- } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
- Insert = V2;
- } else {
- continue;
- }
-
- // Match the remaining elements of the lower half.
- if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
- /* EMPTY */
- } else if ((!Base || (Base == V1)) &&
- isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
- Base = V1;
- } else if ((!Base || (Base == V2)) &&
- isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
- Size + Hi)) {
- Base = V2;
- } else {
- continue;
- }
-
- // We may not have a base (first source) - this can safely be undefined.
- if (!Base)
- Base = DAG.getUNDEF(VT);
-
- int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
- int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
- return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
- DAG.getConstant(BitLen, DL, MVT::i8),
- DAG.getConstant(BitIdx, DL, MVT::i8));
- }
+ BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
+ BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
+ V1 = Base;
+ V2 = Insert;
+ return true;
}
+ }
- return SDValue();
- };
+ return false;
+}
+
+/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
+static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ SelectionDAG &DAG) {
+ uint64_t BitLen, BitIdx;
+ if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
+ return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
- if (SDValue InsertQ = LowerAsInsertQ())
- return InsertQ;
+ if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
+ return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
+ V2 ? V2 : DAG.getUNDEF(VT),
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
return SDValue();
}
@@ -22817,7 +22850,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
auto Builder = IRBuilder<>(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- auto SynchScope = AI->getSynchScope();
+ auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
// ReleaseAcquire orderings.
auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
@@ -22839,7 +22872,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// otherwise, we might be able to be more aggressive on relaxed idempotent
// rmw. In practice, they do not look useful, so we don't try to be
// especially clever.
- if (SynchScope == SingleThread)
+ if (SSID == SyncScope::SingleThread)
// FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
// the IR level, so we must wrap it in an intrinsic.
return nullptr;
@@ -22858,7 +22891,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// Finally we can emit the atomic load.
LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
AI->getType()->getPrimitiveSizeInBits());
- Loaded->setAtomic(Order, SynchScope);
+ Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
return Loaded;
@@ -22869,13 +22902,13 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
- SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
+ SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
// The only fence that needs an instruction is a sequentially-consistent
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
- FenceScope == CrossThread) {
+ FenceSSID == SyncScope::System) {
if (Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
@@ -23203,6 +23236,20 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
SDLoc DL(Op.getNode());
SDValue Op0 = Op.getOperand(0);
+ // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
+ if (Subtarget.hasVPOPCNTDQ()) {
+ if (VT == MVT::v8i16) {
+ Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
+ Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
+ }
+ if (VT == MVT::v16i8 || VT == MVT::v16i16) {
+ Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
+ Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
+ return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
+ }
+ }
+
if (!Subtarget.hasSSSE3()) {
// We can't use the fast LUT approach, so fall back on vectorized bitmath.
assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
@@ -27101,6 +27148,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ const APInt &Zeroable,
bool AllowFloatDomain,
bool AllowIntDomain,
const X86Subtarget &Subtarget,
@@ -27111,38 +27159,67 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
- bool ContainsZeros = false;
- APInt Zeroable(NumMaskElts, false);
- for (unsigned i = 0; i != NumMaskElts; ++i) {
- int M = Mask[i];
- if (isUndefOrZero(M))
- Zeroable.setBit(i);
- ContainsZeros |= (M == SM_SentinelZero);
- }
+ bool ContainsZeros =
+ llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
- // Attempt to match against byte/bit shifts.
- // FIXME: Add 512-bit support.
- if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
- int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
- MaskScalarSizeInBits, Mask,
- 0, Zeroable, Subtarget);
- if (0 < ShiftAmt) {
- PermuteImm = (unsigned)ShiftAmt;
+ // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
+ if (!ContainsZeros && MaskScalarSizeInBits == 64) {
+ // Check for lane crossing permutes.
+ if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
+ // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
+ if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
+ PermuteImm = getV4X86ShuffleImm(Mask);
+ return true;
+ }
+ if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
+ Shuffle = X86ISD::VPERMI;
+ ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
+ PermuteImm = getV4X86ShuffleImm(RepeatedMask);
+ return true;
+ }
+ }
+ } else if (AllowFloatDomain && Subtarget.hasAVX()) {
+ // VPERMILPD can permute with a non-repeating shuffle.
+ Shuffle = X86ISD::VPERMILPI;
+ ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
+ PermuteImm = 0;
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ int M = Mask[i];
+ if (M == SM_SentinelUndef)
+ continue;
+ assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
+ PermuteImm |= (M & 1) << i;
+ }
return true;
}
}
- // Ensure we don't contain any zero elements.
- if (ContainsZeros)
- return false;
-
- assert(llvm::all_of(Mask, [&](int M) {
- return SM_SentinelUndef <= M && M < (int)NumMaskElts;
- }) && "Expected unary shuffle");
+ // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
+ // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
+ // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
+ if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
+ !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
+ SmallVector<int, 4> RepeatedMask;
+ if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
+ // Narrow the repeated mask to create 32-bit element permutes.
+ SmallVector<int, 4> WordMask = RepeatedMask;
+ if (MaskScalarSizeInBits == 64)
+ scaleShuffleMask(2, RepeatedMask, WordMask);
+
+ Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
+ ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
+ ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
+ PermuteImm = getV4X86ShuffleImm(WordMask);
+ return true;
+ }
+ }
- // Handle PSHUFLW/PSHUFHW repeated patterns.
- if (MaskScalarSizeInBits == 16) {
+ // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
SmallVector<int, 4> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
ArrayRef<int> LoMask(Mask.data() + 0, 4);
@@ -27170,78 +27247,23 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
return true;
}
-
- return false;
}
- return false;
- }
-
- // We only support permutation of 32/64 bit elements after this.
- if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
- return false;
-
- // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
- // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
- if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
- return false;
-
- // Pre-AVX2 we must use float shuffles on 256-bit vectors.
- if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
- AllowFloatDomain = true;
- AllowIntDomain = false;
}
- // Check for lane crossing permutes.
- if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
- // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
- if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
- Shuffle = X86ISD::VPERMI;
- ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
- PermuteImm = getV4X86ShuffleImm(Mask);
+ // Attempt to match against byte/bit shifts.
+ // FIXME: Add 512-bit support.
+ if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
+ MaskScalarSizeInBits, Mask,
+ 0, Zeroable, Subtarget);
+ if (0 < ShiftAmt) {
+ PermuteImm = (unsigned)ShiftAmt;
return true;
}
- if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
- SmallVector<int, 4> RepeatedMask;
- if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
- Shuffle = X86ISD::VPERMI;
- ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
- PermuteImm = getV4X86ShuffleImm(RepeatedMask);
- return true;
- }
- }
- return false;
}
- // VPERMILPD can permute with a non-repeating shuffle.
- if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
- Shuffle = X86ISD::VPERMILPI;
- ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
- PermuteImm = 0;
- for (int i = 0, e = Mask.size(); i != e; ++i) {
- int M = Mask[i];
- if (M == SM_SentinelUndef)
- continue;
- assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
- PermuteImm |= (M & 1) << i;
- }
- return true;
- }
-
- // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
- SmallVector<int, 4> RepeatedMask;
- if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
- return false;
-
- // Narrow the repeated mask for 32-bit element permutes.
- SmallVector<int, 4> WordMask = RepeatedMask;
- if (MaskScalarSizeInBits == 64)
- scaleShuffleMask(2, RepeatedMask, WordMask);
-
- Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
- ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
- ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
- PermuteImm = getV4X86ShuffleImm(WordMask);
- return true;
+ return false;
}
// Attempt to match a combined unary shuffle mask against supported binary
@@ -27303,6 +27325,7 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
+ const APInt &Zeroable,
bool AllowFloatDomain,
bool AllowIntDomain,
SDValue &V1, SDValue &V2, SDLoc &DL,
@@ -27388,11 +27411,6 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// Attempt to combine to INSERTPS.
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
MaskVT.is128BitVector()) {
- APInt Zeroable(4, 0);
- for (unsigned i = 0; i != NumMaskElts; ++i)
- if (Mask[i] < 0)
- Zeroable.setBit(i);
-
if (Zeroable.getBoolValue() &&
matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
@@ -27578,7 +27596,14 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
bool AllowFloatDomain = FloatDomain || (Depth > 3);
- bool AllowIntDomain = !FloatDomain || (Depth > 3);
+ bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
+ (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
+
+ // Determine zeroable mask elements.
+ APInt Zeroable(NumMaskElts, 0);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (isUndefOrZero(Mask[i]))
+ Zeroable.setBit(i);
if (UnaryShuffle) {
// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
@@ -27612,7 +27637,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
- if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
+ if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, Subtarget, Shuffle,
ShuffleVT, PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
@@ -27648,7 +27673,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
- if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
+ if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, V1, V2, DL, DAG,
Subtarget, Shuffle, ShuffleVT,
PermuteImm)) {
@@ -27668,6 +27693,45 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return true;
}
+ // Typically from here on, we need an integer version of MaskVT.
+ MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
+ IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
+
+ // Annoyingly, SSE4A instructions don't map into the above match helpers.
+ if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
+ uint64_t BitLen, BitIdx;
+ if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
+ Zeroable)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
+ return false; // Nothing to do!
+ V1 = DAG.getBitcast(IntMaskVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+
+ if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
+ if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
+ return false; // Nothing to do!
+ V1 = DAG.getBitcast(IntMaskVT, V1);
+ DCI.AddToWorklist(V1.getNode());
+ V2 = DAG.getBitcast(IntMaskVT, V2);
+ DCI.AddToWorklist(V2.getNode());
+ Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
+ DAG.getConstant(BitLen, DL, MVT::i8),
+ DAG.getConstant(BitIdx, DL, MVT::i8));
+ DCI.AddToWorklist(Res.getNode());
+ DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
+ /*AddTo*/ true);
+ return true;
+ }
+ }
+
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 2)
@@ -27688,9 +27752,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
- MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
- MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
- SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
@@ -27719,9 +27781,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (Mask[i] == SM_SentinelZero)
Mask[i] = NumMaskElts + i;
- MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
- MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
- SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
@@ -27746,9 +27806,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
- MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
- MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
- SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
+ SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPermMask.getNode());
V1 = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(V1.getNode());
@@ -27807,8 +27865,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
VPermIdx.push_back(Idx);
}
- MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
- SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
+ SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
DCI.AddToWorklist(VPermMask.getNode());
Res = DAG.getBitcast(MaskVT, V1);
DCI.AddToWorklist(Res.getNode());
@@ -27831,8 +27888,6 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned NumLanes = MaskVT.getSizeInBits() / 128;
unsigned NumEltsPerLane = NumMaskElts / NumLanes;
SmallVector<int, 8> VPerm2Idx;
- MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
- MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
unsigned M2ZImm = 0;
for (int M : Mask) {
if (M == SM_SentinelUndef) {
@@ -27852,7 +27907,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(MaskVT, V2);
DCI.AddToWorklist(V2.getNode());
- SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
+ SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
DCI.AddToWorklist(VPerm2MaskOp.getNode());
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getConstant(M2ZImm, DL, MVT::i8));
@@ -29163,9 +29218,9 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// v8i16 and v16i16.
// For these two cases, we can shuffle the upper element bytes to a
// consecutive sequence at the start of the vector and treat the results as
- // v16i8 or v32i8, and for v61i8 this is the prefferable solution. However,
+ // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
// for v16i16 this is not the case, because the shuffle is expensive, so we
- // avoid sign-exteding to this type entirely.
+ // avoid sign-extending to this type entirely.
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
// (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
MVT SExtVT;
@@ -29207,7 +29262,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SExtVT = MVT::v16i8;
// For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
// it is not profitable to sign-extend to 256-bit because this will
- // require an extra cross-lane shuffle which is more exprensive than
+ // require an extra cross-lane shuffle which is more expensive than
// truncating the result of the compare to 128-bits.
break;
case MVT::v32i1:
@@ -29580,8 +29635,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
// (extends the sign bit which is zero).
// So it is correct to skip the sign/zero extend instruction.
if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
- Root.getOpcode() == ISD::ZERO_EXTEND ||
- Root.getOpcode() == ISD::ANY_EXTEND))
+ Root.getOpcode() == ISD::ZERO_EXTEND ||
+ Root.getOpcode() == ISD::ANY_EXTEND))
Root = Root.getOperand(0);
// If there was a match, we want Root to be a select that is the root of an
@@ -34950,6 +35005,40 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
+ // If X is -1 or 0, then we have an opportunity to avoid constants required in
+ // the general case below.
+ auto *ConstantX = dyn_cast<ConstantSDNode>(X);
+ if (ConstantX) {
+ if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
+ (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
+ // This is a complicated way to get -1 or 0 from the carry flag:
+ // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
+ // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ Y.getOperand(1));
+ }
+
+ if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
+ (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
+ SDValue EFLAGS = Y->getOperand(1);
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ // Swap the operands of a SUB, and we have the same pattern as above.
+ // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
+ // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getConstant(X86::COND_B, DL, MVT::i8),
+ NewEFLAGS);
+ }
+ }
+ }
+
if (CC == X86::COND_B) {
// X + SETB Z --> X + (mask SBB Z, Z)
// X - SETB Z --> X - (mask SBB Z, Z)
@@ -34996,7 +35085,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
// If X is -1 or 0, then we have an opportunity to avoid constants required in
// the general case below.
- if (auto *ConstantX = dyn_cast<ConstantSDNode>(X)) {
+ if (ConstantX) {
// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
// fake operands:
// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
@@ -35549,6 +35638,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
+ case X86ISD::EXTRQI:
+ case X86ISD::INSERTQI:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ: