Diffstat (limited to 'llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp  2559
1 file changed, 1846 insertions(+), 713 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0b630197911a..cc3f5c7d4b48 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -217,15 +218,18 @@ static bool allSameBlock(ArrayRef<Value *> VL) {
return true;
}
+/// \returns True if the value is a constant (but not globals/constant
+/// expressions).
+static bool isConstant(Value *V) {
+ return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V);
+}
+
/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
// Constant expressions and globals can't be vectorized like normal integer/FP
// constants.
- for (Value *i : VL)
- if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
- return false;
- return true;
+ return all_of(VL, isConstant);
}
/// \returns True if all of the values in \p VL are identical.
@@ -287,10 +291,11 @@ static bool isCommutative(Instruction *I) {
/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
/// ret <4 x i8> %ins4
/// InstCombiner transforms this into a shuffle and vector mul
+/// \p Mask is filled with the shuffle mask equivalent to the extracted elements.
/// TODO: Can we split off and reuse the shuffle mask detection from
/// TargetTransformInfo::getInstructionThroughput?
static Optional<TargetTransformInfo::ShuffleKind>
-isShuffle(ArrayRef<Value *> VL) {
+isShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
unsigned Size =
cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
@@ -308,9 +313,12 @@ isShuffle(ArrayRef<Value *> VL) {
if (!Idx)
return None;
// Undefined behavior if Idx is negative or >= Size.
- if (Idx->getValue().uge(Size))
+ if (Idx->getValue().uge(Size)) {
+ Mask.push_back(UndefMaskElem);
continue;
+ }
unsigned IntIdx = Idx->getValue().getZExtValue();
+ Mask.push_back(IntIdx);
// We can extractelement from undef or poison vector.
if (isa<UndefValue>(Vec))
continue;
@@ -538,6 +546,41 @@ static void inversePermutation(ArrayRef<unsigned> Indices,
Mask[Indices[I]] = I;
}
+/// \returns the insertion index of the InsertElement or InsertValue instruction,
+/// using \p Offset as the base index.
+static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) {
+ int Index = Offset;
+ if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
+ if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+ auto *VT = cast<FixedVectorType>(IE->getType());
+ if (CI->getValue().uge(VT->getNumElements()))
+ return UndefMaskElem;
+ Index *= VT->getNumElements();
+ Index += CI->getZExtValue();
+ return Index;
+ }
+ if (isa<UndefValue>(IE->getOperand(2)))
+ return UndefMaskElem;
+ return None;
+ }
+
+ auto *IV = cast<InsertValueInst>(InsertInst);
+ Type *CurrentType = IV->getType();
+ for (unsigned I : IV->indices()) {
+ if (auto *ST = dyn_cast<StructType>(CurrentType)) {
+ Index *= ST->getNumElements();
+ CurrentType = ST->getElementType(I);
+ } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
+ Index *= AT->getNumElements();
+ CurrentType = AT->getElementType();
+ } else {
+ return None;
+ }
+ Index += I;
+ }
+ return Index;
+}
+
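// --- Illustrative sketch (editor's addition, not part of this patch) ---
// A minimal standalone model of the index linearization performed by
// getInsertIndex above: each aggregate level contributes a pair
// {NumElements, Index}, and the flat position accumulates like a row-major
// array offset. All names below are hypothetical, not LLVM APIs.
#include <cstdio>
#include <utility>
#include <vector>

static int flattenInsertIndex(
    int Offset, const std::vector<std::pair<unsigned, unsigned>> &Levels) {
  int Index = Offset;
  for (const auto &L : Levels) {
    Index *= L.first;  // scale by the number of elements at this level
    Index += L.second; // add the index selected at this level
  }
  return Index;
}

int main() {
  // insertvalue into {[4 x float], [4 x float]} at indices {1, 2} flattens
  // to 1 * 4 + 2 == 6 (with a base offset of 0), mirroring the loop above.
  std::printf("%d\n", flattenInsertIndex(0, {{2, 1}, {4, 2}}));
  return 0;
}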
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
@@ -570,7 +613,9 @@ public:
if (MaxVectorRegSizeOption.getNumOccurrences())
MaxVecRegSize = MaxVectorRegSizeOption;
else
- MaxVecRegSize = TTI->getRegisterBitWidth(true);
+ MaxVecRegSize =
+ TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+ .getFixedSize();
if (MinVectorRegSizeOption.getNumOccurrences())
MinVecRegSize = MinVectorRegSizeOption;
@@ -593,7 +638,7 @@ public:
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
- InstructionCost getTreeCost();
+ InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None);
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -621,6 +666,7 @@ public:
BS->clear();
}
MinBWs.clear();
+ InstrElementSize.clear();
}
unsigned getTreeSize() const { return VectorizableTree.size(); }
@@ -937,10 +983,16 @@ public:
ScalarEvolution &SE) {
auto *LI1 = dyn_cast<LoadInst>(V1);
auto *LI2 = dyn_cast<LoadInst>(V2);
- if (LI1 && LI2)
- return isConsecutiveAccess(LI1, LI2, DL, SE)
- ? VLOperands::ScoreConsecutiveLoads
- : VLOperands::ScoreFail;
+ if (LI1 && LI2) {
+ if (LI1->getParent() != LI2->getParent())
+ return VLOperands::ScoreFail;
+
+ Optional<int> Dist = getPointersDiff(
+ LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
+ LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
+ return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads
+ : VLOperands::ScoreFail;
+ }
auto *C1 = dyn_cast<Constant>(V1);
auto *C2 = dyn_cast<Constant>(V2);
@@ -1500,10 +1552,12 @@ public:
private:
/// Checks if all users of \p I are the part of the vectorization tree.
- bool areAllUsersVectorized(Instruction *I) const;
+ bool areAllUsersVectorized(Instruction *I,
+ ArrayRef<Value *> VectorizedVals) const;
/// \returns the cost of the vectorizable entry.
- InstructionCost getEntryCost(TreeEntry *E);
+ InstructionCost getEntryCost(const TreeEntry *E,
+ ArrayRef<Value *> VectorizedVals);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
@@ -1529,6 +1583,14 @@ private:
getGatherCost(FixedVectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const;
+ /// Checks if the gathered \p VL can be represented as shuffle(s) of previous
+ /// tree entries.
+ /// \returns ShuffleKind, if gathered values can be represented as shuffles of
+ /// previous tree entries. \p Mask is filled with the shuffle mask.
+ Optional<TargetTransformInfo::ShuffleKind>
+ isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<const TreeEntry *> &Entries);
+
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
@@ -1536,7 +1598,7 @@ private:
/// Set the Builder insert point to one after the last instruction in
/// the bundle
- void setInsertPointAfterBundle(TreeEntry *E);
+ void setInsertPointAfterBundle(const TreeEntry *E);
/// \returns a vector from a collection of scalars in \p VL.
Value *gather(ArrayRef<Value *> VL);
@@ -1615,7 +1677,7 @@ private:
void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
if (Operands.size() < OpIdx + 1)
Operands.resize(OpIdx + 1);
- assert(Operands[OpIdx].size() == 0 && "Already resized?");
+ assert(Operands[OpIdx].empty() && "Already resized?");
Operands[OpIdx].resize(Scalars.size());
for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
Operands[OpIdx][Lane] = OpVL[Lane];
@@ -1706,6 +1768,17 @@ private:
setOperations(S);
return true;
}
+ /// When ReuseShuffleIndices is empty, returns the position of \p V within
+ /// the vector of Scalars. Otherwise, remaps the position through the reuse
+ /// indices.
+ int findLaneForValue(Value *V) const {
+ unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
+ assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
+ if (!ReuseShuffleIndices.empty()) {
+ FoundLane = std::distance(ReuseShuffleIndices.begin(),
+ find(ReuseShuffleIndices, FoundLane));
+ }
+ return FoundLane;
+ }
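// --- Illustrative sketch (editor's addition, not part of this patch) ---
// How a reuse-shuffle mask remaps a scalar's lane in findLaneForValue,
// reduced to plain STL containers. The values and masks are hypothetical;
// only the index arithmetic mirrors the member function above.
#include <algorithm>
#include <cstdio>
#include <iterator>
#include <vector>

static int findLane(const std::vector<int> &Scalars,
                    const std::vector<int> &ReuseShuffleIndices, int V) {
  // Position of the value inside the vectorized bundle.
  int FoundLane = std::distance(
      Scalars.begin(), std::find(Scalars.begin(), Scalars.end(), V));
  if (!ReuseShuffleIndices.empty()) {
    // The bundle is emitted through a reuse shuffle; the external lane is the
    // position where the original lane first appears in the reuse mask.
    FoundLane = std::distance(
        ReuseShuffleIndices.begin(),
        std::find(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end(),
                  FoundLane));
  }
  return FoundLane;
}

int main() {
  // Scalars {10, 20} reused as <10, 20, 10, 20> via mask {0, 1, 0, 1}:
  // value 20 sits in scalar lane 1, which first appears at mask position 1.
  std::printf("%d\n", findLane({10, 20}, {0, 1, 0, 1}, 20));
  return 0;
}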
#ifndef NDEBUG
/// Debug printer.
@@ -1766,7 +1839,7 @@ private:
};
#ifndef NDEBUG
- void dumpTreeCosts(TreeEntry *E, InstructionCost ReuseShuffleCost,
+ void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
InstructionCost VecCost,
InstructionCost ScalarCost) const {
dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();
@@ -1896,7 +1969,7 @@ private:
bool aliased = true;
if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
// Do the alias check.
- aliased = AA->alias(Loc1, Loc2);
+ aliased = !AA->isNoAlias(Loc1, Loc2);
}
// Store the result in the cache.
result = aliased;
@@ -2547,12 +2620,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
// For each lane:
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
Value *Scalar = Entry->Scalars[Lane];
- int FoundLane = Lane;
- if (!Entry->ReuseShuffleIndices.empty()) {
- FoundLane =
- std::distance(Entry->ReuseShuffleIndices.begin(),
- llvm::find(Entry->ReuseShuffleIndices, FoundLane));
- }
+ int FoundLane = Entry->findLaneForValue(Scalar);
// Check if the scalar is externally used as an extra arg.
auto ExtI = ExternallyUsedValues.find(Scalar);
@@ -2575,6 +2643,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
// instructions. If that is the case, the one in Lane 0 will
// be used.
if (UseScalar != U ||
+ UseEntry->State == TreeEntry::ScatterVectorize ||
!InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
<< ".\n");
@@ -2606,8 +2675,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
+ // Don't handle scalable vectors
+ if (S.getOpcode() == Instruction::ExtractElement &&
+ isa<ScalableVectorType>(
+ cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
// Don't handle vectors.
- if (S.OpValue->getType()->isVectorTy()) {
+ if (S.OpValue->getType()->isVectorTy() &&
+ !isa<InsertElementInst>(S.OpValue)) {
LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
return;
@@ -2764,6 +2843,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Keeps the reordered operands to avoid code duplication.
SmallVector<ValueList, 2> OperandsVec;
for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
+ if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) {
+ ValueList Operands(VL.size(), PoisonValue::get(PH->getType()));
+ TE->setOperand(I, Operands);
+ OperandsVec.push_back(Operands);
+ continue;
+ }
ValueList Operands;
// Prepare the operand vector.
for (Value *V : VL)
@@ -2819,6 +2904,41 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
BS.cancelScheduling(VL, VL0);
return;
}
+ case Instruction::InsertElement: {
+ assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
+
+ // Check that we have a buildvector and not a shuffle of 2 or more
+ // different vectors.
+ ValueSet SourceVectors;
+ for (Value *V : VL)
+ SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
+
+ if (count_if(VL, [&SourceVectors](Value *V) {
+ return !SourceVectors.contains(V);
+ }) >= 2) {
+ // Found 2nd source vector - cancel.
+ LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
+ "different source vectors.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ BS.cancelScheduling(VL, VL0);
+ return;
+ }
+
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx);
+ LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
+
+ constexpr int NumOps = 2;
+ ValueList VectorOperands[NumOps];
+ for (int I = 0; I < NumOps; ++I) {
+ for (Value *V : VL)
+ VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
+
+ TE->setOperand(I, VectorOperands[I]);
+ }
+ buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, 0});
+ return;
+ }
case Instruction::Load: {
// Check that a vectorized load would load the same memory as a scalar
// load. For example, we don't want to vectorize loads that are smaller
@@ -2856,7 +2976,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
OrdersType CurrentOrder;
// Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+ if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
Value *Ptr0;
Value *PtrN;
if (CurrentOrder.empty()) {
@@ -2866,13 +2986,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Ptr0 = PointerOps[CurrentOrder.front()];
PtrN = PointerOps[CurrentOrder.back()];
}
- const SCEV *Scev0 = SE->getSCEV(Ptr0);
- const SCEV *ScevN = SE->getSCEV(PtrN);
- const auto *Diff =
- dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
- uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+ Optional<int> Diff = getPointersDiff(
+ ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
// Check that the sorted loads are consecutive.
- if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
+ if (static_cast<unsigned>(*Diff) == VL.size() - 1) {
if (CurrentOrder.empty()) {
// Original loads are consecutive and do not require reordering.
++NumOpsWantToKeepOriginalOrder;
@@ -2892,13 +3009,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
}
return;
}
- // Vectorizing non-consecutive loads with `llvm.masked.gather`.
- TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
- UserTreeIdx, ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- buildTree_rec(PointerOps, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
- return;
+ Align CommonAlignment = cast<LoadInst>(VL0)->getAlign();
+ for (Value *V : VL)
+ CommonAlignment =
+ commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ if (TTI->isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()),
+ CommonAlignment)) {
+ // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+ TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle,
+ S, UserTreeIdx, ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs()
+ << "SLP: added a vector of non-consecutive loads.\n");
+ return;
+ }
}
LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
@@ -3135,7 +3260,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
OrdersType CurrentOrder;
// Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+ if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
Value *Ptr0;
Value *PtrN;
if (CurrentOrder.empty()) {
@@ -3145,13 +3270,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Ptr0 = PointerOps[CurrentOrder.front()];
PtrN = PointerOps[CurrentOrder.back()];
}
- const SCEV *Scev0 = SE->getSCEV(Ptr0);
- const SCEV *ScevN = SE->getSCEV(PtrN);
- const auto *Diff =
- dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
- uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+ Optional<int> Dist =
+ getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
// Check that the sorted pointer operands are consecutive.
- if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
+ if (static_cast<unsigned>(*Dist) == VL.size() - 1) {
if (CurrentOrder.empty()) {
// Original stores are consecutive and do not require reordering.
++NumOpsWantToKeepOriginalOrder;
@@ -3405,8 +3527,10 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
return ShouldKeepOrder;
}
-bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
- return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) {
+bool BoUpSLP::areAllUsersVectorized(Instruction *I,
+ ArrayRef<Value *> VectorizedVals) const {
+ return (I->hasOneUse() && is_contained(VectorizedVals, I)) ||
+ llvm::all_of(I->users(), [this](User *U) {
return ScalarToTreeEntry.count(U) > 0;
});
}
@@ -3417,7 +3541,16 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount());
+ SmallVector<Type *, 4> VecTys;
+ for (Use &Arg : CI->args())
+ VecTys.push_back(
+ FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
+ FastMathFlags FMF;
+ if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
+ FMF = FPCI->getFastMathFlags();
+ SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
+ IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF,
+ dyn_cast<IntrinsicInst>(CI));
auto IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
@@ -3428,11 +3561,6 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
auto LibCost = IntrinsicCost;
if (!CI->isNoBuiltin() && VecFunc) {
// Calculate the cost of the vector library call.
- SmallVector<Type *, 4> VecTys;
- for (Use &Arg : CI->args())
- VecTys.push_back(
- FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
-
// If the corresponding vector call is cheaper, return its cost.
LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
TTI::TCK_RecipThroughput);
@@ -3440,7 +3568,82 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
return {IntrinsicCost, LibCost};
}
-InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
+/// Compute the cost of creating a vector of type \p VecTy containing the
+/// extracted values from \p VL.
+static InstructionCost
+computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy,
+ TargetTransformInfo::ShuffleKind ShuffleKind,
+ ArrayRef<int> Mask, TargetTransformInfo &TTI) {
+ unsigned NumOfParts = TTI.getNumberOfParts(VecTy);
+
+ if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts ||
+ VecTy->getNumElements() < NumOfParts)
+ return TTI.getShuffleCost(ShuffleKind, VecTy, Mask);
+
+ bool AllConsecutive = true;
+ unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts;
+ unsigned Idx = -1;
+ InstructionCost Cost = 0;
+
+ // Process extracts in blocks of EltsPerVector to check if the source vector
+ // operand can be re-used directly. If not, add the cost of creating a shuffle
+ // to extract the values into a vector register.
+ for (auto *V : VL) {
+ ++Idx;
+
+ // Reached the start of a new vector register.
+ if (Idx % EltsPerVector == 0) {
+ AllConsecutive = true;
+ continue;
+ }
+
+ // Check whether all extracts for the current vector register on the target
+ // directly extract their values in order.
+ unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V));
+ unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1]));
+ AllConsecutive &= PrevIdx + 1 == CurrentIdx &&
+ CurrentIdx % EltsPerVector == Idx % EltsPerVector;
+
+ if (AllConsecutive)
+ continue;
+
+ // Skip all indices, except for the last index per vector block.
+ if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size())
+ continue;
+
+ // If we have a series of extracts which are not consecutive and hence
+ // cannot re-use the source vector register directly, compute the shuffle
+ // cost to extract a vector with EltsPerVector elements.
+ Cost += TTI.getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc,
+ FixedVectorType::get(VecTy->getElementType(), EltsPerVector));
+ }
+ return Cost;
+}
+
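// --- Illustrative sketch (editor's addition, not part of this patch) ---
// The per-register grouping used by computeExtractCost, reduced to plain
// indices: extracts are inspected in blocks of EltsPerVector, and a block
// that is not a consecutive, aligned run of source indices is charged one
// extra single-source shuffle. Names and the example data are hypothetical.
#include <cstdio>
#include <vector>

static unsigned countBlockShuffles(const std::vector<unsigned> &ExtractIdx,
                                   unsigned EltsPerVector) {
  unsigned Shuffles = 0;
  bool AllConsecutive = true;
  for (unsigned Idx = 0; Idx < ExtractIdx.size(); ++Idx) {
    if (Idx % EltsPerVector == 0) // start of a new vector register
      AllConsecutive = true;
    else
      AllConsecutive &= ExtractIdx[Idx - 1] + 1 == ExtractIdx[Idx] &&
                        ExtractIdx[Idx] % EltsPerVector == Idx % EltsPerVector;
    // Charge one shuffle for each finished block that was not consecutive.
    bool BlockEnd =
        (Idx + 1) % EltsPerVector == 0 || Idx + 1 == ExtractIdx.size();
    if (BlockEnd && !AllConsecutive)
      ++Shuffles;
  }
  return Shuffles;
}

int main() {
  // 8 extracts, 4 elements per register: {0,1,2,3} is consecutive and
  // aligned, {6,7,4,5} is not, so exactly one block shuffle is charged.
  std::printf("%u\n", countBlockShuffles({0, 1, 2, 3, 6, 7, 4, 5}, 4));
  return 0;
}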
+/// Shuffles \p Mask in accordance with the given \p SubMask.
+static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) {
+ if (SubMask.empty())
+ return;
+ if (Mask.empty()) {
+ Mask.append(SubMask.begin(), SubMask.end());
+ return;
+ }
+ SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size());
+ int TermValue = std::min(Mask.size(), SubMask.size());
+ for (int I = 0, E = SubMask.size(); I < E; ++I) {
+ if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
+ Mask[SubMask[I]] >= TermValue) {
+ NewMask[I] = UndefMaskElem;
+ continue;
+ }
+ NewMask[I] = Mask[SubMask[I]];
+ }
+ Mask.swap(NewMask);
+}
+
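// --- Illustrative sketch (editor's addition, not part of this patch) ---
// The mask composition performed by addMask above, on plain std::vector<int>.
// kUndef stands in for UndefMaskElem; composing means "first apply Mask, then
// pick lanes by SubMask", i.e. Result[I] = Mask[SubMask[I]]. Behaviorally
// equivalent to the lambda-free helper above, but simplified for exposition.
#include <algorithm>
#include <cstdio>
#include <vector>

constexpr int kUndef = -1; // stand-in for UndefMaskElem

static void composeMask(std::vector<int> &Mask,
                        const std::vector<int> &SubMask) {
  if (SubMask.empty())
    return;
  if (Mask.empty()) {
    Mask = SubMask;
    return;
  }
  std::vector<int> NewMask(SubMask.size(), kUndef);
  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == kUndef || SubMask[I] >= TermValue ||
        Mask[SubMask[I]] >= TermValue)
      continue; // lane stays undefined
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}

int main() {
  // Reorder mask <2,3,0,1> followed by reuse mask <0,0,1,3> selects source
  // lanes <2,2,3,1>.
  std::vector<int> Mask = {2, 3, 0, 1};
  composeMask(Mask, {0, 0, 1, 3});
  for (int M : Mask)
    std::printf("%d ", M); // prints: 2 2 3 1
  std::printf("\n");
  return 0;
}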
+InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
+ ArrayRef<Value *> VectorizedVals) {
ArrayRef<Value*> VL = E->Scalars;
Type *ScalarTy = VL[0]->getType();
@@ -3448,6 +3651,8 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
ScalarTy = SI->getValueOperand()->getType();
else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
ScalarTy = CI->getOperand(0)->getType();
+ else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
+ ScalarTy = IE->getOperand(1)->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
@@ -3456,45 +3661,168 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
if (MinBWs.count(VL[0]))
VecTy = FixedVectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+ auto *FinalVecTy = VecTy;
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- InstructionCost ReuseShuffleCost = 0;
- if (NeedToShuffleReuses) {
- ReuseShuffleCost =
- TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
+ if (NeedToShuffleReuses)
+ FinalVecTy =
+ FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers);
+ // FIXME: this is a workaround for a problem with the MSVC buildbots.
+ TargetTransformInfo &TTIRef = *TTI;
+ auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy,
+ VectorizedVals](InstructionCost &Cost,
+ bool IsGather) {
+ DenseMap<Value *, int> ExtractVectorsTys;
+ for (auto *V : VL) {
+ // If all users of the instruction are going to be vectorized and this
+ // instruction itself is not going to be vectorized, consider this
+ // instruction as dead and remove its cost from the final cost of the
+ // vectorized tree.
+ if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) ||
+ (IsGather && ScalarToTreeEntry.count(V)))
+ continue;
+ auto *EE = cast<ExtractElementInst>(V);
+ unsigned Idx = *getExtractIndex(EE);
+ if (TTIRef.getNumberOfParts(VecTy) !=
+ TTIRef.getNumberOfParts(EE->getVectorOperandType())) {
+ auto It =
+ ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first;
+ It->getSecond() = std::min<int>(It->second, Idx);
+ }
+ // Take credit for instruction that will become dead.
+ if (EE->hasOneUse()) {
+ Instruction *Ext = EE->user_back();
+ if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ all_of(Ext->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ Cost -=
+ TTIRef.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
+ EE->getVectorOperandType(), Idx);
+ // Add back the cost of s|zext which is subtracted separately.
+ Cost += TTIRef.getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), EE->getType(),
+ TTI::getCastContextHint(Ext), CostKind, Ext);
+ continue;
+ }
+ }
+ Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement,
+ EE->getVectorOperandType(), Idx);
+ }
+ // Add a cost for subvector extracts/inserts if required.
+ for (const auto &Data : ExtractVectorsTys) {
+ auto *EEVTy = cast<FixedVectorType>(Data.first->getType());
+ unsigned NumElts = VecTy->getNumElements();
+ if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) {
+ unsigned Idx = (Data.second / NumElts) * NumElts;
+ unsigned EENumElts = EEVTy->getNumElements();
+ if (Idx + NumElts <= EENumElts) {
+ Cost +=
+ TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ EEVTy, None, Idx, VecTy);
+ } else {
+ // Need to round up the subvector type vectorization factor to avoid a
+ // crash in cost model functions. Make SubVT so that Idx + VF of SubVT
+ // <= EENumElts.
+ auto *SubVT =
+ FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx);
+ Cost +=
+ TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
+ EEVTy, None, Idx, SubVT);
+ }
+ } else {
+ Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector,
+ VecTy, None, 0, EEVTy);
+ }
+ }
+ };
if (E->State == TreeEntry::NeedToGather) {
if (allConstant(VL))
return 0;
- if (isSplat(VL)) {
- return ReuseShuffleCost +
- TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+ if (isa<InsertElementInst>(VL[0]))
+ return InstructionCost::getInvalid();
+ SmallVector<int> Mask;
+ SmallVector<const TreeEntry *> Entries;
+ Optional<TargetTransformInfo::ShuffleKind> Shuffle =
+ isGatherShuffledEntry(E, Mask, Entries);
+ if (Shuffle.hasValue()) {
+ InstructionCost GatherCost = 0;
+ if (ShuffleVectorInst::isIdentityMask(Mask)) {
+ // Perfect match in the graph, will reuse the previously vectorized
+ // node. Cost is 0.
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: perfect diamond match for gather bundle that starts with "
+ << *VL.front() << ".\n");
+ if (NeedToShuffleReuses)
+ GatherCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ FinalVecTy, E->ReuseShuffleIndices);
+ } else {
+ LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size()
+ << " entries for bundle that starts with "
+ << *VL.front() << ".\n");
+ // Detected that instead of gather we can emit a shuffle of single/two
+ // previously vectorized nodes. Add the cost of the permutation rather
+ // than gather.
+ ::addMask(Mask, E->ReuseShuffleIndices);
+ GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask);
+ }
+ return GatherCost;
}
- if (E->getOpcode() == Instruction::ExtractElement &&
- allSameType(VL) && allSameBlock(VL)) {
- Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
+ if (isSplat(VL)) {
+ // Found a broadcast of a single scalar; calculate the cost as a
+ // broadcast.
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy);
+ }
+ if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) &&
+ allSameBlock(VL) &&
+ !isa<ScalableVectorType>(
+ cast<ExtractElementInst>(E->getMainOp())->getVectorOperandType())) {
+ // Check whether the gather of extractelements can be represented as just
+ // a shuffle of the single/two vectors the scalars are extracted from.
+ SmallVector<int> Mask;
+ Optional<TargetTransformInfo::ShuffleKind> ShuffleKind =
+ isShuffle(VL, Mask);
if (ShuffleKind.hasValue()) {
+ // Found a bunch of extractelement instructions that must be gathered
+ // into a vector and can be represented as a permutation of elements in a
+ // single input vector or in 2 input vectors.
InstructionCost Cost =
- TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
- for (auto *V : VL) {
- // If all users of instruction are going to be vectorized and this
- // instruction itself is not going to be vectorized, consider this
- // instruction as dead and remove its cost from the final cost of the
- // vectorized tree.
- if (areAllUsersVectorized(cast<Instruction>(V)) &&
- !ScalarToTreeEntry.count(V)) {
- auto *IO = cast<ConstantInt>(
- cast<ExtractElementInst>(V)->getIndexOperand());
- Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
- IO->getZExtValue());
- }
- }
- return ReuseShuffleCost + Cost;
- }
- }
+ computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI);
+ AdjustExtractsCost(Cost, /*IsGather=*/true);
+ if (NeedToShuffleReuses)
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+ FinalVecTy, E->ReuseShuffleIndices);
+ return Cost;
+ }
+ }
+ InstructionCost ReuseShuffleCost = 0;
+ if (NeedToShuffleReuses)
+ ReuseShuffleCost = TTI->getShuffleCost(
+ TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices);
return ReuseShuffleCost + getGatherCost(VL);
}
+ InstructionCost CommonCost = 0;
+ SmallVector<int> Mask;
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int> NewMask;
+ if (E->getOpcode() == Instruction::Store) {
+ // For stores the order is actually a mask.
+ NewMask.resize(E->ReorderIndices.size());
+ copy(E->ReorderIndices, NewMask.begin());
+ } else {
+ inversePermutation(E->ReorderIndices, NewMask);
+ }
+ ::addMask(Mask, NewMask);
+ }
+ if (NeedToShuffleReuses)
+ ::addMask(Mask, E->ReuseShuffleIndices);
+ if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask))
+ CommonCost =
+ TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize) &&
"Unhandled state");
@@ -3508,46 +3836,39 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::ExtractValue:
case Instruction::ExtractElement: {
+ // The common cost of removing the ExtractElement/ExtractValue instructions
+ // plus the cost of shuffles, if required to reshuffle the original vector.
if (NeedToShuffleReuses) {
unsigned Idx = 0;
for (unsigned I : E->ReuseShuffleIndices) {
if (ShuffleOrOp == Instruction::ExtractElement) {
- auto *IO = cast<ConstantInt>(
- cast<ExtractElementInst>(VL[I])->getIndexOperand());
- Idx = IO->getZExtValue();
- ReuseShuffleCost -= TTI->getVectorInstrCost(
- Instruction::ExtractElement, VecTy, Idx);
+ auto *EE = cast<ExtractElementInst>(VL[I]);
+ CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+ EE->getVectorOperandType(),
+ *getExtractIndex(EE));
} else {
- ReuseShuffleCost -= TTI->getVectorInstrCost(
- Instruction::ExtractElement, VecTy, Idx);
+ CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement,
+ VecTy, Idx);
++Idx;
}
}
Idx = ReuseShuffleNumbers;
for (Value *V : VL) {
if (ShuffleOrOp == Instruction::ExtractElement) {
- auto *IO = cast<ConstantInt>(
- cast<ExtractElementInst>(V)->getIndexOperand());
- Idx = IO->getZExtValue();
+ auto *EE = cast<ExtractElementInst>(V);
+ CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+ EE->getVectorOperandType(),
+ *getExtractIndex(EE));
} else {
--Idx;
+ CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement,
+ VecTy, Idx);
}
- ReuseShuffleCost +=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
}
}
- InstructionCost DeadCost = ReuseShuffleCost;
- if (!E->ReorderIndices.empty()) {
- // TODO: Merge this shuffle with the ReuseShuffleCost.
- DeadCost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
- for (unsigned I = 0, E = VL.size(); I < E; ++I) {
- Instruction *EI = cast<Instruction>(VL[I]);
- // If all users are going to be vectorized, instruction can be
- // considered as dead.
- // The same, if have only one user, it will be vectorized for sure.
- if (areAllUsersVectorized(EI)) {
+ if (ShuffleOrOp == Instruction::ExtractValue) {
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = cast<Instruction>(VL[I]);
// Take credit for instruction that will become dead.
if (EI->hasOneUse()) {
Instruction *Ext = EI->user_back();
@@ -3556,20 +3877,66 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
[](User *U) { return isa<GetElementPtrInst>(U); })) {
// Use getExtractWithExtendCost() to calculate the cost of
// extractelement/ext pair.
- DeadCost -= TTI->getExtractWithExtendCost(
+ CommonCost -= TTI->getExtractWithExtendCost(
Ext->getOpcode(), Ext->getType(), VecTy, I);
// Add back the cost of s|zext which is subtracted separately.
- DeadCost += TTI->getCastInstrCost(
+ CommonCost += TTI->getCastInstrCost(
Ext->getOpcode(), Ext->getType(), EI->getType(),
TTI::getCastContextHint(Ext), CostKind, Ext);
continue;
}
}
- DeadCost -=
+ CommonCost -=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
}
+ } else {
+ AdjustExtractsCost(CommonCost, /*IsGather=*/false);
+ }
+ return CommonCost;
+ }
+ case Instruction::InsertElement: {
+ auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
+
+ unsigned const NumElts = SrcVecTy->getNumElements();
+ unsigned const NumScalars = VL.size();
+ APInt DemandedElts = APInt::getNullValue(NumElts);
+ // TODO: Add support for Instruction::InsertValue.
+ unsigned Offset = UINT_MAX;
+ bool IsIdentity = true;
+ SmallVector<int> ShuffleMask(NumElts, UndefMaskElem);
+ for (unsigned I = 0; I < NumScalars; ++I) {
+ Optional<int> InsertIdx = getInsertIndex(VL[I], 0);
+ if (!InsertIdx || *InsertIdx == UndefMaskElem)
+ continue;
+ unsigned Idx = *InsertIdx;
+ DemandedElts.setBit(Idx);
+ if (Idx < Offset) {
+ Offset = Idx;
+ IsIdentity &= I == 0;
+ } else {
+ assert(Idx >= Offset && "Failed to find vector index offset");
+ IsIdentity &= Idx - Offset == I;
+ }
+ ShuffleMask[Idx] = I;
+ }
+ assert(Offset < NumElts && "Failed to find vector index offset");
+
+ InstructionCost Cost = 0;
+ Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
+ /*Insert*/ true, /*Extract*/ false);
+
+ if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) {
+ // FIXME: Replace with SK_InsertSubvector once it is properly supported.
+ unsigned Sz = PowerOf2Ceil(Offset + NumScalars);
+ Cost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc,
+ FixedVectorType::get(SrcVecTy->getElementType(), Sz));
+ } else if (!IsIdentity) {
+ Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy,
+ ShuffleMask);
}
- return DeadCost;
+
+ return Cost;
}
case Instruction::ZExt:
case Instruction::SExt:
@@ -3588,7 +3955,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
TTI::getCastContextHint(VL0), CostKind, VL0);
if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
// Calculate the cost of this instruction.
@@ -3598,12 +3965,11 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
InstructionCost VecCost = 0;
// Check if the values are candidates to demote.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
- VecCost =
- ReuseShuffleCost +
- TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
- TTI::getCastContextHint(VL0), CostKind, VL0);
+ VecCost = CommonCost + TTI->getCastInstrCost(
+ E->getOpcode(), VecTy, SrcVecTy,
+ TTI::getCastContextHint(VL0), CostKind, VL0);
}
- LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
return VecCost - ScalarCost;
}
case Instruction::FCmp:
@@ -3614,7 +3980,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
@@ -3655,8 +4021,8 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
CmpInst::BAD_ICMP_PREDICATE, CostKind);
VecCost = std::min(VecCost, IntrinsicCost);
}
- LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return ReuseShuffleCost + VecCost - ScalarCost;
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+ return CommonCost + VecCost - ScalarCost;
}
case Instruction::FNeg:
case Instruction::Add:
@@ -3719,14 +4085,14 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
Op2VK, Op1VP, Op2VP, Operands, VL0);
if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecCost =
TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
Op2VK, Op1VP, Op2VP, Operands, VL0);
- LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return ReuseShuffleCost + VecCost - ScalarCost;
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+ return CommonCost + VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
TargetTransformInfo::OperandValueKind Op1VK =
@@ -3737,40 +4103,39 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecCost = TTI->getArithmeticInstrCost(
Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
- LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return ReuseShuffleCost + VecCost - ScalarCost;
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+ return CommonCost + VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
- Align alignment = cast<LoadInst>(VL0)->getAlign();
+ Align Alignment = cast<LoadInst>(VL0)->getAlign();
InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
- Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0);
+ Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0);
if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecLdCost;
if (E->State == TreeEntry::Vectorize) {
- VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
+ VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0,
CostKind, VL0);
} else {
assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+ Align CommonAlignment = Alignment;
+ for (Value *V : VL)
+ CommonAlignment =
+ commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
VecLdCost = TTI->getGatherScatterOpCost(
Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
- /*VariableMask=*/false, alignment, CostKind, VL0);
+ /*VariableMask=*/false, CommonAlignment, CostKind, VL0);
}
- if (!E->ReorderIndices.empty()) {
- // TODO: Merge this shuffle with the ReuseShuffleCost.
- VecLdCost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
- LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
- return ReuseShuffleCost + VecLdCost - ScalarLdCost;
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
+ return CommonCost + VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
@@ -3780,29 +4145,22 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
Align Alignment = SI->getAlign();
InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
- if (NeedToShuffleReuses)
- ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecStCost = TTI->getMemoryOpCost(
Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
- if (IsReorder) {
- // TODO: Merge this shuffle with the ReuseShuffleCost.
- VecStCost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
- LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
- return ReuseShuffleCost + VecStCost - ScalarStCost;
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost));
+ return CommonCost + VecStCost - ScalarStCost;
}
case Instruction::Call: {
CallInst *CI = cast<CallInst>(VL0);
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1);
+ IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
InstructionCost ScalarEltCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
@@ -3814,7 +4172,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
<< " for " << *CI << "\n");
- return ReuseShuffleCost + VecCallCost - ScalarCallCost;
+ return CommonCost + VecCallCost - ScalarCallCost;
}
case Instruction::ShuffleVector: {
assert(E->isAltShuffle() &&
@@ -3827,11 +4185,11 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
if (NeedToShuffleReuses) {
for (unsigned Idx : E->ReuseShuffleIndices) {
Instruction *I = cast<Instruction>(VL[Idx]);
- ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
+ CommonCost -= TTI->getInstructionCost(I, CostKind);
}
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
- ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
+ CommonCost += TTI->getInstructionCost(I, CostKind);
}
}
for (Value *V : VL) {
@@ -3856,9 +4214,17 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
TTI::CastContextHint::None, CostKind);
}
- VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
- LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return ReuseShuffleCost + VecCost - ScalarCost;
+
+ SmallVector<int> Mask(E->Scalars.size());
+ for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) {
+ auto *OpInst = cast<Instruction>(E->Scalars[I]);
+ assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
+ Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? End : 0);
+ }
+ VecCost +=
+ TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0);
+ LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost));
+ return CommonCost + VecCost - ScalarCost;
}
default:
llvm_unreachable("Unknown instruction");
@@ -3877,10 +4243,20 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
if (VectorizableTree.size() != 2)
return false;
- // Handle splat and all-constants stores.
+ // Handle splat and all-constants stores. Also try to vectorize tiny trees
+ // whose second node is a gather with fewer scalar operands than the first
+ // node (it may be profitable to shuffle the second gather), or whose
+ // scalars are extractelements that form a shuffle.
+ SmallVector<int> Mask;
if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
(allConstant(VectorizableTree[1]->Scalars) ||
- isSplat(VectorizableTree[1]->Scalars)))
+ isSplat(VectorizableTree[1]->Scalars) ||
+ (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+ VectorizableTree[1]->Scalars.size() <
+ VectorizableTree[0]->Scalars.size()) ||
+ (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+ VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
+ isShuffle(VectorizableTree[1]->Scalars, Mask))))
return true;
// Gathering cost would be too much for tiny trees.
@@ -3892,21 +4268,27 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
}
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
- TargetTransformInfo *TTI) {
+ TargetTransformInfo *TTI,
+ bool MustMatchOrInst) {
// Look past the root to find a source value. Arbitrarily follow the
// path through operand 0 of any 'or'. Also, peek through optional
// shift-left-by-multiple-of-8-bits.
Value *ZextLoad = Root;
const APInt *ShAmtC;
+ bool FoundOr = false;
while (!isa<ConstantExpr>(ZextLoad) &&
(match(ZextLoad, m_Or(m_Value(), m_Value())) ||
(match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
- ShAmtC->urem(8) == 0)))
- ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
-
+ ShAmtC->urem(8) == 0))) {
+ auto *BinOp = cast<BinaryOperator>(ZextLoad);
+ ZextLoad = BinOp->getOperand(0);
+ if (BinOp->getOpcode() == Instruction::Or)
+ FoundOr = true;
+ }
// Check if the input is an extended load of the required or/shift expression.
Value *LoadPtr;
- if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+ if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
+ !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
return false;
// Require that the total load bit width is a legal integer type.
@@ -3931,7 +4313,8 @@ bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
unsigned NumElts = VectorizableTree[0]->Scalars.size();
Value *FirstReduced = VectorizableTree[0]->Scalars[0];
- return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
+ return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
+ /* MatchOr */ false);
}
bool BoUpSLP::isLoadCombineCandidate() const {
@@ -3941,13 +4324,19 @@ bool BoUpSLP::isLoadCombineCandidate() const {
for (Value *Scalar : VectorizableTree[0]->Scalars) {
Value *X;
if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
- !isLoadCombineCandidateImpl(X, NumElts, TTI))
+ !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
return false;
}
return true;
}
bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
+ // No need to vectorize inserts of gathered values.
+ if (VectorizableTree.size() == 2 &&
+ isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
+ VectorizableTree[1]->State == TreeEntry::NeedToGather)
+ return true;
+
// We can vectorize the tree if its size is greater than or equal to the
// minimum size specified by the MinTreeSize command line option.
if (VectorizableTree.size() >= MinTreeSize)
@@ -3991,8 +4380,16 @@ InstructionCost BoUpSLP::getSpillCost() const {
continue;
OrderedScalars.push_back(Inst);
}
- llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) {
- return DT->dominates(B, A);
+ llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
+ auto *NodeA = DT->getNode(A->getParent());
+ auto *NodeB = DT->getNode(B->getParent());
+ assert(NodeA && "Should only process reachable instructions");
+ assert(NodeB && "Should only process reachable instructions");
+ assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeA != NodeB)
+ return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
+ return B->comesBefore(A);
});
for (Instruction *Inst : OrderedScalars) {
@@ -4038,8 +4435,12 @@ InstructionCost BoUpSLP::getSpillCost() const {
if (NumCalls) {
SmallVector<Type*, 4> V;
- for (auto *II : LiveValues)
- V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
+ for (auto *II : LiveValues) {
+ auto *ScalarTy = II->getType();
+ if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
+ ScalarTy = VectorTy->getElementType();
+ V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
+ }
Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
}
@@ -4049,7 +4450,7 @@ InstructionCost BoUpSLP::getSpillCost() const {
return Cost;
}
-InstructionCost BoUpSLP::getTreeCost() {
+InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
@@ -4059,28 +4460,7 @@ InstructionCost BoUpSLP::getTreeCost() {
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I].get();
- // We create duplicate tree entries for gather sequences that have multiple
- // uses. However, we should not compute the cost of duplicate sequences.
- // For example, if we have a build vector (i.e., insertelement sequence)
- // that is used by more than one vector instruction, we only need to
- // compute the cost of the insertelement instructions once. The redundant
- // instructions will be eliminated by CSE.
- //
- // We should consider not creating duplicate tree entries for gather
- // sequences, and instead add additional edges to the tree representing
- // their uses. Since such an approach results in fewer total entries,
- // existing heuristics based on tree size may yield different results.
- //
- if (TE.State == TreeEntry::NeedToGather &&
- std::any_of(std::next(VectorizableTree.begin(), I + 1),
- VectorizableTree.end(),
- [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
- return EntryPtr->State == TreeEntry::NeedToGather &&
- EntryPtr->isSame(TE.Scalars);
- }))
- continue;
-
- InstructionCost C = getEntryCost(&TE);
+ InstructionCost C = getEntryCost(&TE, VectorizedVals);
Cost += C;
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for bundle that starts with " << *TE.Scalars[0]
@@ -4090,6 +4470,10 @@ InstructionCost BoUpSLP::getTreeCost() {
SmallPtrSet<Value *, 16> ExtractCostCalculated;
InstructionCost ExtractCost = 0;
+ SmallVector<unsigned> VF;
+ SmallVector<SmallVector<int>> ShuffleMask;
+ SmallVector<Value *> FirstUsers;
+ SmallVector<APInt> DemandedElts;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!ExtractCostCalculated.insert(EU.Scalar).second)
@@ -4101,6 +4485,57 @@ InstructionCost BoUpSLP::getTreeCost() {
if (EphValues.count(EU.User))
continue;
+ // No extract cost for vector "scalar"
+ if (isa<FixedVectorType>(EU.Scalar->getType()))
+ continue;
+
+ // The cost of external uses was already counted when adjusting the cost
+ // for extractelements, no need to add it again.
+ if (isa<ExtractElementInst>(EU.Scalar))
+ continue;
+
+ // If the found user is an insertelement, do not calculate the extract cost
+ // but try to detect it as a final shuffled/identity match.
+ if (EU.User && isa<InsertElementInst>(EU.User)) {
+ if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) {
+ Optional<int> InsertIdx = getInsertIndex(EU.User, 0);
+ if (!InsertIdx || *InsertIdx == UndefMaskElem)
+ continue;
+ Value *VU = EU.User;
+ auto *It = find_if(FirstUsers, [VU](Value *V) {
+ // Checks if 2 insertelements are from the same buildvector.
+ if (VU->getType() != V->getType())
+ return false;
+ auto *IE1 = cast<InsertElementInst>(VU);
+ auto *IE2 = cast<InsertElementInst>(V);
+ // Walk the chain of insertelement instructions trying to find either VU
+ // as the original vector for IE2 or V as the original vector for IE1.
+ do {
+ if (IE1 == VU || IE2 == V)
+ return true;
+ if (IE1)
+ IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0));
+ if (IE2)
+ IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0));
+ } while (IE1 || IE2);
+ return false;
+ });
+ int VecId = -1;
+ if (It == FirstUsers.end()) {
+ VF.push_back(FTy->getNumElements());
+ ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
+ FirstUsers.push_back(EU.User);
+ DemandedElts.push_back(APInt::getNullValue(VF.back()));
+ VecId = FirstUsers.size() - 1;
+ } else {
+ VecId = std::distance(FirstUsers.begin(), It);
+ }
+ int Idx = *InsertIdx;
+ ShuffleMask[VecId][Idx] = EU.Lane;
+ DemandedElts[VecId].setBit(Idx);
+ }
+ }
+
// If we plan to rewrite the tree in a smaller type, we will need to sign
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
@@ -4121,6 +4556,48 @@ InstructionCost BoUpSLP::getTreeCost() {
InstructionCost SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
+ for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
+ // For the very first element - simple shuffle of the source vector.
+ int Limit = ShuffleMask[I].size() * 2;
+ if (I == 0 &&
+ all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) &&
+ !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) {
+ InstructionCost C = TTI->getShuffleCost(
+ TTI::SK_PermuteSingleSrc,
+ cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of insertelement external users "
+ << *VectorizableTree.front()->Scalars.front() << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ Cost += C;
+ continue;
+ }
+ // Other elements - a permutation of 2 vectors (the initial one and the
+ // I-th incoming vector).
+ unsigned VF = ShuffleMask[I].size();
+ for (unsigned Idx = 0; Idx < VF; ++Idx) {
+ int &Mask = ShuffleMask[I][Idx];
+ Mask = Mask == UndefMaskElem ? Idx : VF + Mask;
+ }
+ InstructionCost C = TTI->getShuffleCost(
+ TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()),
+ ShuffleMask[I]);
+ LLVM_DEBUG(
+ dbgs()
+ << "SLP: Adding cost " << C
+ << " for final shuffle of vector node and external insertelement users "
+ << *VectorizableTree.front()->Scalars.front() << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ Cost += C;
+ InstructionCost InsertCost = TTI->getScalarizationOverhead(
+ cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
+ /*Insert*/ true,
+ /*Extract*/ false);
+ Cost -= InsertCost;
+ LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
+ << " for insertelements gather.\n"
+ << "SLP: Current total cost = " << Cost << "\n");
+ }
#ifndef NDEBUG
SmallString<256> Str;
@@ -4138,6 +4615,146 @@ InstructionCost BoUpSLP::getTreeCost() {
return Cost;
}
+Optional<TargetTransformInfo::ShuffleKind>
+BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<const TreeEntry *> &Entries) {
+ // TODO: currently checking only for Scalars in the tree entry, need to count
+ // reused elements too for better cost estimation.
+ Mask.assign(TE->Scalars.size(), UndefMaskElem);
+ Entries.clear();
+ // Build a map from values to the tree entries that contain them.
+ DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs;
+ for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) {
+ if (EntryPtr.get() == TE)
+ break;
+ if (EntryPtr->State != TreeEntry::NeedToGather)
+ continue;
+ for (Value *V : EntryPtr->Scalars)
+ ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get());
+ }
+ // Find all tree entries used by the gathered values. If no common entries
+ // are found, this is not a shuffle.
+ // Here we build a set of tree nodes for each gathered value and try to find
+ // the intersection between these sets. If we have at least one common tree
+ // node for every gathered value, we have just a permutation of a single
+ // vector. If we end up with 2 different sets, we have a permutation of 2
+ // input vectors.
+ SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
+ DenseMap<Value *, int> UsedValuesEntry;
+ for (Value *V : TE->Scalars) {
+ if (isa<UndefValue>(V))
+ continue;
+ // Build a list of tree entries where V is used.
+ SmallPtrSet<const TreeEntry *, 4> VToTEs;
+ auto It = ValueToTEs.find(V);
+ if (It != ValueToTEs.end())
+ VToTEs = It->second;
+ if (const TreeEntry *VTE = getTreeEntry(V))
+ VToTEs.insert(VTE);
+ if (VToTEs.empty())
+ return None;
+ if (UsedTEs.empty()) {
+ // The first iteration, just insert the list of nodes to vector.
+ UsedTEs.push_back(VToTEs);
+ } else {
+ // Need to check if there are any previously used tree nodes which use V.
+ // If there are no such nodes, consider that we have another input
+ // vector.
+ SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
+ unsigned Idx = 0;
+ for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
+ // Do we have a non-empty intersection of previously listed tree entries
+ // and tree entries using current V?
+ set_intersect(VToTEs, Set);
+ if (!VToTEs.empty()) {
+ // Yes, write the new subset and continue analysis for the next
+ // scalar.
+ Set.swap(VToTEs);
+ break;
+ }
+ VToTEs = SavedVToTEs;
+ ++Idx;
+ }
+ // No non-empty intersection found - need to add a second set of possible
+ // source vectors.
+ if (Idx == UsedTEs.size()) {
+ // If the number of input vectors is greater than 2 - not a permutation,
+ // fall back to a regular gather.
+ if (UsedTEs.size() == 2)
+ return None;
+ UsedTEs.push_back(SavedVToTEs);
+ Idx = UsedTEs.size() - 1;
+ }
+ UsedValuesEntry.try_emplace(V, Idx);
+ }
+ }
+
+ unsigned VF = 0;
+ if (UsedTEs.size() == 1) {
+ // Try to find a perfect match in another gather node first.
+ auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) {
+ return EntryPtr->isSame(TE->Scalars);
+ });
+ if (It != UsedTEs.front().end()) {
+ Entries.push_back(*It);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ return TargetTransformInfo::SK_PermuteSingleSrc;
+ }
+ // No perfect match, just shuffle, so choose the first tree node.
+ Entries.push_back(*UsedTEs.front().begin());
+ } else {
+ // Try to find nodes with the same vector factor.
+ assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
+ // FIXME: Shall be replaced by the GetVF function once the non-power-of-2
+ // patch lands.
+ auto &&GetVF = [](const TreeEntry *TE) {
+ if (!TE->ReuseShuffleIndices.empty())
+ return TE->ReuseShuffleIndices.size();
+ return TE->Scalars.size();
+ };
+ DenseMap<int, const TreeEntry *> VFToTE;
+ for (const TreeEntry *TE : UsedTEs.front())
+ VFToTE.try_emplace(GetVF(TE), TE);
+ for (const TreeEntry *TE : UsedTEs.back()) {
+ auto It = VFToTE.find(GetVF(TE));
+ if (It != VFToTE.end()) {
+ VF = It->first;
+ Entries.push_back(It->second);
+ Entries.push_back(TE);
+ break;
+ }
+ }
+    // No two source vectors with the same vector factor - give up and do a
+    // regular gather.
+ if (Entries.empty())
+ return None;
+ }
+
+ // Build a shuffle mask for better cost estimation and vector emission.
+ for (int I = 0, E = TE->Scalars.size(); I < E; ++I) {
+ Value *V = TE->Scalars[I];
+ if (isa<UndefValue>(V))
+ continue;
+ unsigned Idx = UsedValuesEntry.lookup(V);
+ const TreeEntry *VTE = Entries[Idx];
+ int FoundLane = VTE->findLaneForValue(V);
+ Mask[I] = Idx * VF + FoundLane;
+ // Extra check required by isSingleSourceMaskImpl function (called by
+ // ShuffleVectorInst::isSingleSourceMask).
+ if (Mask[I] >= 2 * E)
+ return None;
+ }
+ switch (Entries.size()) {
+ case 1:
+ return TargetTransformInfo::SK_PermuteSingleSrc;
+ case 2:
+ return TargetTransformInfo::SK_PermuteTwoSrc;
+ default:
+ break;
+ }
+ return None;
+}
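// A minimal standalone sketch (not part of this diff) of the set-intersection
// idea used by isGatherShuffledEntry above, restated with plain STL containers
// instead of TreeEntry sets. It returns 1 for a single-source permutation,
// 2 for a two-source permutation, and 0 when more than two source vectors
// would be required.
#include <algorithm>
#include <iterator>
#include <set>
#include <vector>

static unsigned countPermutationSources(
    const std::vector<std::set<int>> &SourcesPerScalar) {
  std::vector<std::set<int>> UsedSets;
  for (const std::set<int> &Sources : SourcesPerScalar) {
    if (Sources.empty())
      return 0; // A scalar with no source vector cannot form a permutation.
    bool Merged = false;
    for (std::set<int> &Used : UsedSets) {
      std::set<int> Common;
      std::set_intersection(Sources.begin(), Sources.end(), Used.begin(),
                            Used.end(), std::inserter(Common, Common.begin()));
      if (!Common.empty()) {
        Used.swap(Common); // Narrow the candidate set and move on.
        Merged = true;
        break;
      }
    }
    if (!Merged) {
      if (UsedSets.size() == 2)
        return 0; // A third source vector: fall back to a regular gather.
      UsedSets.push_back(Sources);
    }
  }
  return UsedSets.size();
}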
+
InstructionCost
BoUpSLP::getGatherCost(FixedVectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const {
@@ -4168,6 +4785,8 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Iterate in reverse order to consider insert elements with the high cost.
for (unsigned I = VL.size(); I > 0; --I) {
unsigned Idx = I - 1;
+ if (isConstant(VL[Idx]))
+ continue;
if (!UniqueElements.insert(VL[Idx]).second)
ShuffledElements.insert(Idx);
}
@@ -4191,7 +4810,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
Right = Ops.getVL(1);
}
-void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
+void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
// Get the basic block this bundle is in. All instructions in the bundle
// should be in this block.
auto *Front = E->getMainOp();
@@ -4253,83 +4872,224 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
}
Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
- Value *Val0 =
- isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
- FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
- Value *Vec = PoisonValue::get(VecTy);
- unsigned InsIndex = 0;
- for (Value *Val : VL) {
- Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++));
+  // List of instructions/lanes from the current block and/or the blocks that
+  // are part of the current loop. These instructions will be inserted at the
+  // end to make it possible to optimize loops and hoist invariant
+  // instructions out of the loop body with better chances for success.
+ SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
+ SmallSet<int, 4> PostponedIndices;
+ Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
+ auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
+ SmallPtrSet<BasicBlock *, 4> Visited;
+ while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
+ InsertBB = InsertBB->getSinglePredecessor();
+ return InsertBB && InsertBB == InstBB;
+ };
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ if (auto *Inst = dyn_cast<Instruction>(VL[I]))
+ if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
+ getTreeEntry(Inst) || (L && (L->contains(Inst)))) &&
+ PostponedIndices.insert(I).second)
+ PostponedInsts.emplace_back(Inst, I);
+ }
+
+ auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos) {
+ Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(Pos));
auto *InsElt = dyn_cast<InsertElementInst>(Vec);
if (!InsElt)
- continue;
+ return Vec;
GatherSeq.insert(InsElt);
CSEBlocks.insert(InsElt->getParent());
// Add to our 'need-to-extract' list.
- if (TreeEntry *Entry = getTreeEntry(Val)) {
+ if (TreeEntry *Entry = getTreeEntry(V)) {
// Find which lane we need to extract.
- unsigned FoundLane = std::distance(Entry->Scalars.begin(),
- find(Entry->Scalars, Val));
- assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane");
- if (!Entry->ReuseShuffleIndices.empty()) {
- FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(),
- find(Entry->ReuseShuffleIndices, FoundLane));
- }
- ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane));
+ unsigned FoundLane = Entry->findLaneForValue(V);
+ ExternalUses.emplace_back(V, InsElt, FoundLane);
+ }
+ return Vec;
+ };
+ Value *Val0 =
+ isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
+ FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
+ Value *Vec = PoisonValue::get(VecTy);
+ SmallVector<int> NonConsts;
+  // Insert constant values first.
+ for (int I = 0, E = VL.size(); I < E; ++I) {
+ if (PostponedIndices.contains(I))
+ continue;
+ if (!isConstant(VL[I])) {
+ NonConsts.push_back(I);
+ continue;
}
+ Vec = CreateInsertElement(Vec, VL[I], I);
}
+ // Insert non-constant values.
+ for (int I : NonConsts)
+ Vec = CreateInsertElement(Vec, VL[I], I);
+  // Append instructions which are or may be part of the loop at the end, to
+  // make it possible to hoist non-loop-based instructions.
+ for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
+ Vec = CreateInsertElement(Vec, Pair.first, Pair.second);
return Vec;
}
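// A minimal standalone sketch (not part of this diff) of the insertion-order
// policy gather() follows above: constants first, remaining values next, and
// values that live in the current loop last, so that loop-invariant inserts
// can later be hoisted. The IsConstant/IsInLoop flags are stand-in assumptions
// for the real checks performed by the pass.
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

struct GatherElt {
  std::string Name;
  bool IsConstant;
  bool IsInLoop;
};

static std::vector<std::pair<std::string, std::size_t>>
gatherInsertionOrder(const std::vector<GatherElt> &VL) {
  std::vector<std::pair<std::string, std::size_t>> Order;
  std::vector<std::size_t> NonConsts, Postponed;
  for (std::size_t I = 0, E = VL.size(); I != E; ++I) {
    if (VL[I].IsInLoop) {
      Postponed.push_back(I);
      continue;
    }
    if (!VL[I].IsConstant) {
      NonConsts.push_back(I);
      continue;
    }
    Order.emplace_back(VL[I].Name, I); // Constants are inserted first.
  }
  for (std::size_t I : NonConsts)
    Order.emplace_back(VL[I].Name, I);
  for (std::size_t I : Postponed)
    Order.emplace_back(VL[I].Name, I); // Loop-resident values go last.
  return Order;
}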
+namespace {
+/// Merges shuffle masks and emits final shuffle instruction, if required.
+class ShuffleInstructionBuilder {
+ IRBuilderBase &Builder;
+ const unsigned VF = 0;
+ bool IsFinalized = false;
+ SmallVector<int, 4> Mask;
+
+public:
+ ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF)
+ : Builder(Builder), VF(VF) {}
+
+ /// Adds a mask, inverting it before applying.
+ void addInversedMask(ArrayRef<unsigned> SubMask) {
+ if (SubMask.empty())
+ return;
+ SmallVector<int, 4> NewMask;
+ inversePermutation(SubMask, NewMask);
+ addMask(NewMask);
+ }
+
+  /// Adds a mask, merging it into the single final mask.
+ void addMask(ArrayRef<unsigned> SubMask) {
+ SmallVector<int, 4> NewMask(SubMask.begin(), SubMask.end());
+ addMask(NewMask);
+ }
+
+ void addMask(ArrayRef<int> SubMask) { ::addMask(Mask, SubMask); }
+
+ Value *finalize(Value *V) {
+ IsFinalized = true;
+ unsigned ValueVF = cast<FixedVectorType>(V->getType())->getNumElements();
+ if (VF == ValueVF && Mask.empty())
+ return V;
+ SmallVector<int, 4> NormalizedMask(VF, UndefMaskElem);
+ std::iota(NormalizedMask.begin(), NormalizedMask.end(), 0);
+ addMask(NormalizedMask);
+
+ if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask))
+ return V;
+ return Builder.CreateShuffleVector(V, Mask, "shuffle");
+ }
+
+ ~ShuffleInstructionBuilder() {
+ assert((IsFinalized || Mask.empty()) &&
+ "Shuffle construction must be finalized.");
+ }
+};
+} // namespace
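// A standalone sketch (not part of this diff) of shuffle-mask composition, the
// operation ShuffleInstructionBuilder::addMask accumulates via the file-local
// ::addMask helper defined elsewhere in this file; the exact helper may differ
// in details. -1 stands in for an undefined lane, and the new mask is assumed
// to index into the result produced by the previously accumulated mask.
#include <cstddef>
#include <vector>

static void composeMask(std::vector<int> &Mask,
                        const std::vector<int> &SubMask) {
  if (SubMask.empty())
    return;
  if (Mask.empty()) {
    Mask = SubMask;
    return;
  }
  std::vector<int> NewMask(SubMask.size(), -1);
  for (std::size_t I = 0, E = SubMask.size(); I != E; ++I)
    if (SubMask[I] >= 0 &&
        static_cast<std::size_t>(SubMask[I]) < Mask.size() &&
        Mask[SubMask[I]] >= 0)
      NewMask[I] = Mask[SubMask[I]];
  Mask.swap(NewMask);
}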
+
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
+ unsigned VF = VL.size();
InstructionsState S = getSameOpcode(VL);
if (S.getOpcode()) {
- if (TreeEntry *E = getTreeEntry(S.OpValue)) {
+ if (TreeEntry *E = getTreeEntry(S.OpValue))
if (E->isSame(VL)) {
Value *V = vectorizeTree(E);
- if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
- // We need to get the vectorized value but without shuffle.
- if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
- V = SV->getOperand(0);
- } else {
+ if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
+ if (!E->ReuseShuffleIndices.empty()) {
// Reshuffle to get only unique values.
- SmallVector<int, 4> UniqueIdxs;
+ // If some of the scalars are duplicated in the vectorization tree
+ // entry, we do not vectorize them but instead generate a mask for
+ // the reuses. But if there are several users of the same entry,
+ // they may have different vectorization factors. This is especially
+ // important for PHI nodes. In this case, we need to adapt the
+ // resulting instruction for the user vectorization factor and have
+ // to reshuffle it again to take only unique elements of the vector.
+            // Without this code the function would incorrectly return a
+            // reduced vector with repeated elements, not the unique ones.
+
+ // block:
+ // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
+ // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1>
+ // ... (use %2)
+ // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2}
+ // br %block
+ SmallVector<int> UniqueIdxs;
SmallSet<int, 4> UsedIdxs;
- for (int Idx : E->ReuseShuffleIndices)
- if (UsedIdxs.insert(Idx).second)
- UniqueIdxs.emplace_back(Idx);
- V = Builder.CreateShuffleVector(V, UniqueIdxs);
+ int Pos = 0;
+ int Sz = VL.size();
+ for (int Idx : E->ReuseShuffleIndices) {
+ if (Idx != Sz && UsedIdxs.insert(Idx).second)
+ UniqueIdxs.emplace_back(Pos);
+ ++Pos;
+ }
+ assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
+ "less than original vector size.");
+ UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem);
+ V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle");
+ } else {
+ assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
+ "Expected vectorization factor less "
+ "than original vector size.");
+ SmallVector<int> UniformMask(VF, 0);
+ std::iota(UniformMask.begin(), UniformMask.end(), 0);
+ V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle");
}
}
return V;
}
- }
}
// Check that every instruction appears once in this bundle.
- SmallVector<int, 4> ReuseShuffleIndicies;
- SmallVector<Value *, 4> UniqueValues;
+ SmallVector<int> ReuseShuffleIndicies;
+ SmallVector<Value *> UniqueValues;
if (VL.size() > 2) {
DenseMap<Value *, unsigned> UniquePositions;
- for (Value *V : VL) {
+ unsigned NumValues =
+ std::distance(VL.begin(), find_if(reverse(VL), [](Value *V) {
+ return !isa<UndefValue>(V);
+ }).base());
+ VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues));
+ int UniqueVals = 0;
+ bool HasUndefs = false;
+ for (Value *V : VL.drop_back(VL.size() - VF)) {
+ if (isa<UndefValue>(V)) {
+ ReuseShuffleIndicies.emplace_back(UndefMaskElem);
+ HasUndefs = true;
+ continue;
+ }
+ if (isConstant(V)) {
+ ReuseShuffleIndicies.emplace_back(UniqueValues.size());
+ UniqueValues.emplace_back(V);
+ continue;
+ }
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
ReuseShuffleIndicies.emplace_back(Res.first->second);
- if (Res.second || isa<Constant>(V))
+ if (Res.second) {
UniqueValues.emplace_back(V);
+ ++UniqueVals;
+ }
}
- // Do not shuffle single element or if number of unique values is not power
- // of 2.
- if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
- !llvm::isPowerOf2_32(UniqueValues.size()))
+ if (HasUndefs && UniqueVals == 1 && UniqueValues.size() == 1) {
+      // Emit a pure splat vector.
+      // FIXME: why is it not identified as an identity?
+ unsigned NumUndefs = count(ReuseShuffleIndicies, UndefMaskElem);
+ if (NumUndefs == ReuseShuffleIndicies.size() - 1)
+ ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(),
+ UndefMaskElem);
+ else
+ ReuseShuffleIndicies.assign(VF, 0);
+ } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) {
ReuseShuffleIndicies.clear();
- else
- VL = UniqueValues;
+ UniqueValues.clear();
+ UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues));
+ }
+ UniqueValues.append(VF - UniqueValues.size(),
+ PoisonValue::get(VL[0]->getType()));
+ VL = UniqueValues;
}
+ ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
Value *Vec = gather(VL);
if (!ReuseShuffleIndicies.empty()) {
- Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle");
+ ShuffleBuilder.addMask(ReuseShuffleIndicies);
+ Vec = ShuffleBuilder.finalize(Vec);
if (auto *I = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
@@ -4347,11 +5107,28 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ unsigned VF = E->Scalars.size();
+ if (NeedToShuffleReuses)
+ VF = E->ReuseShuffleIndices.size();
+ ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
if (E->State == TreeEntry::NeedToGather) {
setInsertPointAfterBundle(E);
- Value *Vec = gather(E->Scalars);
+ Value *Vec;
+ SmallVector<int> Mask;
+ SmallVector<const TreeEntry *> Entries;
+ Optional<TargetTransformInfo::ShuffleKind> Shuffle =
+ isGatherShuffledEntry(E, Mask, Entries);
+ if (Shuffle.hasValue()) {
+ assert((Entries.size() == 1 || Entries.size() == 2) &&
+ "Expected shuffle of 1 or 2 entries.");
+ Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue,
+ Entries.back()->VectorizedValue, Mask);
+ } else {
+ Vec = gather(E->Scalars);
+ }
if (NeedToShuffleReuses) {
- Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle");
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ Vec = ShuffleBuilder.finalize(Vec);
if (auto *I = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
@@ -4370,6 +5147,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Type *ScalarTy = VL0->getType();
if (auto *Store = dyn_cast<StoreInst>(VL0))
ScalarTy = Store->getValueOperand()->getType();
+ else if (auto *IE = dyn_cast<InsertElementInst>(VL0))
+ ScalarTy = IE->getOperand(1)->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
switch (ShuffleOrOp) {
case Instruction::PHI: {
@@ -4409,18 +5188,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
case Instruction::ExtractElement: {
Value *V = E->getSingleOperand(0);
- if (!E->ReorderIndices.empty()) {
- SmallVector<int, 4> Mask;
- inversePermutation(E->ReorderIndices, Mask);
- Builder.SetInsertPoint(VL0);
- V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
- if (E->ReorderIndices.empty())
- Builder.SetInsertPoint(VL0);
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- }
+ Builder.SetInsertPoint(VL0);
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
return V;
}
@@ -4431,19 +5202,76 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
Value *NewV = propagateMetadata(V, E->Scalars);
- if (!E->ReorderIndices.empty()) {
- SmallVector<int, 4> Mask;
- inversePermutation(E->ReorderIndices, Mask);
- NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
- NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices,
- "shuffle");
- }
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ NewV = ShuffleBuilder.finalize(NewV);
E->VectorizedValue = NewV;
return NewV;
}
+ case Instruction::InsertElement: {
+ Builder.SetInsertPoint(VL0);
+ Value *V = vectorizeTree(E->getOperand(1));
+
+ const unsigned NumElts =
+ cast<FixedVectorType>(VL0->getType())->getNumElements();
+ const unsigned NumScalars = E->Scalars.size();
+
+ // Create InsertVector shuffle if necessary
+ Instruction *FirstInsert = nullptr;
+ bool IsIdentity = true;
+ unsigned Offset = UINT_MAX;
+ for (unsigned I = 0; I < NumScalars; ++I) {
+ Value *Scalar = E->Scalars[I];
+ if (!FirstInsert &&
+ !is_contained(E->Scalars, cast<Instruction>(Scalar)->getOperand(0)))
+ FirstInsert = cast<Instruction>(Scalar);
+ Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
+ if (!InsertIdx || *InsertIdx == UndefMaskElem)
+ continue;
+ unsigned Idx = *InsertIdx;
+ if (Idx < Offset) {
+ Offset = Idx;
+ IsIdentity &= I == 0;
+ } else {
+ assert(Idx >= Offset && "Failed to find vector index offset");
+ IsIdentity &= Idx - Offset == I;
+ }
+ }
+ assert(Offset < NumElts && "Failed to find vector index offset");
+
+ // Create shuffle to resize vector
+ SmallVector<int> Mask(NumElts, UndefMaskElem);
+ if (!IsIdentity) {
+ for (unsigned I = 0; I < NumScalars; ++I) {
+ Value *Scalar = E->Scalars[I];
+ Optional<int> InsertIdx = getInsertIndex(Scalar, 0);
+ if (!InsertIdx || *InsertIdx == UndefMaskElem)
+ continue;
+ Mask[*InsertIdx - Offset] = I;
+ }
+ } else {
+ std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
+ }
+ if (!IsIdentity || NumElts != NumScalars)
+ V = Builder.CreateShuffleVector(V, Mask);
+
+ if (NumElts != NumScalars) {
+ SmallVector<int> InsertMask(NumElts);
+ std::iota(InsertMask.begin(), InsertMask.end(), 0);
+ for (unsigned I = 0; I < NumElts; I++) {
+ if (Mask[I] != UndefMaskElem)
+ InsertMask[Offset + I] = NumElts + I;
+ }
+
+ V = Builder.CreateShuffleVector(
+ FirstInsert->getOperand(0), V, InsertMask,
+ cast<Instruction>(E->Scalars.back())->getName());
+ }
+
+ ++NumVectorInstructions;
+ E->VectorizedValue = V;
+ return V;
+ }
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -4467,8 +5295,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *CI = cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
- if (NeedToShuffleReuses)
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4489,8 +5317,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
- if (NeedToShuffleReuses)
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4509,8 +5337,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = Builder.CreateSelect(Cond, True, False);
- if (NeedToShuffleReuses)
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4532,8 +5360,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- if (NeedToShuffleReuses)
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4575,8 +5403,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- if (NeedToShuffleReuses)
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4614,19 +5442,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
for (Value *V : E->Scalars)
CommonAlignment =
commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
- NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment);
+ NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
}
Value *V = propagateMetadata(NewLI, E->Scalars);
- if (IsReorder) {
- SmallVector<int, 4> Mask;
- inversePermutation(E->ReorderIndices, Mask);
- V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- }
+ ShuffleBuilder.addInversedMask(E->ReorderIndices);
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -4640,11 +5462,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E);
Value *VecValue = vectorizeTree(E->getOperand(0));
- if (IsReorder) {
- SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
- E->ReorderIndices.end());
- VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf");
- }
+ ShuffleBuilder.addMask(E->ReorderIndices);
+ VecValue = ShuffleBuilder.finalize(VecValue);
+
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(
ScalarPtr, VecValue->getType()->getPointerTo(AS));
@@ -4658,8 +5478,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
Value *V = propagateMetadata(ST, E->Scalars);
- if (NeedToShuffleReuses)
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4697,8 +5515,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- if (NeedToShuffleReuses)
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4721,6 +5539,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *ScalarArg = nullptr;
std::vector<Value *> OpVecs;
+ SmallVector<Type *, 2> TysForDecl =
+ {FixedVectorType::get(CI->getType(), E->Scalars.size())};
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
@@ -4729,6 +5549,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
+ if (hasVectorInstrinsicOverloadedScalarOpd(IID, j))
+ TysForDecl.push_back(ScalarArg->getType());
continue;
}
@@ -4745,8 +5567,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
false /*HasGlobalPred*/);
CF = VFDatabase(*CI).getVectorizedFunction(Shape);
} else {
- Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
- CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
+ CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
}
SmallVector<OperandBundleDef, 1> OpBundles;
@@ -4760,8 +5581,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
propagateIRFlags(V, E->Scalars, VL0);
- if (NeedToShuffleReuses)
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4807,17 +5628,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// Also, gather up main and alt scalar ops to propagate IR flags to
// each vector operation.
ValueList OpScalars, AltScalars;
- unsigned e = E->Scalars.size();
- SmallVector<int, 8> Mask(e);
- for (unsigned i = 0; i < e; ++i) {
- auto *OpInst = cast<Instruction>(E->Scalars[i]);
+ unsigned Sz = E->Scalars.size();
+ SmallVector<int> Mask(Sz);
+ for (unsigned I = 0; I < Sz; ++I) {
+ auto *OpInst = cast<Instruction>(E->Scalars[I]);
assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
if (OpInst->getOpcode() == E->getAltOpcode()) {
- Mask[i] = e + i;
- AltScalars.push_back(E->Scalars[i]);
+ Mask[I] = Sz + I;
+ AltScalars.push_back(E->Scalars[I]);
} else {
- Mask[i] = i;
- OpScalars.push_back(E->Scalars[i]);
+ Mask[I] = I;
+ OpScalars.push_back(E->Scalars[I]);
}
}
@@ -4827,8 +5648,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- if (NeedToShuffleReuses)
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+ ShuffleBuilder.addMask(E->ReuseShuffleIndices);
+ V = ShuffleBuilder.finalize(V);
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4861,8 +5682,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
// sign extend the extracted values below.
auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
if (MinBWs.count(ScalarRoot)) {
- if (auto *I = dyn_cast<Instruction>(VectorRoot))
- Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
+ if (auto *I = dyn_cast<Instruction>(VectorRoot)) {
+      // If the current instruction is a PHI node, set the insertion point
+      // after the last PHI node of the block.
+ if (isa<PHINode>(I))
+ Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt());
+ else
+ Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
+ }
auto BundleWidth = VectorizableTree[0]->Scalars.size();
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
@@ -4873,16 +5700,6 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
<< " values .\n");
- // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
- // specified by ScalarType.
- auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
- if (!MinBWs.count(ScalarRoot))
- return Ex;
- if (MinBWs[ScalarRoot].second)
- return Builder.CreateSExt(Ex, ScalarType);
- return Builder.CreateZExt(Ex, ScalarType);
- };
-
// Extract all of the elements with the external uses.
for (const auto &ExternalUse : ExternalUses) {
Value *Scalar = ExternalUse.Scalar;
@@ -4901,6 +5718,29 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
assert(Vec && "Can't find vectorizable value");
Value *Lane = Builder.getInt32(ExternalUse.Lane);
+ auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
+ if (Scalar->getType() != Vec->getType()) {
+ Value *Ex;
+ // "Reuse" the existing extract to improve final codegen.
+ if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
+ Ex = Builder.CreateExtractElement(ES->getOperand(0),
+ ES->getOperand(1));
+ } else {
+ Ex = Builder.CreateExtractElement(Vec, Lane);
+ }
+ // If necessary, sign-extend or zero-extend ScalarRoot
+ // to the larger type.
+ if (!MinBWs.count(ScalarRoot))
+ return Ex;
+ if (MinBWs[ScalarRoot].second)
+ return Builder.CreateSExt(Ex, Scalar->getType());
+ return Builder.CreateZExt(Ex, Scalar->getType());
+ }
+ assert(isa<FixedVectorType>(Scalar->getType()) &&
+ isa<InsertElementInst>(Scalar) &&
+ "In-tree scalar of vector type is not insertelement?");
+ return Vec;
+ };
// If User == nullptr, the Scalar is used as extra arg. Generate
// ExtractElement instruction and update the record for this scalar in
// ExternallyUsedValues.
@@ -4914,14 +5754,16 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
}
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ Value *NewInst = ExtractAndExtendIfNeeded(Vec);
CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
- auto &Locs = ExternallyUsedValues[Scalar];
- ExternallyUsedValues.insert({Ex, Locs});
+ auto &NewInstLocs = ExternallyUsedValues[NewInst];
+ auto It = ExternallyUsedValues.find(Scalar);
+ assert(It != ExternallyUsedValues.end() &&
+ "Externally used scalar is not found in ExternallyUsedValues");
+ NewInstLocs.append(It->second);
ExternallyUsedValues.erase(Scalar);
// Required to update internally referenced instructions.
- Scalar->replaceAllUsesWith(Ex);
+ Scalar->replaceAllUsesWith(NewInst);
continue;
}
@@ -4939,25 +5781,22 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
} else {
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
}
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ Value *NewInst = ExtractAndExtendIfNeeded(Vec);
CSEBlocks.insert(PH->getIncomingBlock(i));
- PH->setOperand(i, Ex);
+ PH->setOperand(i, NewInst);
}
}
} else {
Builder.SetInsertPoint(cast<Instruction>(User));
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ Value *NewInst = ExtractAndExtendIfNeeded(Vec);
CSEBlocks.insert(cast<Instruction>(User)->getParent());
- User->replaceUsesOfWith(Scalar, Ex);
+ User->replaceUsesOfWith(Scalar, NewInst);
}
} else {
Builder.SetInsertPoint(&F->getEntryBlock().front());
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ Value *NewInst = ExtractAndExtendIfNeeded(Vec);
CSEBlocks.insert(&F->getEntryBlock());
- User->replaceUsesOfWith(Scalar, Ex);
+ User->replaceUsesOfWith(Scalar, NewInst);
}
LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
@@ -5043,10 +5882,11 @@ void BoUpSLP::optimizeGatherSequence() {
// Sort blocks by domination. This ensures we visit a block after all blocks
// dominating it are visited.
- llvm::stable_sort(CSEWorkList,
- [this](const DomTreeNode *A, const DomTreeNode *B) {
- return DT->properlyDominates(A, B);
- });
+ llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
+ assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ return A->getDFSNumIn() < B->getDFSNumIn();
+ });
// Perform O(N^2) search over the gather sequences and merge identical
// instructions. TODO: We can further optimize this scan if we split the
@@ -5091,7 +5931,7 @@ void BoUpSLP::optimizeGatherSequence() {
Optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
const InstructionsState &S) {
- if (isa<PHINode>(S.OpValue))
+ if (isa<PHINode>(S.OpValue) || isa<InsertElementInst>(S.OpValue))
return nullptr;
// Initialize the instruction bundle.
@@ -5101,11 +5941,53 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
bool ReSchedule = false;
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
+ auto &&TryScheduleBundle = [this, OldScheduleEnd, SLP](bool ReSchedule,
+ ScheduleData *Bundle) {
+ // The scheduling region got new instructions at the lower end (or it is a
+ // new region for the first bundle). This makes it necessary to
+ // recalculate all dependencies.
+ // It is seldom that this needs to be done a second time after adding the
+ // initial bundle to the region.
+ if (ScheduleEnd != OldScheduleEnd) {
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
+ doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
+ ReSchedule = true;
+ }
+ if (ReSchedule) {
+ resetSchedule();
+ initialFillReadyList(ReadyInsts);
+ }
+ if (Bundle) {
+ LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
+ << " in block " << BB->getName() << "\n");
+ calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
+ }
+
+ // Now try to schedule the new bundle or (if no bundle) just calculate
+ // dependencies. As soon as the bundle is "ready" it means that there are no
+    // cyclic dependencies and we can schedule it. Note that it's important
+    // that we don't "schedule" the bundle yet (see cancelScheduling).
+ while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
+ !ReadyInsts.empty()) {
+ ScheduleData *Picked = ReadyInsts.pop_back_val();
+ if (Picked->isSchedulingEntity() && Picked->isReady())
+ schedule(Picked, ReadyInsts);
+ }
+ };
+
// Make sure that the scheduling region contains all
// instructions of the bundle.
for (Value *V : VL) {
- if (!extendSchedulingRegion(V, S))
+ if (!extendSchedulingRegion(V, S)) {
+      // If the scheduling region got new instructions at the lower end (or it
+      // is a new region for the first bundle), all dependencies must be
+      // recalculated.
+      // Otherwise the compiler may crash trying to calculate the dependencies
+      // incorrectly and emit instructions in the wrong order at the actual
+      // scheduling.
+ TryScheduleBundle(/*ReSchedule=*/false, nullptr);
return None;
+ }
}
for (Value *V : VL) {
@@ -5134,42 +6016,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
BundleMember->FirstInBundle = Bundle;
PrevInBundle = BundleMember;
}
- if (ScheduleEnd != OldScheduleEnd) {
- // The scheduling region got new instructions at the lower end (or it is a
- // new region for the first bundle). This makes it necessary to
- // recalculate all dependencies.
- // It is seldom that this needs to be done a second time after adding the
- // initial bundle to the region.
- for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- doForAllOpcodes(I, [](ScheduleData *SD) {
- SD->clearDependencies();
- });
- }
- ReSchedule = true;
- }
- if (ReSchedule) {
- resetSchedule();
- initialFillReadyList(ReadyInsts);
- }
assert(Bundle && "Failed to find schedule bundle");
-
- LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
- << BB->getName() << "\n");
-
- calculateDependencies(Bundle, true, SLP);
-
- // Now try to schedule the new bundle. As soon as the bundle is "ready" it
- // means that there are no cyclic dependencies and we can schedule it.
- // Note that's important that we don't "schedule" the bundle yet (see
- // cancelScheduling).
- while (!Bundle->isReady() && !ReadyInsts.empty()) {
-
- ScheduleData *pickedSD = ReadyInsts.pop_back_val();
-
- if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
- schedule(pickedSD, ReadyInsts);
- }
- }
+ TryScheduleBundle(ReSchedule, Bundle);
if (!Bundle->isReady()) {
cancelScheduling(VL, S.OpValue);
return None;
@@ -5179,7 +6027,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
Value *OpValue) {
- if (isa<PHINode>(OpValue))
+ if (isa<PHINode>(OpValue) || isa<InsertElementInst>(OpValue))
return;
ScheduleData *Bundle = getScheduleData(OpValue);
@@ -5219,7 +6067,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
return true;
Instruction *I = dyn_cast<Instruction>(V);
assert(I && "bundle member must be an instruction");
- assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
+ assert(!isa<PHINode>(I) && !isa<InsertElementInst>(I) &&
+ "phi nodes/insertelements don't need to be scheduled");
auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
ScheduleData *ISD = getScheduleData(I);
if (!ISD)
@@ -5252,41 +6101,39 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
BasicBlock::reverse_iterator UpperEnd = BB->rend();
BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
BasicBlock::iterator LowerEnd = BB->end();
- while (true) {
+ while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
+ &*DownIter != I) {
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
return false;
}
- if (UpIter != UpperEnd) {
- if (&*UpIter == I) {
- initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
- ScheduleStart = I;
- if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
- LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
- << "\n");
- return true;
- }
- ++UpIter;
- }
- if (DownIter != LowerEnd) {
- if (&*DownIter == I) {
- initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
- nullptr);
- ScheduleEnd = I->getNextNode();
- if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
- assert(ScheduleEnd && "tried to vectorize a terminator?");
- LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
- << "\n");
- return true;
- }
- ++DownIter;
- }
- assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
- "instruction not found in block");
+ ++UpIter;
+ ++DownIter;
}
+ if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
+ assert(I->getParent() == ScheduleStart->getParent() &&
+ "Instruction is in wrong basic block.");
+ initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
+ ScheduleStart = I;
+ if (isOneOf(S, I) != I)
+ CheckSheduleForI(I);
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
+ << "\n");
+ return true;
+ }
+ assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
+ "Expected to reach top of the basic block or instruction down the "
+ "lower end.");
+ assert(I->getParent() == ScheduleEnd->getParent() &&
+ "Instruction is in wrong basic block.");
+ initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
+ nullptr);
+ ScheduleEnd = I->getNextNode();
+ if (isOneOf(S, I) != I)
+ CheckSheduleForI(I);
+ assert(ScheduleEnd && "tried to vectorize a terminator?");
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
return true;
}
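// A standalone sketch (not part of this diff) of the combined up/down scan
// above: walk outwards from both region boundaries at the same time and report
// whether the sought element lies above the region, below it, or was not found
// within the size limit. A vector of ints stands in for the instruction list.
#include <vector>

enum class RegionSide { Above, Below, NotFound };

static RegionSide locateOutsideRegion(const std::vector<int> &Block,
                                      int RegionStart, int RegionEnd,
                                      int Target, int SizeLimit) {
  int Up = RegionStart - 1, Down = RegionEnd, Steps = 0;
  while (Up >= 0 && Down < (int)Block.size() && Block[Up] != Target &&
         Block[Down] != Target) {
    if (++Steps > SizeLimit)
      return RegionSide::NotFound; // The region would grow past the limit.
    --Up;
    ++Down;
  }
  // Mirrors the original: reaching the lower end (or finding the target while
  // walking up) means the region start must be extended upwards; otherwise the
  // target is assumed to lie below the region and the end is extended.
  if (Down >= (int)Block.size() || (Up >= 0 && Block[Up] == Target))
    return RegionSide::Above;
  return RegionSide::Below;
}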
@@ -5491,8 +6338,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
- assert(SD->isPartOfBundle() ==
- (getTreeEntry(SD->Inst) != nullptr) &&
+ assert((isa<InsertElementInst>(SD->Inst) ||
+ SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) &&
"scheduler and vectorizer bundle mismatch");
SD->FirstInBundle->SchedulingPriority = Idx++;
if (SD->isSchedulingEntity()) {
@@ -5515,7 +6362,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
ScheduleData *BundleMember = picked;
while (BundleMember) {
Instruction *pickedInst = BundleMember->Inst;
- if (LastScheduledInst->getNextNode() != pickedInst) {
+ if (pickedInst->getNextNode() != LastScheduledInst) {
BS->BB->getInstList().remove(pickedInst);
BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
pickedInst);
@@ -5540,10 +6387,12 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
if (auto *Store = dyn_cast<StoreInst>(V)) {
if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
return DL->getTypeSizeInBits(Trunc->getSrcTy());
- else
- return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+ return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
}
+ if (auto *IEI = dyn_cast<InsertElementInst>(V))
+ return getVectorElementSize(IEI->getOperand(1));
+
auto E = InstrElementSize.find(V);
if (E != InstrElementSize.end())
return E->second;
@@ -5552,53 +6401,58 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
// that feed it. The type of the loaded value may indicate a more suitable
// width than V's type. We want to base the vector element size on the width
// of memory operations where possible.
- SmallVector<Instruction *, 16> Worklist;
+ SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
SmallPtrSet<Instruction *, 16> Visited;
if (auto *I = dyn_cast<Instruction>(V)) {
- Worklist.push_back(I);
+ Worklist.emplace_back(I, I->getParent());
Visited.insert(I);
}
// Traverse the expression tree in bottom-up order looking for loads. If we
// encounter an instruction we don't yet handle, we give up.
- auto MaxWidth = 0u;
- auto FoundUnknownInst = false;
- while (!Worklist.empty() && !FoundUnknownInst) {
- auto *I = Worklist.pop_back_val();
+ auto Width = 0u;
+ while (!Worklist.empty()) {
+ Instruction *I;
+ BasicBlock *Parent;
+ std::tie(I, Parent) = Worklist.pop_back_val();
// We should only be looking at scalar instructions here. If the current
- // instruction has a vector type, give up.
+ // instruction has a vector type, skip.
auto *Ty = I->getType();
if (isa<VectorType>(Ty))
- FoundUnknownInst = true;
+ continue;
// If the current instruction is a load, update MaxWidth to reflect the
// width of the loaded value.
- else if (isa<LoadInst>(I))
- MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
+ if (isa<LoadInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ExtractValueInst>(I))
+ Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
// Otherwise, we need to visit the operands of the instruction. We only
// handle the interesting cases from buildTree here. If an operand is an
- // instruction we haven't yet visited, we add it to the worklist.
+    // instruction we haven't yet visited, and either it is in the same basic
+    // block as the user or the user is a PHI node, we add it to the worklist.
else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
- isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+ isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I) ||
+ isa<UnaryOperator>(I)) {
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
- if (Visited.insert(J).second)
- Worklist.push_back(J);
+ if (Visited.insert(J).second &&
+ (isa<PHINode>(I) || J->getParent() == Parent))
+ Worklist.emplace_back(J, J->getParent());
+ } else {
+ break;
}
-
- // If we don't yet handle the instruction, give up.
- else
- FoundUnknownInst = true;
}
- int Width = MaxWidth;
// If we didn't encounter a memory access in the expression tree, or if we
// gave up for some reason, just return the width of V. Otherwise, return the
// maximum width we found.
- if (!MaxWidth || FoundUnknownInst)
+ if (!Width) {
+ if (auto *CI = dyn_cast<CmpInst>(V))
+ V = CI->getOperand(0);
Width = DL->getTypeSizeInBits(V->getType());
+ }
for (Instruction *I : Visited)
InstrElementSize[I] = Width;
@@ -5633,6 +6487,9 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
break;
case Instruction::ZExt:
case Instruction::SExt:
+ if (isa<ExtractElementInst>(I->getOperand(0)) ||
+ isa<InsertElementInst>(I->getOperand(0)))
+ return false;
break;
// We can demote certain binary operations if we can demote both of their
@@ -5884,8 +6741,6 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
- PA.preserve<AAManager>();
- PA.preserve<GlobalsAA>();
return PA;
}
@@ -5929,6 +6784,9 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
// delete instructions.
+ // Update DFS numbers now so that we can use them for ordering.
+ DT->updateDFSNumbers();
+
// Scan the blocks in the function in post order.
for (auto BB : post_order(&F.getEntryBlock())) {
collectSeedInstructions(BB);
@@ -5960,6 +6818,44 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
return Changed;
}
+/// Order may have elements assigned a special value (the vector size), which
+/// is out of bounds. Such indices only appear in places which correspond to
+/// undef values (see canReuseExtract for details) and are used to prevent
+/// undef values from affecting the ordering of the operands.
+/// The first loop below simply finds all unused indices and then the next
+/// loop nest assigns these indices to the positions of the undef values.
+/// In the example below Order has two undef positions, which get assigned the
+/// values 3 and 7 respectively:
+/// before: 6 9 5 4 9 2 1 0
+/// after: 6 3 5 4 7 2 1 0
+/// \returns Fixed ordering.
+static BoUpSLP::OrdersType fixupOrderingIndices(ArrayRef<unsigned> Order) {
+ BoUpSLP::OrdersType NewOrder(Order.begin(), Order.end());
+ const unsigned Sz = NewOrder.size();
+ SmallBitVector UsedIndices(Sz);
+ SmallVector<int> MaskedIndices;
+ for (int I = 0, E = NewOrder.size(); I < E; ++I) {
+ if (NewOrder[I] < Sz)
+ UsedIndices.set(NewOrder[I]);
+ else
+ MaskedIndices.push_back(I);
+ }
+ if (MaskedIndices.empty())
+ return NewOrder;
+ SmallVector<int> AvailableIndices(MaskedIndices.size());
+ unsigned Cnt = 0;
+ int Idx = UsedIndices.find_first();
+ do {
+ AvailableIndices[Cnt] = Idx;
+ Idx = UsedIndices.find_next(Idx);
+ ++Cnt;
+ } while (Idx > 0);
+ assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices.");
+ for (int I = 0, E = MaskedIndices.size(); I < E; ++I)
+ NewOrder[MaskedIndices[I]] = AvailableIndices[I];
+ return NewOrder;
+}
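// A standalone analogue (not part of this diff) of fixupOrderingIndices above,
// using std::vector and treating any index >= size() as the "undef" marker, so
// the doc-comment example can be reproduced directly. It assumes the in-bounds
// indices are unique, as the pass guarantees.
#include <vector>

static std::vector<unsigned> fixupOrder(std::vector<unsigned> Order) {
  const unsigned Sz = static_cast<unsigned>(Order.size());
  std::vector<bool> Used(Sz, false);
  std::vector<unsigned> MaskedPositions;
  for (unsigned I = 0; I != Sz; ++I) {
    if (Order[I] < Sz)
      Used[Order[I]] = true;
    else
      MaskedPositions.push_back(I);
  }
  // Hand out the indices that are still free, in increasing order.
  unsigned Next = 0;
  for (unsigned Pos : MaskedPositions) {
    while (Used[Next])
      ++Next;
    Order[Pos] = Next;
    Used[Next] = true;
  }
  return Order;
}
// Usage: fixupOrder({6, 9, 5, 4, 9, 2, 1, 0}) yields {6, 3, 5, 4, 7, 2, 1, 0}.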
+
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
unsigned Idx) {
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
@@ -5979,9 +6875,9 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
// TODO: Handle orders of size less than number of elements in the vector.
if (Order && Order->size() == Chain.size()) {
// TODO: reorder tree nodes without tree rebuilding.
- SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
- llvm::transform(*Order, ReorderedOps.begin(),
- [Chain](const unsigned Idx) { return Chain[Idx]; });
+ SmallVector<Value *, 4> ReorderedOps(Chain.size());
+ transform(fixupOrderingIndices(*Order), ReorderedOps.begin(),
+ [Chain](const unsigned Idx) { return Chain[Idx]; });
R.buildTree(ReorderedOps);
}
if (R.isTreeTinyAndNotFullyVectorizable())
@@ -6021,20 +6917,42 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
int E = Stores.size();
SmallBitVector Tails(E, false);
- SmallVector<int, 16> ConsecutiveChain(E, E + 1);
int MaxIter = MaxStoreLookup.getValue();
+ SmallVector<std::pair<int, int>, 16> ConsecutiveChain(
+ E, std::make_pair(E, INT_MAX));
+ SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false));
int IterCnt;
auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
+ &CheckedPairs,
&ConsecutiveChain](int K, int Idx) {
if (IterCnt >= MaxIter)
return true;
+ if (CheckedPairs[Idx].test(K))
+ return ConsecutiveChain[K].second == 1 &&
+ ConsecutiveChain[K].first == Idx;
++IterCnt;
- if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
+ CheckedPairs[Idx].set(K);
+ CheckedPairs[K].set(Idx);
+ Optional<int> Diff = getPointersDiff(
+ Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(),
+ Stores[Idx]->getValueOperand()->getType(),
+ Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true);
+ if (!Diff || *Diff == 0)
+ return false;
+ int Val = *Diff;
+ if (Val < 0) {
+ if (ConsecutiveChain[Idx].second > -Val) {
+ Tails.set(K);
+ ConsecutiveChain[Idx] = std::make_pair(K, -Val);
+ }
+ return false;
+ }
+ if (ConsecutiveChain[K].second <= Val)
return false;
Tails.set(Idx);
- ConsecutiveChain[K] = Idx;
- return true;
+ ConsecutiveChain[K] = std::make_pair(Idx, Val);
+ return Val == 1;
};
// Do a quadratic search on all of the given stores in reverse order and find
// all of the pairs of stores that follow each other.
@@ -6051,32 +6969,51 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
break;
}
+  // Tracks whether we have already tried to vectorize stores starting from
+  // the given tail.
+ SmallBitVector TriedTails(E, false);
// For stores that start but don't end a link in the chain:
for (int Cnt = E; Cnt > 0; --Cnt) {
int I = Cnt - 1;
- if (ConsecutiveChain[I] == E + 1 || Tails.test(I))
+ if (ConsecutiveChain[I].first == E || Tails.test(I))
continue;
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
BoUpSLP::ValueList Operands;
// Collect the chain into a list.
- while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
+ while (I != E && !VectorizedStores.count(Stores[I])) {
Operands.push_back(Stores[I]);
+ Tails.set(I);
+ if (ConsecutiveChain[I].second != 1) {
+ // Mark the new end in the chain and go back, if required. It might be
+ // required if the original stores come in reversed order, for example.
+ if (ConsecutiveChain[I].first != E &&
+ Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) &&
+ !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) {
+ TriedTails.set(I);
+ Tails.reset(ConsecutiveChain[I].first);
+ if (Cnt < ConsecutiveChain[I].first + 2)
+ Cnt = ConsecutiveChain[I].first + 2;
+ }
+ break;
+ }
// Move to the next value in the chain.
- I = ConsecutiveChain[I];
+ I = ConsecutiveChain[I].first;
}
+ assert(!Operands.empty() && "Expected non-empty list of stores.");
- // If a vector register can't hold 1 element, we are done.
unsigned MaxVecRegSize = R.getMaxVecRegSize();
unsigned EltSize = R.getVectorElementSize(Operands[0]);
- if (MaxVecRegSize % EltSize != 0)
- continue;
+ unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize);
+
+ unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize);
+ unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store),
+ MaxElts);
- unsigned MaxElts = MaxVecRegSize / EltSize;
// FIXME: Is division-by-2 the correct step? Should we assert that the
// register size is a power-of-2?
unsigned StartIdx = 0;
- for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) {
+ for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) {
for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
if (!VectorizedStores.count(Slice.front()) &&
@@ -6146,8 +7083,7 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- bool AllowReorder,
- ArrayRef<Value *> InsertUses) {
+ bool AllowReorder) {
if (VL.size() < 2)
return false;
@@ -6165,7 +7101,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// determining vectorization factor for scalar instructions.
for (Value *V : VL) {
Type *Ty = V->getType();
- if (!isValidElementType(Ty)) {
+ if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
// NOTE: the following will give user internal llvm type name, which may
// not be useful.
R.getORE()->emit([&]() {
@@ -6196,20 +7132,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
bool Changed = false;
bool CandidateFound = false;
InstructionCost MinCost = SLPCostThreshold.getValue();
-
- bool CompensateUseCost =
- !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
- return V && isa<InsertElementInst>(V);
- });
- assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
- "Each scalar expected to have an associated InsertElement user.");
+ Type *ScalarTy = VL[0]->getType();
+ if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
+ ScalarTy = IE->getOperand(1)->getType();
unsigned NextInst = 0, MaxInst = VL.size();
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
// No actual vectorization should happen, if number of parts is the same as
// provided vectorization factor (i.e. the scalar type is used for vector
// code during codegen).
- auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
+ auto *VecTy = FixedVectorType::get(ScalarTy, VF);
if (TTI->getNumberOfParts(VecTy) == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
@@ -6220,7 +7152,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
else
OpsWidth = VF;
- if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+ if (!isPowerOf2_32(OpsWidth))
+ continue;
+
+ if ((VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2))
break;
ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
@@ -6235,17 +7170,15 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
<< "\n");
R.buildTree(Ops);
- Optional<ArrayRef<unsigned>> Order = R.bestOrder();
- // TODO: check if we can allow reordering for more cases.
- if (AllowReorder && Order) {
- // TODO: reorder tree nodes without tree rebuilding.
- // Conceptually, there is nothing actually preventing us from trying to
- // reorder a larger list. In fact, we do exactly this when vectorizing
- // reductions. However, at this point, we only expect to get here when
- // there are exactly two operations.
- assert(Ops.size() == 2);
- Value *ReorderedOps[] = {Ops[1], Ops[0]};
- R.buildTree(ReorderedOps, None);
+ if (AllowReorder) {
+ Optional<ArrayRef<unsigned>> Order = R.bestOrder();
+ if (Order) {
+ // TODO: reorder tree nodes without tree rebuilding.
+ SmallVector<Value *, 4> ReorderedOps(Ops.size());
+ transform(fixupOrderingIndices(*Order), ReorderedOps.begin(),
+ [Ops](const unsigned Idx) { return Ops[Idx]; });
+ R.buildTree(ReorderedOps);
+ }
}
if (R.isTreeTinyAndNotFullyVectorizable())
continue;
@@ -6253,46 +7186,6 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
R.computeMinimumValueSizes();
InstructionCost Cost = R.getTreeCost();
CandidateFound = true;
- if (CompensateUseCost) {
- // TODO: Use TTI's getScalarizationOverhead for sequence of inserts
- // rather than sum of single inserts as the latter may overestimate
- // cost. This work should imply improving cost estimation for extracts
- // that added in for external (for vectorization tree) users,i.e. that
- // part should also switch to same interface.
- // For example, the following case is projected code after SLP:
- // %4 = extractelement <4 x i64> %3, i32 0
- // %v0 = insertelement <4 x i64> poison, i64 %4, i32 0
- // %5 = extractelement <4 x i64> %3, i32 1
- // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
- // %6 = extractelement <4 x i64> %3, i32 2
- // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
- // %7 = extractelement <4 x i64> %3, i32 3
- // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
- //
- // Extracts here added by SLP in order to feed users (the inserts) of
- // original scalars and contribute to "ExtractCost" at cost evaluation.
- // The inserts in turn form sequence to build an aggregate that
- // detected by findBuildAggregate routine.
- // SLP makes an assumption that such sequence will be optimized away
- // later (instcombine) so it tries to compensate ExctractCost with
- // cost of insert sequence.
- // Current per element cost calculation approach is not quite accurate
- // and tends to create bias toward favoring vectorization.
- // Switching to the TTI interface might help a bit.
- // Alternative solution could be pattern-match to detect a no-op or
- // shuffle.
- InstructionCost UserCost = 0;
- for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
- auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
- if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
- UserCost += TTI->getVectorInstrCost(
- Instruction::InsertElement, IE->getType(), CI->getZExtValue());
- }
- LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
- << ".\n");
- Cost -= UserCost;
- }
-
MinCost = std::min(MinCost, Cost);
if (Cost < -SLPCostThreshold) {
@@ -6411,11 +7304,29 @@ class HorizontalReduction {
/// The type of reduction operation.
RecurKind RdxKind;
+ const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max();
+
+ static bool isCmpSelMinMax(Instruction *I) {
+ return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
+ RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
+ }
+
+ // And/or are potentially poison-safe logical patterns like:
+ // select x, y, false
+ // select x, true, y
+ static bool isBoolLogicOp(Instruction *I) {
+ return match(I, m_LogicalAnd(m_Value(), m_Value())) ||
+ match(I, m_LogicalOr(m_Value(), m_Value()));
+ }
+
/// Checks if instruction is associative and can be vectorized.
static bool isVectorizable(RecurKind Kind, Instruction *I) {
if (Kind == RecurKind::None)
return false;
- if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
+
+ // Integer ops that map to select instructions or intrinsics are fine.
+ if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
+ isBoolLogicOp(I))
return true;
if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
@@ -6428,6 +7339,16 @@ class HorizontalReduction {
return I->isAssociative();
}
+ static Value *getRdxOperand(Instruction *I, unsigned Index) {
+ // Poison-safe 'or' takes the form: select X, true, Y
+ // To make that work with the normal operand processing, we skip the
+ // true value operand.
+ // TODO: Change the code and data structures to handle this without a hack.
+ if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
+ return I->getOperand(2);
+ return I->getOperand(Index);
+ }
+
/// Checks if the ParentStackElem.first should be marked as a reduction
/// operation with an extra argument or as extra argument itself.
void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
@@ -6440,8 +7361,7 @@ class HorizontalReduction {
// in this case.
// Do not perform analysis of remaining operands of ParentStackElem.first
// instruction, this whole instruction is an extra argument.
- RecurKind ParentRdxKind = getRdxKind(ParentStackElem.first);
- ParentStackElem.second = getNumberOfOperands(ParentRdxKind);
+ ParentStackElem.second = INVALID_OPERAND_INDEX;
} else {
// We ran into something like:
// ParentStackElem.first += ... + ExtraArg + ...
@@ -6451,7 +7371,7 @@ class HorizontalReduction {
/// Creates reduction operation with the current opcode.
static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
- Value *RHS, const Twine &Name) {
+ Value *RHS, const Twine &Name, bool UseSelect) {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
switch (Kind) {
case RecurKind::Add:
@@ -6467,23 +7387,30 @@ class HorizontalReduction {
return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
case RecurKind::FMin:
return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
-
- case RecurKind::SMax: {
- Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- }
- case RecurKind::SMin: {
- Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- }
- case RecurKind::UMax: {
- Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- }
- case RecurKind::UMin: {
- Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- }
+ case RecurKind::SMax:
+ if (UseSelect) {
+ Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ }
+ return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
+ case RecurKind::SMin:
+ if (UseSelect) {
+ Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ }
+ return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
+ case RecurKind::UMax:
+ if (UseSelect) {
+ Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ }
+ return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
+ case RecurKind::UMin:
+ if (UseSelect) {
+ Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ }
+ return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
default:
llvm_unreachable("Unknown reduction operation.");
}
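+    // Illustrative note (editorial sketch, not part of this patch): for
+    // RecurKind::SMax, the UseSelect path emits the cmp+select idiom
+    //   %cmp = icmp sgt i32 %lhs, %rhs
+    //   %max = select i1 %cmp, i32 %lhs, i32 %rhs
+    // while the intrinsic path emits
+    //   %max = call i32 @llvm.smax.i32(i32 %lhs, i32 %rhs)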
@@ -6494,26 +7421,30 @@ class HorizontalReduction {
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
Value *RHS, const Twine &Name,
const ReductionOpsListType &ReductionOps) {
- Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name);
+ bool UseSelect = ReductionOps.size() == 2;
+ assert((!UseSelect || isa<SelectInst>(ReductionOps[1][0])) &&
+ "Expected cmp + select pairs for reduction");
+ Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
- if (auto *Sel = dyn_cast<SelectInst>(Op))
+ if (auto *Sel = dyn_cast<SelectInst>(Op)) {
propagateIRFlags(Sel->getCondition(), ReductionOps[0]);
- propagateIRFlags(Op, ReductionOps[1]);
- return Op;
+ propagateIRFlags(Op, ReductionOps[1]);
+ return Op;
+ }
}
propagateIRFlags(Op, ReductionOps[0]);
return Op;
}
+
/// Creates reduction operation with the current opcode with the IR flags
/// from \p I.
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
Value *RHS, const Twine &Name, Instruction *I) {
- Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name);
- if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
- if (auto *Sel = dyn_cast<SelectInst>(Op)) {
- propagateIRFlags(Sel->getCondition(),
- cast<SelectInst>(I)->getCondition());
- }
+ auto *SelI = dyn_cast<SelectInst>(I);
+ Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr);
+ if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
+ if (auto *Sel = dyn_cast<SelectInst>(Op))
+ propagateIRFlags(Sel->getCondition(), SelI->getCondition());
}
propagateIRFlags(Op, I);
return Op;
@@ -6526,9 +7457,11 @@ class HorizontalReduction {
return RecurKind::Add;
if (match(I, m_Mul(m_Value(), m_Value())))
return RecurKind::Mul;
- if (match(I, m_And(m_Value(), m_Value())))
+ if (match(I, m_And(m_Value(), m_Value())) ||
+ match(I, m_LogicalAnd(m_Value(), m_Value())))
return RecurKind::And;
- if (match(I, m_Or(m_Value(), m_Value())))
+ if (match(I, m_Or(m_Value(), m_Value())) ||
+ match(I, m_LogicalOr(m_Value(), m_Value())))
return RecurKind::Or;
if (match(I, m_Xor(m_Value(), m_Value())))
return RecurKind::Xor;
@@ -6542,6 +7475,10 @@ class HorizontalReduction {
if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
return RecurKind::FMin;
+ // This matches either cmp+select or intrinsics. SLP is expected to handle
+ // either form.
+ // TODO: If we are canonicalizing to intrinsics, we can remove several
+ // special-case paths that deal with selects.
if (match(I, m_SMax(m_Value(), m_Value())))
return RecurKind::SMax;
if (match(I, m_SMin(m_Value(), m_Value())))
@@ -6610,61 +7547,52 @@ class HorizontalReduction {
return RecurKind::None;
}
- /// Return true if this operation is a cmp+select idiom.
- static bool isCmpSel(RecurKind Kind) {
- return RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind);
- }
-
/// Get the index of the first operand.
- static unsigned getFirstOperandIndex(RecurKind Kind) {
- // We allow calling this before 'Kind' is set, so handle that specially.
- if (Kind == RecurKind::None)
- return 0;
- return isCmpSel(Kind) ? 1 : 0;
+ static unsigned getFirstOperandIndex(Instruction *I) {
+ return isCmpSelMinMax(I) ? 1 : 0;
}
/// Total number of operands in the reduction operation.
- static unsigned getNumberOfOperands(RecurKind Kind) {
- return isCmpSel(Kind) ? 3 : 2;
+ static unsigned getNumberOfOperands(Instruction *I) {
+ return isCmpSelMinMax(I) ? 3 : 2;
}
/// Checks if the instruction is in basic block \p BB.
- /// For a min/max reduction check that both compare and select are in \p BB.
- static bool hasSameParent(RecurKind Kind, Instruction *I, BasicBlock *BB,
- bool IsRedOp) {
- if (IsRedOp && isCmpSel(Kind)) {
- auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
- return I->getParent() == BB && Cmp && Cmp->getParent() == BB;
+ /// For a cmp+sel min/max reduction check that both ops are in \p BB.
+ static bool hasSameParent(Instruction *I, BasicBlock *BB) {
+ if (isCmpSelMinMax(I)) {
+ auto *Sel = cast<SelectInst>(I);
+ auto *Cmp = cast<Instruction>(Sel->getCondition());
+ return Sel->getParent() == BB && Cmp->getParent() == BB;
}
return I->getParent() == BB;
}
/// Expected number of uses for reduction operations/reduced values.
- static bool hasRequiredNumberOfUses(RecurKind Kind, Instruction *I,
- bool IsReductionOp) {
- // SelectInst must be used twice while the condition op must have single
- // use only.
- if (isCmpSel(Kind))
- return I->hasNUses(2) &&
- (!IsReductionOp ||
- cast<SelectInst>(I)->getCondition()->hasOneUse());
+ static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
+ if (IsCmpSelMinMax) {
+      // SelectInst must be used twice while the condition op must have a
+      // single use only.
+ if (auto *Sel = dyn_cast<SelectInst>(I))
+ return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
+ return I->hasNUses(2);
+ }
// Arithmetic reduction operation must be used once only.
return I->hasOneUse();
}
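+  // Illustrative note (editorial sketch, not part of this patch): in a
+  // cmp+select min/max chain such as
+  //   %c1 = icmp sgt i32 %a, %b
+  //   %m1 = select i1 %c1, i32 %a, i32 %b
+  //   %c2 = icmp sgt i32 %m1, %x
+  //   %m2 = select i1 %c2, i32 %m1, i32 %x
+  // each inner select (%m1) is used by both the compare and the select of
+  // the next step, which is why exactly two uses are required.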
/// Initializes the list of reduction operations.
- void initReductionOps(RecurKind Kind) {
- if (isCmpSel(Kind))
+ void initReductionOps(Instruction *I) {
+ if (isCmpSelMinMax(I))
ReductionOps.assign(2, ReductionOpsType());
else
ReductionOps.assign(1, ReductionOpsType());
}
/// Add all reduction operations for the reduction instruction \p I.
- void addReductionOps(RecurKind Kind, Instruction *I) {
- assert(Kind != RecurKind::None && "Expected reduction operation.");
- if (isCmpSel(Kind)) {
+ void addReductionOps(Instruction *I) {
+ if (isCmpSelMinMax(I)) {
ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
ReductionOps[1].emplace_back(I);
} else {
@@ -6675,53 +7603,61 @@ class HorizontalReduction {
static Value *getLHS(RecurKind Kind, Instruction *I) {
if (Kind == RecurKind::None)
return nullptr;
- return I->getOperand(getFirstOperandIndex(Kind));
+ return I->getOperand(getFirstOperandIndex(I));
}
static Value *getRHS(RecurKind Kind, Instruction *I) {
if (Kind == RecurKind::None)
return nullptr;
- return I->getOperand(getFirstOperandIndex(Kind) + 1);
+ return I->getOperand(getFirstOperandIndex(I) + 1);
}
public:
HorizontalReduction() = default;
/// Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
- assert((!Phi || is_contained(Phi->operands(), B)) &&
+ bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) {
+ assert((!Phi || is_contained(Phi->operands(), Inst)) &&
"Phi needs to use the binary operator");
-
- RdxKind = getRdxKind(B);
+ assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) ||
+ isa<IntrinsicInst>(Inst)) &&
+ "Expected binop, select, or intrinsic for reduction matching");
+ RdxKind = getRdxKind(Inst);
 // We could have an initial reduction that is not an add.
// r *= v1 + v2 + v3 + v4
// In such a case start looking for a tree rooted in the first '+'.
if (Phi) {
- if (getLHS(RdxKind, B) == Phi) {
+ if (getLHS(RdxKind, Inst) == Phi) {
Phi = nullptr;
- B = dyn_cast<Instruction>(getRHS(RdxKind, B));
- if (!B)
+ Inst = dyn_cast<Instruction>(getRHS(RdxKind, Inst));
+ if (!Inst)
return false;
- RdxKind = getRdxKind(B);
- } else if (getRHS(RdxKind, B) == Phi) {
+ RdxKind = getRdxKind(Inst);
+ } else if (getRHS(RdxKind, Inst) == Phi) {
Phi = nullptr;
- B = dyn_cast<Instruction>(getLHS(RdxKind, B));
- if (!B)
+ Inst = dyn_cast<Instruction>(getLHS(RdxKind, Inst));
+ if (!Inst)
return false;
- RdxKind = getRdxKind(B);
+ RdxKind = getRdxKind(Inst);
}
}
- if (!isVectorizable(RdxKind, B))
+ if (!isVectorizable(RdxKind, Inst))
return false;
// Analyze "regular" integer/FP types for reductions - no target-specific
// types or pointers.
- Type *Ty = B->getType();
+ Type *Ty = Inst->getType();
if (!isValidElementType(Ty) || Ty->isPointerTy())
return false;
- ReductionRoot = B;
+    // Though the ultimate reduction may have multiple uses, its condition
+    // must have only a single use.
+ if (auto *Sel = dyn_cast<SelectInst>(Inst))
+ if (!Sel->getCondition()->hasOneUse())
+ return false;
+
+ ReductionRoot = Inst;
// The opcode for leaf values that we perform a reduction on.
// For example: load(x) + load(y) + load(z) + fptoui(w)
@@ -6729,11 +7665,11 @@ public:
// potential candidate for the reduction.
unsigned LeafOpcode = 0;
- // Post order traverse the reduction tree starting at B. We only handle true
- // trees containing only binary operators.
+ // Post-order traverse the reduction tree starting at Inst. We only handle
+ // true trees containing binary operators or selects.
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
- Stack.push_back(std::make_pair(B, getFirstOperandIndex(RdxKind)));
- initReductionOps(RdxKind);
+ Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst)));
+ initReductionOps(Inst);
while (!Stack.empty()) {
Instruction *TreeN = Stack.back().first;
unsigned EdgeToVisit = Stack.back().second++;
@@ -6741,12 +7677,12 @@ public:
bool IsReducedValue = TreeRdxKind != RdxKind;
// Postorder visit.
- if (IsReducedValue || EdgeToVisit == getNumberOfOperands(TreeRdxKind)) {
+ if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) {
if (IsReducedValue)
ReducedVals.push_back(TreeN);
else {
- auto I = ExtraArgs.find(TreeN);
- if (I != ExtraArgs.end() && !I->second) {
+ auto ExtraArgsIter = ExtraArgs.find(TreeN);
+ if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) {
// Check if TreeN is an extra argument of its parent operation.
if (Stack.size() <= 1) {
// TreeN can't be an extra argument as it is a root reduction
@@ -6759,23 +7695,23 @@ public:
markExtraArg(Stack[Stack.size() - 2], TreeN);
ExtraArgs.erase(TreeN);
} else
- addReductionOps(RdxKind, TreeN);
+ addReductionOps(TreeN);
}
// Retract.
Stack.pop_back();
continue;
}
- // Visit left or right.
- Value *EdgeVal = TreeN->getOperand(EdgeToVisit);
- auto *I = dyn_cast<Instruction>(EdgeVal);
- if (!I) {
+ // Visit operands.
+ Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit);
+ auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
+ if (!EdgeInst) {
// Edge value is not a reduction instruction or a leaf instruction.
// (It may be a constant, function argument, or something else.)
markExtraArg(Stack.back(), EdgeVal);
continue;
}
- RecurKind EdgeRdxKind = getRdxKind(I);
+ RecurKind EdgeRdxKind = getRdxKind(EdgeInst);
// Continue analysis if the next operand is a reduction operation or
// (possibly) a leaf value. If the leaf value opcode is not set,
// the first met operation != reduction operation is considered as the
@@ -6784,25 +7720,26 @@ public:
// Each tree node needs to have minimal number of users except for the
// ultimate reduction.
const bool IsRdxInst = EdgeRdxKind == RdxKind;
- if (I != Phi && I != B &&
- hasSameParent(RdxKind, I, B->getParent(), IsRdxInst) &&
- hasRequiredNumberOfUses(RdxKind, I, IsRdxInst) &&
- (!LeafOpcode || LeafOpcode == I->getOpcode() || IsRdxInst)) {
+ if (EdgeInst != Phi && EdgeInst != Inst &&
+ hasSameParent(EdgeInst, Inst->getParent()) &&
+ hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) &&
+ (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) {
if (IsRdxInst) {
// We need to be able to reassociate the reduction operations.
- if (!isVectorizable(EdgeRdxKind, I)) {
+ if (!isVectorizable(EdgeRdxKind, EdgeInst)) {
// I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), I);
+ markExtraArg(Stack.back(), EdgeInst);
continue;
}
} else if (!LeafOpcode) {
- LeafOpcode = I->getOpcode();
+ LeafOpcode = EdgeInst->getOpcode();
}
- Stack.push_back(std::make_pair(I, getFirstOperandIndex(EdgeRdxKind)));
+ Stack.push_back(
+ std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst)));
continue;
}
// I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), I);
+ markExtraArg(Stack.back(), EdgeInst);
}
return true;
}
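+  // Illustrative note (editorial sketch, not part of this patch): assuming
+  // %x0..%x3 are single-use leaf instructions (e.g. loads) in the same
+  // block, a chain like
+  //   %r0 = add i32 %x0, %x1
+  //   %r1 = add i32 %r0, %x2
+  //   %r2 = add i32 %r1, %x3   ; ReductionRoot
+  // is matched here, with %x0..%x3 collected as the reduced values.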
@@ -6896,8 +7833,8 @@ public:
"instructions.");
// TODO: reorder tree nodes without tree rebuilding.
SmallVector<Value *, 4> ReorderedOps(VL.size());
- llvm::transform(*Order, ReorderedOps.begin(),
- [VL](const unsigned Idx) { return VL[Idx]; });
+ transform(fixupOrderingIndices(*Order), ReorderedOps.begin(),
+ [VL](const unsigned Idx) { return VL[Idx]; });
V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
}
if (V.isTreeTinyAndNotFullyVectorizable())
@@ -6905,12 +7842,21 @@ public:
if (V.isLoadCombineReductionCandidate(RdxKind))
break;
+ // For a poison-safe boolean logic reduction, do not replace select
+ // instructions with logic ops. All reduced values will be frozen (see
+ // below) to prevent leaking poison.
+ if (isa<SelectInst>(ReductionRoot) &&
+ isBoolLogicOp(cast<Instruction>(ReductionRoot)) &&
+ NumReducedVals != ReduxWidth)
+ break;
+
V.computeMinimumValueSizes();
// Estimate cost.
- InstructionCost TreeCost = V.getTreeCost();
+ InstructionCost TreeCost =
+ V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth));
InstructionCost ReductionCost =
- getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF);
InstructionCost Cost = TreeCost + ReductionCost;
if (!Cost.isValid()) {
LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
@@ -6945,11 +7891,16 @@ public:
// Emit a reduction. If the root is a select (min/max idiom), the insert
// point is the compare condition of that select.
Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
- if (isCmpSel(RdxKind))
+ if (isCmpSelMinMax(RdxRootInst))
Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
else
Builder.SetInsertPoint(RdxRootInst);
+ // To prevent poison from leaking across what used to be sequential, safe,
+ // scalar boolean logic operations, the reduction operand must be frozen.
+ if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
+ VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
+
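+      // Illustrative note (editorial sketch, not part of this patch): a
+      // scalar chain like
+      //   %s = select i1 %a, i1 %b, i1 false
+      // never propagates poison from %b when %a is false, but the
+      // vectorized reduction evaluates every lane with plain and/or, so its
+      // input is frozen first to keep poison from leaking.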
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
@@ -6983,17 +7934,6 @@ public:
}
}
- // Update users. For a min/max reduction that ends with a compare and
- // select, we also have to RAUW for the compare instruction feeding the
- // reduction root. That's because the original compare may have extra uses
- // besides the final select of the reduction.
- if (isCmpSel(RdxKind)) {
- if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) {
- Instruction *ScalarCmp =
- getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot));
- ScalarCmp->replaceAllUsesWith(VecSelect->getCondition());
- }
- }
ReductionRoot->replaceAllUsesWith(VectorizedTree);
// Mark all scalar reduction ops for deletion, they are replaced by the
@@ -7008,8 +7948,8 @@ public:
private:
/// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
- Value *FirstReducedVal,
- unsigned ReduxWidth) {
+ Value *FirstReducedVal, unsigned ReduxWidth,
+ FastMathFlags FMF) {
Type *ScalarTy = FirstReducedVal->getType();
FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
InstructionCost VectorCost, ScalarCost;
@@ -7022,17 +7962,15 @@ private:
case RecurKind::FAdd:
case RecurKind::FMul: {
unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
- VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
- /*IsPairwiseForm=*/false);
+ VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF);
ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
break;
}
case RecurKind::FMax:
case RecurKind::FMin: {
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
- VectorCost =
- TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- /*pairwise=*/false, /*unsigned=*/false);
+ VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+ /*unsigned=*/false);
ScalarCost =
TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
@@ -7046,9 +7984,7 @@ private:
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
bool IsUnsigned =
RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
- VectorCost =
- TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- /*IsPairwiseForm=*/false, IsUnsigned);
+ VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned);
ScalarCost =
TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
@@ -7109,36 +8045,6 @@ static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
} while (true);
}
-static Optional<unsigned> getOperandIndex(Instruction *InsertInst,
- unsigned OperandOffset) {
- unsigned OperandIndex = OperandOffset;
- if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
- if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
- auto *VT = cast<FixedVectorType>(IE->getType());
- OperandIndex *= VT->getNumElements();
- OperandIndex += CI->getZExtValue();
- return OperandIndex;
- }
- return None;
- }
-
- auto *IV = cast<InsertValueInst>(InsertInst);
- Type *CurrentType = IV->getType();
- for (unsigned int Index : IV->indices()) {
- if (auto *ST = dyn_cast<StructType>(CurrentType)) {
- OperandIndex *= ST->getNumElements();
- CurrentType = ST->getElementType(Index);
- } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
- OperandIndex *= AT->getNumElements();
- CurrentType = AT->getElementType();
- } else {
- return None;
- }
- OperandIndex += Index;
- }
- return OperandIndex;
-}
-
static bool findBuildAggregate_rec(Instruction *LastInsertInst,
TargetTransformInfo *TTI,
SmallVectorImpl<Value *> &BuildVectorOpds,
@@ -7146,8 +8052,7 @@ static bool findBuildAggregate_rec(Instruction *LastInsertInst,
unsigned OperandOffset) {
do {
Value *InsertedOperand = LastInsertInst->getOperand(1);
- Optional<unsigned> OperandIndex =
- getOperandIndex(LastInsertInst, OperandOffset);
+ Optional<int> OperandIndex = getInsertIndex(LastInsertInst, OperandOffset);
if (!OperandIndex)
return false;
if (isa<InsertElementInst>(InsertedOperand) ||
@@ -7159,14 +8064,12 @@ static bool findBuildAggregate_rec(Instruction *LastInsertInst,
BuildVectorOpds[*OperandIndex] = InsertedOperand;
InsertElts[*OperandIndex] = LastInsertInst;
}
- if (isa<UndefValue>(LastInsertInst->getOperand(0)))
- return true;
LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
} while (LastInsertInst != nullptr &&
(isa<InsertValueInst>(LastInsertInst) ||
isa<InsertElementInst>(LastInsertInst)) &&
LastInsertInst->hasOneUse());
- return false;
+ return true;
}
/// Recognize construction of vectors like
@@ -7212,10 +8115,6 @@ static bool findBuildAggregate(Instruction *LastInsertInst,
return false;
}
-static bool PhiTypeSorterFunc(Value *V, Value *V2) {
- return V->getType() < V2->getType();
-}
-
/// Try and get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
@@ -7274,6 +8173,14 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
return true;
if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
return true;
+ if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
+ return true;
+ if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
+ return true;
+ if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
+ return true;
+ if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
+ return true;
return false;
}
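+  // Illustrative note (editorial sketch, not part of this patch): with the
+  // added cases, an intrinsic call such as
+  //   %m = call i32 @llvm.smax.i32(i32 %a, i32 %b)
+  // is now recognized as a potential reduction operation, matching the
+  // select-free canonical form of integer min/max.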
@@ -7309,6 +8216,9 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// horizontal reduction.
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
+  // Skip the analysis of CmpInsts. The compiler implements a postanalysis of
+  // the CmpInsts, so we can skip extra attempts in
+  // tryToVectorizeHorReductionOrInstOperands and save compile time.
SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
SmallPtrSet<Value *, 8> VisitedInstrs;
bool Res = false;
@@ -7316,6 +8226,11 @@ static bool tryToVectorizeHorReductionOrInstOperands(
Instruction *Inst;
unsigned Level;
std::tie(Inst, Level) = Stack.pop_back_val();
+ // Do not try to analyze instruction that has already been vectorized.
+ // This may happen when we vectorize instruction operands on a previous
+ // iteration while stack was populated before that happened.
+ if (R.isDeleted(Inst))
+ continue;
Value *B0, *B1;
bool IsBinop = matchRdxBop(Inst, B0, B1);
bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
@@ -7345,7 +8260,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
P = nullptr;
- if (Vectorize(Inst, R)) {
+ // Do not try to vectorize CmpInst operands, this is done separately.
+ if (!isa<CmpInst>(Inst) && Vectorize(Inst, R)) {
Res = true;
continue;
}
@@ -7357,7 +8273,10 @@ static bool tryToVectorizeHorReductionOrInstOperands(
for (auto *Op : Inst->operand_values())
if (VisitedInstrs.insert(Op).second)
if (auto *I = dyn_cast<Instruction>(Op))
- if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB)
+ // Do not try to vectorize CmpInst operands, this is done
+ // separately.
+ if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) &&
+ I->getParent() == BB)
Stack.emplace_back(I, Level);
}
return Res;
@@ -7394,42 +8313,29 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// Aggregate value is unlikely to be processed in vector register, we need to
// extract scalars into scalar registers, so NeedExtraction is set true.
- return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
- BuildVectorInsts);
+ return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
SmallVector<Value *, 16> BuildVectorInsts;
SmallVector<Value *, 16> BuildVectorOpds;
+ SmallVector<int> Mask;
if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
(llvm::all_of(BuildVectorOpds,
[](Value *V) { return isa<ExtractElementInst>(V); }) &&
- isShuffle(BuildVectorOpds)))
+ isShuffle(BuildVectorOpds, Mask)))
return false;
- // Vectorize starting with the build vector operands ignoring the BuildVector
- // instructions for the purpose of scheduling and user extraction.
- return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
- BuildVectorInsts);
-}
-
-bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
- BoUpSLP &R) {
- if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
- return true;
-
- bool OpsChanged = false;
- for (int Idx = 0; Idx < 2; ++Idx) {
- OpsChanged |=
- vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
- }
- return OpsChanged;
+ LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
+ return tryToVectorizeList(BuildVectorInsts, R, /*AllowReorder=*/true);
}
bool SLPVectorizerPass::vectorizeSimpleInstructions(
- SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
+ SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
+ bool AtTerminator) {
bool OpsChanged = false;
+ SmallVector<Instruction *, 4> PostponedCmps;
for (auto *I : reverse(Instructions)) {
if (R.isDeleted(I))
continue;
@@ -7437,10 +8343,29 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
- else if (auto *CI = dyn_cast<CmpInst>(I))
- OpsChanged |= vectorizeCmpInst(CI, BB, R);
+ else if (isa<CmpInst>(I))
+ PostponedCmps.push_back(I);
+ }
+ if (AtTerminator) {
+ // Try to find reductions first.
+ for (Instruction *I : PostponedCmps) {
+ if (R.isDeleted(I))
+ continue;
+ for (Value *Op : I->operands())
+ OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI);
+ }
+ // Try to vectorize operands as vector bundles.
+ for (Instruction *I : PostponedCmps) {
+ if (R.isDeleted(I))
+ continue;
+ OpsChanged |= tryToVectorize(I, R);
+ }
+ Instructions.clear();
+ } else {
+ // Insert in reverse order since the PostponedCmps vector was filled in
+ // reverse order.
+ Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend());
}
- Instructions.clear();
return OpsChanged;
}
@@ -7448,6 +8373,10 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
SmallPtrSet<Value *, 16> VisitedInstrs;
+  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
+  // node. This allows us to better identify the chains that can be
+  // vectorized.
+ DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
@@ -7460,22 +8389,125 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (!P)
break;
- if (!VisitedInstrs.count(P) && !R.isDeleted(P))
+ // No need to analyze deleted, vectorized and non-vectorizable
+ // instructions.
+ if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
+ isValidElementType(P->getType()))
Incoming.push_back(P);
}
- // Sort by type.
- llvm::stable_sort(Incoming, PhiTypeSorterFunc);
+ // Find the corresponding non-phi nodes for better matching when trying to
+ // build the tree.
+ for (Value *V : Incoming) {
+ SmallVectorImpl<Value *> &Opcodes =
+ PHIToOpcodes.try_emplace(V).first->getSecond();
+ if (!Opcodes.empty())
+ continue;
+ SmallVector<Value *, 4> Nodes(1, V);
+ SmallPtrSet<Value *, 4> Visited;
+ while (!Nodes.empty()) {
+ auto *PHI = cast<PHINode>(Nodes.pop_back_val());
+ if (!Visited.insert(PHI).second)
+ continue;
+ for (Value *V : PHI->incoming_values()) {
+ if (auto *PHI1 = dyn_cast<PHINode>((V))) {
+ Nodes.push_back(PHI1);
+ continue;
+ }
+ Opcodes.emplace_back(V);
+ }
+ }
+ }
+
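+    // Illustrative note (editorial sketch, not part of this patch): for
+    //   %p = phi i32 [ %a, %bb0 ], [ %q, %bb1 ]
+    //   %q = phi i32 [ %b, %bb2 ], [ %c, %bb3 ]
+    // nested phis are looked through, so PHIToOpcodes[%p] becomes
+    // {%a, %b, %c}.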
+ // Sort by type, parent, operands.
+ stable_sort(Incoming, [this, &PHIToOpcodes](Value *V1, Value *V2) {
+ assert(isValidElementType(V1->getType()) &&
+ isValidElementType(V2->getType()) &&
+ "Expected vectorizable types only.");
+      // It is fine to compare type IDs here, since we expect only
+      // vectorizable types, like ints, floats and pointers; we don't care
+      // about other types.
+ if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
+ return true;
+ if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
+ return false;
+ ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+ ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+ if (Opcodes1.size() < Opcodes2.size())
+ return true;
+ if (Opcodes1.size() > Opcodes2.size())
+ return false;
+ for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+ // Undefs are compatible with any other value.
+ if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+ continue;
+ if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+ if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+ DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
+ DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
+ if (!NodeI1)
+ return NodeI2 != nullptr;
+ if (!NodeI2)
+ return false;
+ assert((NodeI1 == NodeI2) ==
+ (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeI1 != NodeI2)
+ return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
+ InstructionsState S = getSameOpcode({I1, I2});
+ if (S.getOpcode())
+ continue;
+ return I1->getOpcode() < I2->getOpcode();
+ }
+ if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+ continue;
+ if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
+ return true;
+ if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
+ return false;
+ }
+ return false;
+ });
+
+ auto &&AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
+ if (V1 == V2)
+ return true;
+ if (V1->getType() != V2->getType())
+ return false;
+ ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+ ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+ if (Opcodes1.size() != Opcodes2.size())
+ return false;
+ for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+ // Undefs are compatible with any other value.
+ if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+ continue;
+ if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+ if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+ if (I1->getParent() != I2->getParent())
+ return false;
+ InstructionsState S = getSameOpcode({I1, I2});
+ if (S.getOpcode())
+ continue;
+ return false;
+ }
+ if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+ continue;
+ if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
+ return false;
+ }
+ return true;
+ };
 // Try to vectorize elements based on their type.
+ SmallVector<Value *, 4> Candidates;
for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
E = Incoming.end();
IncIt != E;) {
- // Look for the next elements with the same type.
+ // Look for the next elements with the same type, parent and operand
+ // kinds.
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
- while (SameTypeIt != E &&
- (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+ while (SameTypeIt != E && AreCompatiblePHIs(*SameTypeIt, *IncIt)) {
VisitedInstrs.insert(*SameTypeIt);
++SameTypeIt;
}
@@ -7488,13 +8520,25 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// So allow tryToVectorizeList to reorder them if it is beneficial. This
// is done when there are exactly two elements since tryToVectorizeList
// asserts that there are only two values when AllowReorder is true.
- bool AllowReorder = NumElts == 2;
- if (NumElts > 1 &&
- tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
+ if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
+ /*AllowReorder=*/true)) {
// Success start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
- break;
+ } else if (NumElts < 4 &&
+ (Candidates.empty() ||
+ Candidates.front()->getType() == (*IncIt)->getType())) {
+ Candidates.append(IncIt, std::next(IncIt, NumElts));
+ }
+ // Final attempt to vectorize phis with the same types.
+ if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) {
+ if (Candidates.size() > 1 &&
+ tryToVectorizeList(Candidates, R, /*AllowReorder=*/true)) {
+ // Success start over because instructions might have been changed.
+ HaveVectorizedPhiNodes = true;
+ Changed = true;
+ }
+ Candidates.clear();
}
// Start over at the next instruction of a different type (or the end).
@@ -7518,7 +8562,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// We may go through BB multiple times so skip the one we have checked.
if (!VisitedInstrs.insert(&*it).second) {
if (it->use_empty() && KeyNodes.contains(&*it) &&
- vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
+ vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
+ it->isTerminator())) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
Changed = true;
@@ -7577,7 +8622,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Start vectorization of post-process list of instructions from the
// top-tree instructions to try to vectorize as many instructions as
// possible.
- OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
+ OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
+ it->isTerminator());
if (OpsChanged) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
@@ -7690,16 +8736,103 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
bool Changed = false;
+  // Sort by type, base pointers and value operands. Value operands must be
+  // compatible (have the same opcode, same parent), otherwise it is
+  // definitely not profitable to try to vectorize them.
+ auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
+ if (V->getPointerOperandType()->getTypeID() <
+ V2->getPointerOperandType()->getTypeID())
+ return true;
+ if (V->getPointerOperandType()->getTypeID() >
+ V2->getPointerOperandType()->getTypeID())
+ return false;
+ // UndefValues are compatible with all other values.
+ if (isa<UndefValue>(V->getValueOperand()) ||
+ isa<UndefValue>(V2->getValueOperand()))
+ return false;
+ if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
+ if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
+ DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
+ DT->getNode(I1->getParent());
+ DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
+ DT->getNode(I2->getParent());
+        assert(NodeI1 && "Should only process reachable instructions");
+        assert(NodeI2 && "Should only process reachable instructions");
+ assert((NodeI1 == NodeI2) ==
+ (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
+ "Different nodes should have different DFS numbers");
+ if (NodeI1 != NodeI2)
+ return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
+ InstructionsState S = getSameOpcode({I1, I2});
+ if (S.getOpcode())
+ return false;
+ return I1->getOpcode() < I2->getOpcode();
+ }
+ if (isa<Constant>(V->getValueOperand()) &&
+ isa<Constant>(V2->getValueOperand()))
+ return false;
+ return V->getValueOperand()->getValueID() <
+ V2->getValueOperand()->getValueID();
+ };
+
+ auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) {
+ if (V1 == V2)
+ return true;
+ if (V1->getPointerOperandType() != V2->getPointerOperandType())
+ return false;
+ // Undefs are compatible with any other value.
+ if (isa<UndefValue>(V1->getValueOperand()) ||
+ isa<UndefValue>(V2->getValueOperand()))
+ return true;
+ if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
+ if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
+ if (I1->getParent() != I2->getParent())
+ return false;
+ InstructionsState S = getSameOpcode({I1, I2});
+ return S.getOpcode() > 0;
+ }
+ if (isa<Constant>(V1->getValueOperand()) &&
+ isa<Constant>(V2->getValueOperand()))
+ return true;
+ return V1->getValueOperand()->getValueID() ==
+ V2->getValueOperand()->getValueID();
+ };
+
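+  // Illustrative note (editorial sketch, not part of this patch): within one
+  // store list, stores like
+  //   store i32 %a, i32* %p
+  //   store i32 %b, i32* %q
+  // are grouped together for vectorizeStores when %a and %b are instructions
+  // with the same opcode in the same block (or are constants/undef).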
// Attempt to sort and vectorize each of the store-groups.
- for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
- ++it) {
- if (it->second.size() < 2)
+ for (auto &Pair : Stores) {
+ if (Pair.second.size() < 2)
continue;
LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
- << it->second.size() << ".\n");
+ << Pair.second.size() << ".\n");
+
+ stable_sort(Pair.second, StoreSorter);
- Changed |= vectorizeStores(it->second, R);
+ // Try to vectorize elements based on their compatibility.
+ for (ArrayRef<StoreInst *>::iterator IncIt = Pair.second.begin(),
+ E = Pair.second.end();
+ IncIt != E;) {
+
+ // Look for the next elements with the same type.
+ ArrayRef<StoreInst *>::iterator SameTypeIt = IncIt;
+ Type *EltTy = (*IncIt)->getPointerOperand()->getType();
+
+ while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt))
+ ++SameTypeIt;
+
+ // Try to vectorize them.
+ unsigned NumElts = (SameTypeIt - IncIt);
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores ("
+ << NumElts << ")\n");
+ if (NumElts > 1 && !EltTy->getPointerElementType()->isVectorTy() &&
+ vectorizeStores(makeArrayRef(IncIt, NumElts), R)) {
+ // Success start over because instructions might have been changed.
+ Changed = true;
+ }
+
+ // Start over at the next instruction of a different type (or the end).
+ IncIt = SameTypeIt;
+ }
}
return Changed;
}