Diffstat (limited to 'contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2251
 1 file changed, 1164 insertions(+), 1087 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f950d0d4eb2b..0b630197911a 100644
--- a/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -17,11 +17,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
@@ -29,14 +26,16 @@
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -47,7 +46,6 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -66,7 +64,6 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
@@ -83,6 +80,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -130,6 +128,10 @@ static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
cl::desc("Attempt to vectorize for this register size in bits"));
+static cl::opt<unsigned>
+MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
+ cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
+
static cl::opt<int>
MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
cl::desc("Maximum depth of the lookup for consecutive stores."));
@@ -204,12 +206,12 @@ static bool allSameBlock(ArrayRef<Value *> VL) {
if (!I0)
return false;
BasicBlock *BB = I0->getParent();
- for (int i = 1, e = VL.size(); i < e; i++) {
- Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (!I)
+ for (int I = 1, E = VL.size(); I < E; I++) {
+ auto *II = dyn_cast<Instruction>(VL[I]);
+ if (!II)
return false;
- if (BB != I->getParent())
+ if (BB != II->getParent())
return false;
}
return true;
@@ -234,11 +236,16 @@ static bool isSplat(ArrayRef<Value *> VL) {
return true;
}
-/// \returns True if \p I is commutative, handles CmpInst as well as Instruction.
+/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
- if (auto *IC = dyn_cast<CmpInst>(I))
- return IC->isCommutative();
- return I->isCommutative();
+ if (auto *Cmp = dyn_cast<CmpInst>(I))
+ return Cmp->isCommutative();
+ if (auto *BO = dyn_cast<BinaryOperator>(I))
+ return BO->isCommutative();
+ // TODO: This should check for generic Instruction::isCommutative(), but
+ // we need to confirm that the caller code correctly handles Intrinsics
+ // for example (does not have 2 operands).
+ return false;
}
/// Checks if the vector of instructions can be represented as a shuffle, like:
@@ -250,7 +257,7 @@ static bool isCommutative(Instruction *I) {
/// %x3x3 = mul i8 %x3, %x3
/// %y1y1 = mul i8 %y1, %y1
/// %y2y2 = mul i8 %y2, %y2
-/// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
+/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
@@ -265,13 +272,13 @@ static bool isCommutative(Instruction *I) {
/// %x3 = extractelement <4 x i8> %x, i32 3
/// %y1 = extractelement <4 x i8> %y, i32 1
/// %y2 = extractelement <4 x i8> %y, i32 2
-/// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
+/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
/// %5 = mul <4 x i8> %4, %4
/// %6 = extractelement <4 x i8> %5, i32 0
-/// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
+/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
/// %7 = extractelement <4 x i8> %5, i32 1
/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
/// %8 = extractelement <4 x i8> %5, i32 2
@@ -285,7 +292,8 @@ static bool isCommutative(Instruction *I) {
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
- unsigned Size = EI0->getVectorOperandType()->getNumElements();
+ unsigned Size =
+ cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
@@ -294,7 +302,7 @@ isShuffle(ArrayRef<Value *> VL) {
auto *EI = cast<ExtractElementInst>(VL[I]);
auto *Vec = EI->getVectorOperand();
// All vector operands must have the same number of vector elements.
- if (cast<VectorType>(Vec->getType())->getNumElements() != Size)
+ if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
return None;
auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
if (!Idx)
@@ -303,7 +311,7 @@ isShuffle(ArrayRef<Value *> VL) {
if (Idx->getValue().uge(Size))
continue;
unsigned IntIdx = Idx->getValue().getZExtValue();
- // We can extractelement from undef vector.
+ // We can extractelement from undef or poison vector.
if (isa<UndefValue>(Vec))
continue;
// For correct shuffling we have to have at most 2 different vector operands
@@ -500,7 +508,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
}
/// \returns the AA location that is being access by the instruction.
-static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
+static MemoryLocation getLocation(Instruction *I, AAResults *AA) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return MemoryLocation::get(SI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
@@ -521,6 +529,15 @@ static bool isSimple(Instruction *I) {
namespace llvm {
+static void inversePermutation(ArrayRef<unsigned> Indices,
+ SmallVectorImpl<int> &Mask) {
+ Mask.clear();
+ const unsigned E = Indices.size();
+ Mask.resize(E, E + 1);
+ for (unsigned I = 0; I < E; ++I)
+ Mask[Indices[I]] = I;
+}
+
namespace slpvectorizer {
/// Bottom Up SLP Vectorizer.
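[Editor's illustration, not part of the patch] The inversePermutation helper hoisted above builds a mask M with M[Indices[I]] = I; used as a shufflevector mask, this sends input lane I to output lane Indices[I], undoing the permutation described by Indices. A minimal standalone sketch with STL containers standing in for ArrayRef/SmallVectorImpl:

    #include <cassert>
    #include <vector>

    // Mask[Indices[I]] = I: input lane I is routed to output lane Indices[I].
    static std::vector<int>
    inversePermutationSketch(const std::vector<unsigned> &Indices) {
      std::vector<int> Mask(Indices.size(), -1); // -1 stands in for the E + 1 sentinel
      for (unsigned I = 0; I < Indices.size(); ++I)
        Mask[Indices[I]] = I;
      return Mask;
    }

    int main() {
      // Indices <2, 0, 1> inverts to the mask <1, 2, 0>.
      std::vector<int> Mask = inversePermutationSketch({2, 0, 1});
      assert(Mask[0] == 1 && Mask[1] == 2 && Mask[2] == 0);
      return 0;
    }
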
@@ -535,9 +552,10 @@ public:
using StoreList = SmallVector<StoreInst *, 8>;
using ExtraValueToDebugLocsMap =
MapVector<Value *, SmallVector<Instruction *, 2>>;
+ using OrdersType = SmallVector<unsigned, 4>;
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
- TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
+ TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
const DataLayout *DL, OptimizationRemarkEmitter *ORE)
: F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
@@ -571,11 +589,11 @@ public:
/// \returns the cost incurred by unwanted spills and fills, caused by
/// holding live values over call sites.
- int getSpillCost() const;
+ InstructionCost getSpillCost() const;
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
- int getTreeCost();
+ InstructionCost getTreeCost();
/// Construct a vectorizable tree that starts at \p Roots, ignoring users for
/// the purpose of scheduling and extraction in the \p UserIgnoreLst.
@@ -612,6 +630,14 @@ public:
/// \returns The best order of instructions for vectorization.
Optional<ArrayRef<unsigned>> bestOrder() const {
+ assert(llvm::all_of(
+ NumOpsWantToKeepOrder,
+ [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) {
+ return D.getFirst().size() ==
+ VectorizableTree[0]->Scalars.size();
+ }) &&
+ "All orders must have the same size as number of instructions in "
+ "tree node.");
auto I = std::max_element(
NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
[](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
@@ -625,6 +651,81 @@ public:
return makeArrayRef(I->getFirst());
}
+ /// Builds the correct order for root instructions.
+ /// If some leaves have the same instructions to be vectorized, we may
+ /// incorrectly evaluate the best order for the root node (it is built for the
+ /// vector of instructions without repeated instructions and, thus, has less
+ /// elements than the root node). This function builds the correct order for
+ /// the root node.
+ /// For example, if the root node is \<a+b, a+c, a+d, f+e\>, then the leaves
+ /// are \<a, a, a, f\> and \<b, c, d, e\>. When we try to vectorize the first
+ /// leaf, it will be shrink to \<a, b\>. If instructions in this leaf should
+ /// be reordered, the best order will be \<1, 0\>. We need to extend this
+ /// order for the root node. For the root node this order should look like
+ /// \<3, 0, 1, 2\>. This function extends the order for the reused
+ /// instructions.
+ void findRootOrder(OrdersType &Order) {
+ // If the leaf has the same number of instructions to vectorize as the root
+ // - order must be set already.
+ unsigned RootSize = VectorizableTree[0]->Scalars.size();
+ if (Order.size() == RootSize)
+ return;
+ SmallVector<unsigned, 4> RealOrder(Order.size());
+ std::swap(Order, RealOrder);
+ SmallVector<int, 4> Mask;
+ inversePermutation(RealOrder, Mask);
+ Order.assign(Mask.begin(), Mask.end());
+ // The leaf has less number of instructions - need to find the true order of
+ // the root.
+ // Scan the nodes starting from the leaf back to the root.
+ const TreeEntry *PNode = VectorizableTree.back().get();
+ SmallVector<const TreeEntry *, 4> Nodes(1, PNode);
+ SmallPtrSet<const TreeEntry *, 4> Visited;
+ while (!Nodes.empty() && Order.size() != RootSize) {
+ const TreeEntry *PNode = Nodes.pop_back_val();
+ if (!Visited.insert(PNode).second)
+ continue;
+ const TreeEntry &Node = *PNode;
+ for (const EdgeInfo &EI : Node.UserTreeIndices)
+ if (EI.UserTE)
+ Nodes.push_back(EI.UserTE);
+ if (Node.ReuseShuffleIndices.empty())
+ continue;
+ // Build the order for the parent node.
+ OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize);
+ SmallVector<unsigned, 4> OrderCounter(Order.size(), 0);
+ // The algorithm of the order extension is:
+ // 1. Calculate the number of the same instructions for the order.
+ // 2. Calculate the index of the new order: total number of instructions
+ // with order less than the order of the current instruction + reuse
+ // number of the current instruction.
+ // 3. The new order is just the index of the instruction in the original
+ // vector of the instructions.
+ for (unsigned I : Node.ReuseShuffleIndices)
+ ++OrderCounter[Order[I]];
+ SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0);
+ for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) {
+ unsigned ReusedIdx = Node.ReuseShuffleIndices[I];
+ unsigned OrderIdx = Order[ReusedIdx];
+ unsigned NewIdx = 0;
+ for (unsigned J = 0; J < OrderIdx; ++J)
+ NewIdx += OrderCounter[J];
+ NewIdx += CurrentCounter[OrderIdx];
+ ++CurrentCounter[OrderIdx];
+ assert(NewOrder[NewIdx] == RootSize &&
+ "The order index should not be written already.");
+ NewOrder[NewIdx] = I;
+ }
+ std::swap(Order, NewOrder);
+ }
+ assert(Order.size() == RootSize &&
+ "Root node is expected or the size of the order must be the same as "
+ "the number of elements in the root node.");
+ assert(llvm::all_of(Order,
+ [RootSize](unsigned Val) { return Val != RootSize; }) &&
+ "All indices must be initialized");
+ }
+
/// \return The vector element size in bits to use when vectorizing the
/// expression tree ending at \p V. If V is a store, the size is the width of
/// the stored value. Otherwise, the size is the width of the largest loaded
@@ -646,6 +747,12 @@ public:
return MinVecRegSize;
}
+ unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
+ unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
+ MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
+ return MaxVF ? MaxVF : UINT_MAX;
+ }
+
/// Check if homogeneous aggregate is isomorphic to some VectorType.
/// Accepts homogeneous multidimensional aggregate of scalars/vectors like
/// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
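[Editor's illustration, not part of the patch] A minimal sketch of how the new -slp-max-vf option interacts with the target's limit in getMaximumVF above: an explicitly given option overrides TTI, and a result of 0 from either source means "unlimited".

    #include <climits>

    // OptionGiven models MaxVFOption.getNumOccurrences(); TargetLimit models
    // TTI->getMaximumVF(ElemWidth, Opcode). A value of 0 means "no limit".
    static unsigned resolveMaxVF(bool OptionGiven, unsigned OptionValue,
                                 unsigned TargetLimit) {
      unsigned MaxVF = OptionGiven ? OptionValue : TargetLimit;
      return MaxVF ? MaxVF : UINT_MAX;
    }
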
@@ -665,7 +772,7 @@ public:
/// effectively impossible for the backend to undo.
/// TODO: If load combining is allowed in the IR optimizer, this analysis
/// may not be necessary.
- bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
+ bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
/// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
/// can be load combined in the backend. Load combining may not be allowed in
@@ -880,6 +987,14 @@ public:
std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
Value *V = Values[Idx].first;
+ if (isa<Constant>(V)) {
+ // Since this is a function pass, it doesn't make semantic sense to
+ // walk the users of a subclass of Constant. The users could be in
+ // another function, or even another module that happens to be in
+ // the same LLVMContext.
+ continue;
+ }
+
// Calculate the absolute lane, using the minimum relative lane of LHS
// and RHS as base and Idx as the offset.
int Ln = std::min(LHS.second, RHS.second) + Idx;
@@ -1388,7 +1503,7 @@ private:
bool areAllUsersVectorized(Instruction *I) const;
/// \returns the cost of the vectorizable entry.
- int getEntryCost(TreeEntry *E);
+ InstructionCost getEntryCost(TreeEntry *E);
/// This is the recursive part of buildTree.
void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
@@ -1410,20 +1525,21 @@ private:
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
- int getGatherCost(VectorType *Ty,
- const DenseSet<unsigned> &ShuffledIndices) const;
+ InstructionCost
+ getGatherCost(FixedVectorType *Ty,
+ const DenseSet<unsigned> &ShuffledIndices) const;
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
- int getGatherCost(ArrayRef<Value *> VL) const;
+ InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
/// Set the Builder insert point to one after the last instruction in
/// the bundle
void setInsertPointAfterBundle(TreeEntry *E);
/// \returns a vector from a collection of scalars in \p VL.
- Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
+ Value *gather(ArrayRef<Value *> VL);
/// \returns whether the VectorizableTree is fully vectorizable and will
/// be beneficial even the tree height is tiny.
@@ -1457,15 +1573,17 @@ private:
/// The Scalars are vectorized into this value. It is initialized to Null.
Value *VectorizedValue = nullptr;
- /// Do we need to gather this sequence ?
- enum EntryState { Vectorize, NeedToGather };
+ /// Do we need to gather this sequence or vectorize it
+ /// (either with vector instruction or with scatter/gather
+ /// intrinsics for store/load)?
+ enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
EntryState State;
/// Does this sequence require some shuffling?
SmallVector<int, 4> ReuseShuffleIndices;
/// Does this entry require reordering?
- ArrayRef<unsigned> ReorderIndices;
+ SmallVector<unsigned, 4> ReorderIndices;
/// Points back to the VectorizableTree.
///
@@ -1606,6 +1724,9 @@ private:
case Vectorize:
dbgs() << "Vectorize\n";
break;
+ case ScatterVectorize:
+ dbgs() << "ScatterVectorize\n";
+ break;
case NeedToGather:
dbgs() << "NeedToGather\n";
break;
@@ -1627,7 +1748,7 @@ private:
dbgs() << "NULL\n";
dbgs() << "ReuseShuffleIndices: ";
if (ReuseShuffleIndices.empty())
- dbgs() << "Emtpy";
+ dbgs() << "Empty";
else
for (unsigned ReuseIdx : ReuseShuffleIndices)
dbgs() << ReuseIdx << ", ";
@@ -1644,26 +1765,55 @@ private:
#endif
};
+#ifndef NDEBUG
+ void dumpTreeCosts(TreeEntry *E, InstructionCost ReuseShuffleCost,
+ InstructionCost VecCost,
+ InstructionCost ScalarCost) const {
+ dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump();
+ dbgs() << "SLP: Costs:\n";
+ dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
+ dbgs() << "SLP: VectorCost = " << VecCost << "\n";
+ dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
+ dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = " <<
+ ReuseShuffleCost + VecCost - ScalarCost << "\n";
+ }
+#endif
+
/// Create a new VectorizableTree entry.
TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
const InstructionsState &S,
const EdgeInfo &UserTreeIdx,
ArrayRef<unsigned> ReuseShuffleIndices = None,
ArrayRef<unsigned> ReorderIndices = None) {
- bool Vectorized = (bool)Bundle;
+ TreeEntry::EntryState EntryState =
+ Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
+ return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
+ ReuseShuffleIndices, ReorderIndices);
+ }
+
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
+ TreeEntry::EntryState EntryState,
+ Optional<ScheduleData *> Bundle,
+ const InstructionsState &S,
+ const EdgeInfo &UserTreeIdx,
+ ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<unsigned> ReorderIndices = None) {
+ assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
+ (Bundle && EntryState != TreeEntry::NeedToGather)) &&
+ "Need to vectorize gather entry?");
VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
TreeEntry *Last = VectorizableTree.back().get();
Last->Idx = VectorizableTree.size() - 1;
Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
- Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
+ Last->State = EntryState;
Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
ReuseShuffleIndices.end());
- Last->ReorderIndices = ReorderIndices;
+ Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
Last->setOperations(S);
- if (Vectorized) {
- for (int i = 0, e = VL.size(); i != e; ++i) {
- assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
- ScalarToTreeEntry[VL[i]] = Last;
+ if (Last->State != TreeEntry::NeedToGather) {
+ for (Value *V : VL) {
+ assert(!getTreeEntry(V) && "Scalar already in tree!");
+ ScalarToTreeEntry[V] = Last;
}
// Update the scheduler bundle to point to this TreeEntry.
unsigned Lane = 0;
@@ -1699,18 +1849,10 @@ private:
}
#endif
- TreeEntry *getTreeEntry(Value *V) {
- auto I = ScalarToTreeEntry.find(V);
- if (I != ScalarToTreeEntry.end())
- return I->second;
- return nullptr;
- }
+ TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
const TreeEntry *getTreeEntry(Value *V) const {
- auto I = ScalarToTreeEntry.find(V);
- if (I != ScalarToTreeEntry.end())
- return I->second;
- return nullptr;
+ return ScalarToTreeEntry.lookup(V);
}
/// Maps a specific scalar to its tree entry.
@@ -2195,7 +2337,6 @@ private:
/// List of users to ignore during scheduling and that don't need extracting.
ArrayRef<Value *> UserIgnoreList;
- using OrdersType = SmallVector<unsigned, 4>;
/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
/// sorted SmallVectors of unsigned.
struct OrdersTypeDenseMapInfo {
@@ -2233,7 +2374,7 @@ private:
ScalarEvolution *SE;
TargetTransformInfo *TTI;
TargetLibraryInfo *TLI;
- AliasAnalysis *AA;
+ AAResults *AA;
LoopInfo *LI;
DominatorTree *DT;
AssumptionCache *AC;
@@ -2332,9 +2473,9 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
}
for (auto V : Entry->Scalars) {
OS << *V;
- if (std::any_of(
- R->ExternalUses.begin(), R->ExternalUses.end(),
- [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
+ if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
+ return EU.Scalar == V;
+ }))
OS << " <extract>";
OS << "\n";
}
@@ -2366,13 +2507,17 @@ BoUpSLP::~BoUpSLP() {
"trying to erase instruction with users.");
Pair.getFirst()->eraseFromParent();
}
+#ifdef EXPENSIVE_CHECKS
+ // If we could guarantee that this call is not extremely slow, we could
+ // remove the ifdef limitation (see PR47712).
assert(!verifyFunction(*F, &dbgs()));
+#endif
}
void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
for (auto *V : AV) {
if (auto *I = dyn_cast<Instruction>(V))
- eraseInstruction(I, /*ReplaceWithUndef=*/true);
+ eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
};
}
@@ -2597,11 +2742,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
auto *PH = cast<PHINode>(VL0);
// Check for terminator values (e.g. invoke).
- for (unsigned j = 0; j < VL.size(); ++j)
- for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ for (Value *V : VL)
+ for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
Instruction *Term = dyn_cast<Instruction>(
- cast<PHINode>(VL[j])->getIncomingValueForBlock(
- PH->getIncomingBlock(i)));
+ cast<PHINode>(V)->getIncomingValueForBlock(
+ PH->getIncomingBlock(I)));
if (Term && Term->isTerminator()) {
LLVM_DEBUG(dbgs()
<< "SLP: Need to swizzle PHINodes (terminator use).\n");
@@ -2618,13 +2763,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
// Keeps the reordered operands to avoid code duplication.
SmallVector<ValueList, 2> OperandsVec;
- for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
ValueList Operands;
// Prepare the operand vector.
- for (Value *j : VL)
- Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
- PH->getIncomingBlock(i)));
- TE->setOperand(i, Operands);
+ for (Value *V : VL)
+ Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
+ PH->getIncomingBlock(I)));
+ TE->setOperand(I, Operands);
OperandsVec.push_back(Operands);
}
for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
@@ -2657,12 +2802,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
});
// Insert new order with initial value 0, if it does not exist,
// otherwise return the iterator to the existing one.
- auto StoredCurrentOrderAndNum =
- NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
- ++StoredCurrentOrderAndNum->getSecond();
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies,
- StoredCurrentOrderAndNum->getFirst());
+ ReuseShuffleIndicies, CurrentOrder);
+ findRootOrder(CurrentOrder);
+ ++NumOpsWantToKeepOrder[CurrentOrder];
// This is a special case, as it does not gather, but at the same time
// we are not extending buildTree_rec() towards the operands.
ValueList Op0;
@@ -2739,16 +2882,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
} else {
// Need to reorder.
- auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
- ++I->getSecond();
TreeEntry *TE =
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies, I->getFirst());
+ ReuseShuffleIndicies, CurrentOrder);
TE->setOperandsInOrder();
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
+ findRootOrder(CurrentOrder);
+ ++NumOpsWantToKeepOrder[CurrentOrder];
}
return;
}
+ // Vectorizing non-consecutive loads with `llvm.masked.gather`.
+ TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ buildTree_rec(PointerOps, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
+ return;
}
LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
@@ -2883,8 +3033,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
ValueList Operands;
// Prepare the operand vector.
- for (Value *j : VL)
- Operands.push_back(cast<Instruction>(j)->getOperand(i));
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
buildTree_rec(Operands, Depth + 1, {TE, i});
}
@@ -2952,6 +3102,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::Store: {
// Check if the stores are consecutive or if we need to swizzle them.
llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
+ // Avoid types that are padded when being allocated as scalars, while
+ // being packed together in a vector (such as i1).
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
+ return;
+ }
// Make sure all stores in the bundle are simple - we can't vectorize
// atomic or volatile stores.
SmallVector<Value *, 4> PointerOps(VL.size());
@@ -3001,15 +3161,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
buildTree_rec(Operands, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
} else {
- // Need to reorder.
- auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
- ++(I->getSecond());
TreeEntry *TE =
newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies, I->getFirst());
+ ReuseShuffleIndicies, CurrentOrder);
TE->setOperandsInOrder();
buildTree_rec(Operands, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
+ findRootOrder(CurrentOrder);
+ ++NumOpsWantToKeepOrder[CurrentOrder];
}
return;
}
@@ -3028,7 +3187,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
VFShape Shape = VFShape::get(
- *CI, {static_cast<unsigned int>(VL.size()), false /*Scalable*/},
+ *CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
@@ -3165,7 +3324,7 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
N *= AT->getNumElements();
EltTy = AT->getElementType();
} else {
- auto *VT = cast<VectorType>(EltTy);
+ auto *VT = cast<FixedVectorType>(EltTy);
N *= VT->getNumElements();
EltTy = VT->getElementType();
}
@@ -3203,7 +3362,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
return false;
} else {
- NElts = cast<VectorType>(Vec->getType())->getNumElements();
+ NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
}
if (NElts != VL.size())
@@ -3247,27 +3406,26 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
}
bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
- return I->hasOneUse() ||
- std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
+ return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) {
return ScalarToTreeEntry.count(U) > 0;
});
}
-static std::pair<unsigned, unsigned>
-getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI,
- TargetLibraryInfo *TLI) {
+static std::pair<InstructionCost, InstructionCost>
+getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getNumElements());
- int IntrinsicCost =
+ IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount());
+ auto IntrinsicCost =
TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
- auto Shape =
- VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
- false /*HasGlobalPred*/);
+ auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
+ VecTy->getNumElements())),
+ false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
- int LibCost = IntrinsicCost;
+ auto LibCost = IntrinsicCost;
if (!CI->isNoBuiltin() && VecFunc) {
// Calculate the cost of the vector library call.
SmallVector<Type *, 4> VecTys;
@@ -3282,7 +3440,7 @@ getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI,
return {IntrinsicCost, LibCost};
}
-int BoUpSLP::getEntryCost(TreeEntry *E) {
+InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
Type *ScalarTy = VL[0]->getType();
@@ -3301,7 +3459,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- int ReuseShuffleCost = 0;
+ InstructionCost ReuseShuffleCost = 0;
if (NeedToShuffleReuses) {
ReuseShuffleCost =
TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
@@ -3317,7 +3475,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
allSameType(VL) && allSameBlock(VL)) {
Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
if (ShuffleKind.hasValue()) {
- int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
+ InstructionCost Cost =
+ TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
for (auto *V : VL) {
// If all users of instruction are going to be vectorized and this
// instruction itself is not going to be vectorized, consider this
@@ -3336,7 +3495,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
return ReuseShuffleCost + getGatherCost(VL);
}
- assert(E->State == TreeEntry::Vectorize && "Unhandled state");
+ assert((E->State == TreeEntry::Vectorize ||
+ E->State == TreeEntry::ScatterVectorize) &&
+ "Unhandled state");
assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
@@ -3375,37 +3536,37 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
}
}
- int DeadCost = ReuseShuffleCost;
+ InstructionCost DeadCost = ReuseShuffleCost;
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
DeadCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- Instruction *E = cast<Instruction>(VL[i]);
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ Instruction *EI = cast<Instruction>(VL[I]);
// If all users are going to be vectorized, instruction can be
// considered as dead.
// The same, if have only one user, it will be vectorized for sure.
- if (areAllUsersVectorized(E)) {
+ if (areAllUsersVectorized(EI)) {
// Take credit for instruction that will become dead.
- if (E->hasOneUse()) {
- Instruction *Ext = E->user_back();
+ if (EI->hasOneUse()) {
+ Instruction *Ext = EI->user_back();
if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
all_of(Ext->users(),
[](User *U) { return isa<GetElementPtrInst>(U); })) {
// Use getExtractWithExtendCost() to calculate the cost of
// extractelement/ext pair.
DeadCost -= TTI->getExtractWithExtendCost(
- Ext->getOpcode(), Ext->getType(), VecTy, i);
+ Ext->getOpcode(), Ext->getType(), VecTy, I);
// Add back the cost of s|zext which is subtracted separately.
DeadCost += TTI->getCastInstrCost(
- Ext->getOpcode(), Ext->getType(), E->getType(), CostKind,
- Ext);
+ Ext->getOpcode(), Ext->getType(), EI->getType(),
+ TTI::getCastContextHint(Ext), CostKind, Ext);
continue;
}
}
DeadCost -=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
}
}
return DeadCost;
@@ -3423,40 +3584,78 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::FPTrunc:
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
- int ScalarEltCost =
- TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, CostKind,
- VL0);
+ InstructionCost ScalarEltCost =
+ TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
+ TTI::getCastContextHint(VL0), CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
// Calculate the cost of this instruction.
- int ScalarCost = VL.size() * ScalarEltCost;
+ InstructionCost ScalarCost = VL.size() * ScalarEltCost;
auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
- int VecCost = 0;
+ InstructionCost VecCost = 0;
// Check if the values are candidates to demote.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
- VecCost = ReuseShuffleCost +
- TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
- CostKind, VL0);
+ VecCost =
+ ReuseShuffleCost +
+ TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
+ TTI::getCastContextHint(VL0), CostKind, VL0);
}
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
return VecCost - ScalarCost;
}
case Instruction::FCmp:
case Instruction::ICmp:
case Instruction::Select: {
// Calculate the cost of this instruction.
- int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
- Builder.getInt1Ty(),
- CostKind, VL0);
+ InstructionCost ScalarEltCost =
+ TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
+ CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
- int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
- CostKind, VL0);
+ InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+
+ // Check if all entries in VL are either compares or selects with compares
+ // as condition that have the same predicates.
+ CmpInst::Predicate VecPred = CmpInst::BAD_ICMP_PREDICATE;
+ bool First = true;
+ for (auto *V : VL) {
+ CmpInst::Predicate CurrentPred;
+ auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
+ if ((!match(V, m_Select(MatchCmp, m_Value(), m_Value())) &&
+ !match(V, MatchCmp)) ||
+ (!First && VecPred != CurrentPred)) {
+ VecPred = CmpInst::BAD_ICMP_PREDICATE;
+ break;
+ }
+ First = false;
+ VecPred = CurrentPred;
+ }
+
+ InstructionCost VecCost = TTI->getCmpSelInstrCost(
+ E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
+ // Check if it is possible and profitable to use min/max for selects in
+ // VL.
+ //
+ auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
+ if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
+ IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
+ {VecTy, VecTy});
+ InstructionCost IntrinsicCost =
+ TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+ // If the selects are the only uses of the compares, they will be dead
+ // and we can adjust the cost by removing their cost.
+ if (IntrinsicAndUse.second)
+ IntrinsicCost -=
+ TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy, MaskTy,
+ CmpInst::BAD_ICMP_PREDICATE, CostKind);
+ VecCost = std::min(VecCost, IntrinsicCost);
+ }
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::FNeg:
@@ -3516,16 +3715,17 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
SmallVector<const Value *, 4> Operands(VL0->operand_values());
- int ScalarEltCost = TTI->getArithmeticInstrCost(
- E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
- Operands, VL0);
+ InstructionCost ScalarEltCost =
+ TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
- int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- int VecCost = TTI->getArithmeticInstrCost(
- E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
- Operands, VL0);
+ InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ InstructionCost VecCost =
+ TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
+ Op2VK, Op1VP, Op2VP, Operands, VL0);
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -3534,36 +3734,42 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OperandValueKind Op2VK =
TargetTransformInfo::OK_UniformConstantValue;
- int ScalarEltCost =
- TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind,
- Op1VK, Op2VK);
+ InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
+ Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
- int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- int VecCost =
- TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind,
- Op1VK, Op2VK);
+ InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+ InstructionCost VecCost = TTI->getArithmeticInstrCost(
+ Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
Align alignment = cast<LoadInst>(VL0)->getAlign();
- int ScalarEltCost =
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0,
- CostKind, VL0);
+ InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
+ Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
- int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
- int VecLdCost =
- TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
- CostKind, VL0);
+ InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
+ InstructionCost VecLdCost;
+ if (E->State == TreeEntry::Vectorize) {
+ VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
+ CostKind, VL0);
+ } else {
+ assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
+ VecLdCost = TTI->getGatherScatterOpCost(
+ Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
+ /*VariableMask=*/false, alignment, CostKind, VL0);
+ }
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecLdCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
return ReuseShuffleCost + VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
@@ -3572,19 +3778,19 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
auto *SI =
cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
Align Alignment = SI->getAlign();
- int ScalarEltCost =
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0,
- CostKind, VL0);
+ InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
+ Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
if (NeedToShuffleReuses)
ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
- int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, Alignment, 0, CostKind, VL0);
+ InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
+ InstructionCost VecStCost = TTI->getMemoryOpCost(
+ Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
if (IsReorder) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecStCost += TTI->getShuffleCost(
TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
}
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
return ReuseShuffleCost + VecStCost - ScalarStCost;
}
case Instruction::Call: {
@@ -3592,15 +3798,17 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- IntrinsicCostAttributes CostAttrs(ID, *CI, 1, 1);
- int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
+ IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1);
+ InstructionCost ScalarEltCost =
+ TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
- int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
+ InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
- int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second);
+ InstructionCost VecCallCost =
+ std::min(VecCallCosts.first, VecCallCosts.second);
LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -3615,7 +3823,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
(Instruction::isCast(E->getOpcode()) &&
Instruction::isCast(E->getAltOpcode()))) &&
"Invalid Shuffle Vector Operand");
- int ScalarCost = 0;
+ InstructionCost ScalarCost = 0;
if (NeedToShuffleReuses) {
for (unsigned Idx : E->ReuseShuffleIndices) {
Instruction *I = cast<Instruction>(VL[Idx]);
@@ -3633,7 +3841,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
- int VecCost = 0;
+ InstructionCost VecCost = 0;
if (Instruction::isBinaryOp(E->getOpcode())) {
VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
@@ -3644,11 +3852,12 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
- CostKind);
+ TTI::CastContextHint::None, CostKind);
VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
- CostKind);
+ TTI::CastContextHint::None, CostKind);
}
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
+ LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
return ReuseShuffleCost + VecCost - ScalarCost;
}
default:
@@ -3686,11 +3895,13 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
TargetTransformInfo *TTI) {
// Look past the root to find a source value. Arbitrarily follow the
// path through operand 0 of any 'or'. Also, peek through optional
- // shift-left-by-constant.
+ // shift-left-by-multiple-of-8-bits.
Value *ZextLoad = Root;
+ const APInt *ShAmtC;
while (!isa<ConstantExpr>(ZextLoad) &&
(match(ZextLoad, m_Or(m_Value(), m_Value())) ||
- match(ZextLoad, m_Shl(m_Value(), m_Constant()))))
+ (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
+ ShAmtC->urem(8) == 0)))
ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
// Check if the input is an extended load of the required or/shift expression.
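[Editor's illustration, not part of the patch] The walk above now only looks through shl by a multiple of 8 because backend load combining can only reassemble values built from whole bytes, as in this scalar pattern:

    #include <cstdint>

    // Assembling a 32-bit value from four byte loads; every shift amount is a
    // multiple of 8, so a load-combining backend can turn this into one wide
    // little-endian load. A shift such as << 4 could not come from such a merge.
    static uint32_t loadLE32(const uint8_t *P) {
      return (uint32_t)P[0] | ((uint32_t)P[1] << 8) |
             ((uint32_t)P[2] << 16) | ((uint32_t)P[3] << 24);
    }
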
@@ -3714,8 +3925,8 @@ static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
return true;
}
-bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
- if (RdxOpcode != Instruction::Or)
+bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
+ if (RdxKind != RecurKind::Or)
return false;
unsigned NumElts = VectorizableTree[0]->Scalars.size();
@@ -3756,22 +3967,35 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
return true;
}
-int BoUpSLP::getSpillCost() const {
+InstructionCost BoUpSLP::getSpillCost() const {
// Walk from the bottom of the tree to the top, tracking which values are
// live. When we see a call instruction that is not part of our tree,
// query TTI to see if there is a cost to keeping values live over it
// (for example, if spills and fills are required).
unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
- int Cost = 0;
+ InstructionCost Cost = 0;
SmallPtrSet<Instruction*, 4> LiveValues;
Instruction *PrevInst = nullptr;
+ // The entries in VectorizableTree are not necessarily ordered by their
+ // position in basic blocks. Collect them and order them by dominance so later
+ // instructions are guaranteed to be visited first. For instructions in
+ // different basic blocks, we only scan to the beginning of the block, so
+ // their order does not matter, as long as all instructions in a basic block
+ // are grouped together. Using dominance ensures a deterministic order.
+ SmallVector<Instruction *, 16> OrderedScalars;
for (const auto &TEPtr : VectorizableTree) {
Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
if (!Inst)
continue;
+ OrderedScalars.push_back(Inst);
+ }
+ llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) {
+ return DT->dominates(B, A);
+ });
+ for (Instruction *Inst : OrderedScalars) {
if (!PrevInst) {
PrevInst = Inst;
continue;
@@ -3825,8 +4049,8 @@ int BoUpSLP::getSpillCost() const {
return Cost;
}
-int BoUpSLP::getTreeCost() {
- int Cost = 0;
+InstructionCost BoUpSLP::getTreeCost() {
+ InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
@@ -3856,15 +4080,16 @@ int BoUpSLP::getTreeCost() {
}))
continue;
- int C = getEntryCost(&TE);
+ InstructionCost C = getEntryCost(&TE);
+ Cost += C;
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
<< " for bundle that starts with " << *TE.Scalars[0]
- << ".\n");
- Cost += C;
+ << ".\n"
+ << "SLP: Current total cost = " << Cost << "\n");
}
SmallPtrSet<Value *, 16> ExtractCostCalculated;
- int ExtractCost = 0;
+ InstructionCost ExtractCost = 0;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (!ExtractCostCalculated.insert(EU.Scalar).second)
@@ -3894,39 +4119,42 @@ int BoUpSLP::getTreeCost() {
}
}
- int SpillCost = getSpillCost();
+ InstructionCost SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
- std::string Str;
+#ifndef NDEBUG
+ SmallString<256> Str;
{
- raw_string_ostream OS(Str);
+ raw_svector_ostream OS(Str);
OS << "SLP: Spill Cost = " << SpillCost << ".\n"
<< "SLP: Extract Cost = " << ExtractCost << ".\n"
<< "SLP: Total Cost = " << Cost << ".\n";
}
LLVM_DEBUG(dbgs() << Str);
-
if (ViewSLPTree)
ViewGraph(this, "SLP" + F->getName(), false, Str);
+#endif
return Cost;
}
-int BoUpSLP::getGatherCost(VectorType *Ty,
- const DenseSet<unsigned> &ShuffledIndices) const {
+InstructionCost
+BoUpSLP::getGatherCost(FixedVectorType *Ty,
+ const DenseSet<unsigned> &ShuffledIndices) const {
unsigned NumElts = Ty->getNumElements();
APInt DemandedElts = APInt::getNullValue(NumElts);
- for (unsigned i = 0; i < NumElts; ++i)
- if (!ShuffledIndices.count(i))
- DemandedElts.setBit(i);
- int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
- /*Extract*/ false);
+ for (unsigned I = 0; I < NumElts; ++I)
+ if (!ShuffledIndices.count(I))
+ DemandedElts.setBit(I);
+ InstructionCost Cost =
+ TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
+ /*Extract*/ false);
if (!ShuffledIndices.empty())
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
}
-int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
+InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
// Find the type of the operands in VL.
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -3968,11 +4196,10 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
// should be in this block.
auto *Front = E->getMainOp();
auto *BB = Front->getParent();
- assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()),
- [=](Value *V) -> bool {
- auto *I = cast<Instruction>(V);
- return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
- }));
+ assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
+ auto *I = cast<Instruction>(V);
+ return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
+ }));
// The last instruction in the bundle in program order.
Instruction *LastInst = nullptr;
@@ -4025,34 +4252,30 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
-Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
- Value *Vec = UndefValue::get(Ty);
- // Generate the 'InsertElement' instruction.
- for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
- Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
- if (auto *Insrt = dyn_cast<InsertElementInst>(Vec)) {
- GatherSeq.insert(Insrt);
- CSEBlocks.insert(Insrt->getParent());
-
- // Add to our 'need-to-extract' list.
- if (TreeEntry *E = getTreeEntry(VL[i])) {
- // Find which lane we need to extract.
- int FoundLane = -1;
- for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) {
- // Is this the lane of the scalar that we are looking for ?
- if (E->Scalars[Lane] == VL[i]) {
- FoundLane = Lane;
- break;
- }
- }
- assert(FoundLane >= 0 && "Could not find the correct lane");
- if (!E->ReuseShuffleIndices.empty()) {
- FoundLane =
- std::distance(E->ReuseShuffleIndices.begin(),
- llvm::find(E->ReuseShuffleIndices, FoundLane));
- }
- ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
+Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
+ Value *Val0 =
+ isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
+ FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
+ Value *Vec = PoisonValue::get(VecTy);
+ unsigned InsIndex = 0;
+ for (Value *Val : VL) {
+ Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++));
+ auto *InsElt = dyn_cast<InsertElementInst>(Vec);
+ if (!InsElt)
+ continue;
+ GatherSeq.insert(InsElt);
+ CSEBlocks.insert(InsElt->getParent());
+ // Add to our 'need-to-extract' list.
+ if (TreeEntry *Entry = getTreeEntry(Val)) {
+ // Find which lane we need to extract.
+ unsigned FoundLane = std::distance(Entry->Scalars.begin(),
+ find(Entry->Scalars, Val));
+ assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane");
+ if (!Entry->ReuseShuffleIndices.empty()) {
+ FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(),
+ find(Entry->ReuseShuffleIndices, FoundLane));
}
+ ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane));
}
}
@@ -4076,8 +4299,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
for (int Idx : E->ReuseShuffleIndices)
if (UsedIdxs.insert(Idx).second)
UniqueIdxs.emplace_back(Idx);
- V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
- UniqueIdxs);
+ V = Builder.CreateShuffleVector(V, UniqueIdxs);
}
}
return V;
@@ -4085,10 +4307,6 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
}
}
- Type *ScalarTy = S.OpValue->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
- ScalarTy = SI->getValueOperand()->getType();
-
// Check that every instruction appears once in this bundle.
SmallVector<int, 4> ReuseShuffleIndicies;
SmallVector<Value *, 4> UniqueValues;
@@ -4108,27 +4326,16 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
else
VL = UniqueValues;
}
- auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
- Value *V = Gather(VL, VecTy);
+ Value *Vec = gather(VL);
if (!ReuseShuffleIndicies.empty()) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- ReuseShuffleIndicies, "shuffle");
- if (auto *I = dyn_cast<Instruction>(V)) {
+ Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
- return V;
-}
-
-static void inversePermutation(ArrayRef<unsigned> Indices,
- SmallVectorImpl<int> &Mask) {
- Mask.clear();
- const unsigned E = Indices.size();
- Mask.resize(E);
- for (unsigned I = 0; I < E; ++I)
- Mask[Indices[I]] = I;
+ return Vec;
}
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
@@ -4139,32 +4346,31 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return E->VectorizedValue;
}
- Instruction *VL0 = E->getMainOp();
- Type *ScalarTy = VL0->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
- ScalarTy = SI->getValueOperand()->getType();
- auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
-
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
-
if (E->State == TreeEntry::NeedToGather) {
setInsertPointAfterBundle(E);
- auto *V = Gather(E->Scalars, VecTy);
+ Value *Vec = gather(E->Scalars);
if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- if (auto *I = dyn_cast<Instruction>(V)) {
+ Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle");
+ if (auto *I = dyn_cast<Instruction>(Vec)) {
GatherSeq.insert(I);
CSEBlocks.insert(I->getParent());
}
}
- E->VectorizedValue = V;
- return V;
+ E->VectorizedValue = Vec;
+ return Vec;
}
- assert(E->State == TreeEntry::Vectorize && "Unhandled state");
+ assert((E->State == TreeEntry::Vectorize ||
+ E->State == TreeEntry::ScatterVectorize) &&
+ "Unhandled state");
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+ Instruction *VL0 = E->getMainOp();
+ Type *ScalarTy = VL0->getType();
+ if (auto *Store = dyn_cast<StoreInst>(VL0))
+ ScalarTy = Store->getValueOperand()->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
switch (ShuffleOrOp) {
case Instruction::PHI: {
auto *PH = cast<PHINode>(VL0);
@@ -4172,10 +4378,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
Value *V = NewPhi;
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
// PHINodes may have multiple entries from the same block. We want to
@@ -4208,37 +4413,33 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
Builder.SetInsertPoint(VL0);
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
- "reorder_shuffle");
+ V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");
}
if (NeedToShuffleReuses) {
// TODO: Merge this shuffle with the ReorderShuffleMask.
if (E->ReorderIndices.empty())
Builder.SetInsertPoint(VL0);
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
- LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
+ auto *LI = cast<LoadInst>(E->getSingleOperand(0));
Builder.SetInsertPoint(LI);
- PointerType *PtrTy =
- PointerType::get(VecTy, LI->getPointerAddressSpace());
+ auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
Value *NewV = propagateMetadata(V, E->Scalars);
if (!E->ReorderIndices.empty()) {
SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
- NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
- "reorder_shuffle");
+ NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle");
}
if (NeedToShuffleReuses) {
// TODO: Merge this shuffle with the ReorderShuffleMask.
- NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
+ NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices,
+ "shuffle");
}
E->VectorizedValue = NewV;
return NewV;
@@ -4266,10 +4467,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
auto *CI = cast<CastInst>(VL0);
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -4289,10 +4489,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -4310,10 +4509,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = Builder.CreateSelect(Cond, True, False);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -4334,10 +4532,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4378,10 +4575,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (auto *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4396,30 +4592,40 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E);
LoadInst *LI = cast<LoadInst>(VL0);
+ Instruction *NewLI;
unsigned AS = LI->getPointerAddressSpace();
+ Value *PO = LI->getPointerOperand();
+ if (E->State == TreeEntry::Vectorize) {
- Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
- VecTy->getPointerTo(AS));
+ Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
- // The pointer operand uses an in-tree scalar so we add the new BitCast to
- // ExternalUses list to make sure that an extract will be generated in the
- // future.
- Value *PO = LI->getPointerOperand();
- if (getTreeEntry(PO))
- ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
+      // The pointer operand uses an in-tree scalar, so we add the new BitCast
+      // to the ExternalUses list to make sure that an extract will be
+      // generated in the future.
+ if (getTreeEntry(PO))
+ ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
+
+ NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
+ } else {
+ assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
+ Value *VecPtr = vectorizeTree(E->getOperand(0));
+ // Use the minimum alignment of the gathered loads.
+ Align CommonAlignment = LI->getAlign();
+ for (Value *V : E->Scalars)
+ CommonAlignment =
+ commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
+ NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment);
+ }
+ Value *V = propagateMetadata(NewLI, E->Scalars);
- LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
- Value *V = propagateMetadata(LI, E->Scalars);
if (IsReorder) {
SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
- V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
- Mask, "reorder_shuffle");
+ V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");
}
if (NeedToShuffleReuses) {
// TODO: Merge this shuffle with the ReorderShuffleMask.
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
}
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4437,9 +4643,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (IsReorder) {
SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
E->ReorderIndices.end());
- VecValue = Builder.CreateShuffleVector(
- VecValue, UndefValue::get(VecValue->getType()), Mask,
- "reorder_shuffle");
+ VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf");
}
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(
@@ -4454,10 +4658,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
Value *V = propagateMetadata(ST, E->Scalars);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -4494,10 +4697,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4537,9 +4739,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Function *CF;
if (!UseIntrinsic) {
- VFShape Shape = VFShape::get(
- *CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
- false /*HasGlobalPred*/);
+ VFShape Shape =
+ VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
+ VecTy->getNumElements())),
+ false /*HasGlobalPred*/);
CF = VFDatabase(*CI).getVectorizedFunction(Shape);
} else {
Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
@@ -4557,10 +4760,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
propagateIRFlags(V, E->Scalars, VL0);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
return V;
@@ -4625,10 +4827,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
- if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
+ if (NeedToShuffleReuses)
+ V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
+
E->VectorizedValue = V;
++NumVectorInstructions;
@@ -4693,7 +4894,8 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
continue;
TreeEntry *E = getTreeEntry(Scalar);
assert(E && "Invalid scalar");
- assert(E->State == TreeEntry::Vectorize && "Extracting from a gather list");
+ assert(E->State != TreeEntry::NeedToGather &&
+ "Extracting from a gather list");
Value *Vec = E->VectorizedValue;
assert(Vec && "Can't find vectorizable value");
@@ -4851,7 +5053,8 @@ void BoUpSLP::optimizeGatherSequence() {
// instructions into different buckets based on the insert lane.
SmallVector<Instruction *, 16> Visited;
for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
- assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
+ assert(*I &&
+ (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
"Worklist not sorted properly!");
BasicBlock *BB = (*I)->getBlock();
// For all instructions in blocks containing gather sequences:
@@ -4961,8 +5164,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// cancelScheduling).
while (!Bundle->isReady() && !ReadyInsts.empty()) {
- ScheduleData *pickedSD = ReadyInsts.back();
- ReadyInsts.pop_back();
+ ScheduleData *pickedSD = ReadyInsts.pop_back_val();
if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
schedule(pickedSD, ReadyInsts);
@@ -5106,7 +5308,9 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
if (I->mayReadOrWriteMemory() &&
(!isa<IntrinsicInst>(I) ||
- cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
+ (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
+ cast<IntrinsicInst>(I)->getIntrinsicID() !=
+ Intrinsic::pseudoprobe))) {
// Update the linked list of memory accessing instructions.
if (CurrentLoadStore) {
CurrentLoadStore->NextLoadStore = SD;
@@ -5133,8 +5337,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
WorkList.push_back(SD);
while (!WorkList.empty()) {
- ScheduleData *SD = WorkList.back();
- WorkList.pop_back();
+ ScheduleData *SD = WorkList.pop_back_val();
ScheduleData *BundleMember = SD;
while (BundleMember) {
@@ -5331,10 +5534,15 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
- // If V is a store, just return the width of the stored value without
- // traversing the expression tree. This is the common case.
- if (auto *Store = dyn_cast<StoreInst>(V))
- return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+ // If V is a store, just return the width of the stored value (or value
+ // truncated just before storing) without traversing the expression tree.
+ // This is the common case.
+ if (auto *Store = dyn_cast<StoreInst>(V)) {
+ if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
+ return DL->getTypeSizeInBits(Trunc->getSrcTy());
+ else
+ return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+ }
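For example (hypothetical IR, not from this patch): for a store of "trunc i32 %x to i16", the function reports 32, the width of the value before it was truncated, rather than the 16-bit stored type, so the vectorization factor is derived from the width of the computation feeding the store.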
auto E = InstrElementSize.find(V);
if (E != InstrElementSize.end())
@@ -5683,7 +5891,7 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
TargetTransformInfo *TTI_,
- TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
+ TargetLibraryInfo *TLI_, AAResults *AA_,
LoopInfo *LI_, DominatorTree *DT_,
AssumptionCache *AC_, DemandedBits *DB_,
OptimizationRemarkEmitter *ORE_) {
@@ -5783,11 +5991,11 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
R.computeMinimumValueSizes();
- int Cost = R.getTreeCost();
+ InstructionCost Cost = R.getTreeCost();
- LLVM_DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+  LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF = " << VF << "\n");
if (Cost < -SLPCostThreshold) {
- LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
using namespace ore;
@@ -5860,7 +6068,7 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
// If a vector register can't hold 1 element, we are done.
unsigned MaxVecRegSize = R.getMaxVecRegSize();
- unsigned EltSize = R.getVectorElementSize(Stores[0]);
+ unsigned EltSize = R.getVectorElementSize(Operands[0]);
if (MaxVecRegSize % EltSize != 0)
continue;
@@ -5911,7 +6119,7 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
continue;
if (!isValidElementType(SI->getValueOperand()->getType()))
continue;
- Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
+ Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
}
// Ignore getelementptr instructions that have more than one index, a
@@ -5975,6 +6183,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
unsigned Sz = R.getVectorElementSize(I0);
unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
if (MaxVF < 2) {
R.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
@@ -5986,7 +6195,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
bool Changed = false;
bool CandidateFound = false;
- int MinCost = SLPCostThreshold;
+ InstructionCost MinCost = SLPCostThreshold.getValue();
bool CompensateUseCost =
!InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
@@ -6042,7 +6251,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
continue;
R.computeMinimumValueSizes();
- int Cost = R.getTreeCost();
+ InstructionCost Cost = R.getTreeCost();
CandidateFound = true;
if (CompensateUseCost) {
// TODO: Use TTI's getScalarizationOverhead for sequence of inserts
@@ -6052,7 +6261,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// part should also switch to same interface.
// For example, the following case is projected code after SLP:
// %4 = extractelement <4 x i64> %3, i32 0
- // %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
+ // %v0 = insertelement <4 x i64> poison, i64 %4, i32 0
// %5 = extractelement <4 x i64> %3, i32 1
// %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
// %6 = extractelement <4 x i64> %3, i32 2
@@ -6072,7 +6281,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
// Switching to the TTI interface might help a bit.
// Alternative solution could be pattern-match to detect a no-op or
// shuffle.
- unsigned UserCost = 0;
+ InstructionCost UserCost = 0;
for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
@@ -6163,50 +6372,20 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
return false;
}
-/// Generate a shuffle mask to be used in a reduction tree.
-///
-/// \param VecLen The length of the vector to be reduced.
-/// \param NumEltsToRdx The number of elements that should be reduced in the
-/// vector.
-/// \param IsPairwise Whether the reduction is a pairwise or splitting
-/// reduction. A pairwise reduction will generate a mask of
-/// <0,2,...> or <1,3,..> while a splitting reduction will generate
-/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
-/// \param IsLeft True will generate a mask of even elements, odd otherwise.
-static SmallVector<int, 32> createRdxShuffleMask(unsigned VecLen,
- unsigned NumEltsToRdx,
- bool IsPairwise, bool IsLeft) {
- assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
-
- SmallVector<int, 32> ShuffleMask(VecLen, -1);
-
- if (IsPairwise)
- // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
- for (unsigned i = 0; i != NumEltsToRdx; ++i)
- ShuffleMask[i] = 2 * i + !IsLeft;
- else
- // Move the upper half of the vector to the lower half.
- for (unsigned i = 0; i != NumEltsToRdx; ++i)
- ShuffleMask[i] = NumEltsToRdx + i;
-
- return ShuffleMask;
-}
-
namespace {
/// Model horizontal reductions.
///
-/// A horizontal reduction is a tree of reduction operations (currently add and
-/// fadd) that has operations that can be put into a vector as its leaf.
-/// For example, this tree:
+/// A horizontal reduction is a tree of reduction instructions that has values
+/// that can be put into a vector as its leaves. For example:
///
/// mul mul mul mul
/// \ / \ /
/// + +
/// \ /
/// +
-/// This tree has "mul" as its reduced values and "+" as its reduction
-/// operations. A reduction might be feeding into a store or a binary operation
+/// This tree has "mul" as its leaf values and "+" as its reduction
+/// instructions. A reduction can feed into a store or a binary operation
/// feeding a phi.
/// ...
/// \ /
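As a standalone illustration (plain C++, not part of this change), the scalar shape this class models looks like the following: the multiplies are the leaf values that can be packed into one vector, and the adds form the reduction tree rooted at the final '+'.

    // Illustrative sketch only; any kernel with this shape is a candidate.
    int dot4(const int *a, const int *b) {
      int t0 = a[0] * b[0];
      int t1 = a[1] * b[1];
      int t2 = a[2] * b[2];
      int t3 = a[3] * b[3];
      return (t0 + t1) + (t2 + t3); // the three adds are the reduction ops
    }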
@@ -6224,333 +6403,30 @@ namespace {
class HorizontalReduction {
using ReductionOpsType = SmallVector<Value *, 16>;
using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
- ReductionOpsListType ReductionOps;
+ ReductionOpsListType ReductionOps;
SmallVector<Value *, 32> ReducedVals;
// Use map vector to make stable output.
MapVector<Instruction *, Value *> ExtraArgs;
-
- /// Kind of the reduction data.
- enum ReductionKind {
- RK_None, /// Not a reduction.
- RK_Arithmetic, /// Binary reduction data.
- RK_Min, /// Minimum reduction data.
- RK_UMin, /// Unsigned minimum reduction data.
- RK_Max, /// Maximum reduction data.
- RK_UMax, /// Unsigned maximum reduction data.
- };
-
- /// Contains info about operation, like its opcode, left and right operands.
- class OperationData {
- /// Opcode of the instruction.
- unsigned Opcode = 0;
-
- /// Left operand of the reduction operation.
- Value *LHS = nullptr;
-
- /// Right operand of the reduction operation.
- Value *RHS = nullptr;
-
- /// Kind of the reduction operation.
- ReductionKind Kind = RK_None;
-
- /// True if float point min/max reduction has no NaNs.
- bool NoNaN = false;
-
- /// Checks if the reduction operation can be vectorized.
- bool isVectorizable() const {
- return LHS && RHS &&
- // We currently only support add/mul/logical && min/max reductions.
- ((Kind == RK_Arithmetic &&
- (Opcode == Instruction::Add || Opcode == Instruction::FAdd ||
- Opcode == Instruction::Mul || Opcode == Instruction::FMul ||
- Opcode == Instruction::And || Opcode == Instruction::Or ||
- Opcode == Instruction::Xor)) ||
- ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
- (Kind == RK_Min || Kind == RK_Max)) ||
- (Opcode == Instruction::ICmp &&
- (Kind == RK_UMin || Kind == RK_UMax)));
- }
-
- /// Creates reduction operation with the current opcode.
- Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
- assert(isVectorizable() &&
- "Expected add|fadd or min/max reduction operation.");
- Value *Cmp = nullptr;
- switch (Kind) {
- case RK_Arithmetic:
- return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
- Name);
- case RK_Min:
- Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
- : Builder.CreateFCmpOLT(LHS, RHS);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- case RK_Max:
- Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
- : Builder.CreateFCmpOGT(LHS, RHS);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- case RK_UMin:
- assert(Opcode == Instruction::ICmp && "Expected integer types.");
- Cmp = Builder.CreateICmpULT(LHS, RHS);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- case RK_UMax:
- assert(Opcode == Instruction::ICmp && "Expected integer types.");
- Cmp = Builder.CreateICmpUGT(LHS, RHS);
- return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- case RK_None:
- break;
- }
- llvm_unreachable("Unknown reduction operation.");
- }
-
- public:
- explicit OperationData() = default;
-
- /// Construction for reduced values. They are identified by opcode only and
- /// don't have associated LHS/RHS values.
- explicit OperationData(Value *V) {
- if (auto *I = dyn_cast<Instruction>(V))
- Opcode = I->getOpcode();
- }
-
- /// Constructor for reduction operations with opcode and its left and
- /// right operands.
- OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
- bool NoNaN = false)
- : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
- assert(Kind != RK_None && "One of the reduction operations is expected.");
- }
-
- explicit operator bool() const { return Opcode; }
-
- /// Return true if this operation is any kind of minimum or maximum.
- bool isMinMax() const {
- switch (Kind) {
- case RK_Arithmetic:
- return false;
- case RK_Min:
- case RK_Max:
- case RK_UMin:
- case RK_UMax:
- return true;
- case RK_None:
- break;
- }
- llvm_unreachable("Reduction kind is not set");
- }
-
- /// Get the index of the first operand.
- unsigned getFirstOperandIndex() const {
- assert(!!*this && "The opcode is not set.");
- // We allow calling this before 'Kind' is set, so handle that specially.
- if (Kind == RK_None)
- return 0;
- return isMinMax() ? 1 : 0;
- }
-
- /// Total number of operands in the reduction operation.
- unsigned getNumberOfOperands() const {
- assert(Kind != RK_None && !!*this && LHS && RHS &&
- "Expected reduction operation.");
- return isMinMax() ? 3 : 2;
- }
-
- /// Checks if the operation has the same parent as \p P.
- bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const {
- assert(Kind != RK_None && !!*this && LHS && RHS &&
- "Expected reduction operation.");
- if (!IsRedOp)
- return I->getParent() == P;
- if (isMinMax()) {
- // SelectInst must be used twice while the condition op must have single
- // use only.
- auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
- return I->getParent() == P && Cmp && Cmp->getParent() == P;
- }
- // Arithmetic reduction operation must be used once only.
- return I->getParent() == P;
- }
-
- /// Expected number of uses for reduction operations/reduced values.
- bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
- assert(Kind != RK_None && !!*this && LHS && RHS &&
- "Expected reduction operation.");
- if (isMinMax())
- return I->hasNUses(2) &&
- (!IsReductionOp ||
- cast<SelectInst>(I)->getCondition()->hasOneUse());
- return I->hasOneUse();
- }
-
- /// Initializes the list of reduction operations.
- void initReductionOps(ReductionOpsListType &ReductionOps) {
- assert(Kind != RK_None && !!*this && LHS && RHS &&
- "Expected reduction operation.");
- if (isMinMax())
- ReductionOps.assign(2, ReductionOpsType());
- else
- ReductionOps.assign(1, ReductionOpsType());
- }
-
- /// Add all reduction operations for the reduction instruction \p I.
- void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
- assert(Kind != RK_None && !!*this && LHS && RHS &&
- "Expected reduction operation.");
- if (isMinMax()) {
- ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
- ReductionOps[1].emplace_back(I);
- } else {
- ReductionOps[0].emplace_back(I);
- }
- }
-
- /// Checks if instruction is associative and can be vectorized.
- bool isAssociative(Instruction *I) const {
- assert(Kind != RK_None && *this && LHS && RHS &&
- "Expected reduction operation.");
- switch (Kind) {
- case RK_Arithmetic:
- return I->isAssociative();
- case RK_Min:
- case RK_Max:
- return Opcode == Instruction::ICmp ||
- cast<Instruction>(I->getOperand(0))->isFast();
- case RK_UMin:
- case RK_UMax:
- assert(Opcode == Instruction::ICmp &&
- "Only integer compare operation is expected.");
- return true;
- case RK_None:
- break;
- }
- llvm_unreachable("Reduction kind is not set");
- }
-
- /// Checks if the reduction operation can be vectorized.
- bool isVectorizable(Instruction *I) const {
- return isVectorizable() && isAssociative(I);
- }
-
- /// Checks if two operation data are both a reduction op or both a reduced
- /// value.
- bool operator==(const OperationData &OD) const {
- assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
- "One of the comparing operations is incorrect.");
- return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
- }
- bool operator!=(const OperationData &OD) const { return !(*this == OD); }
- void clear() {
- Opcode = 0;
- LHS = nullptr;
- RHS = nullptr;
- Kind = RK_None;
- NoNaN = false;
- }
-
- /// Get the opcode of the reduction operation.
- unsigned getOpcode() const {
- assert(isVectorizable() && "Expected vectorizable operation.");
- return Opcode;
- }
-
- /// Get kind of reduction data.
- ReductionKind getKind() const { return Kind; }
- Value *getLHS() const { return LHS; }
- Value *getRHS() const { return RHS; }
- Type *getConditionType() const {
- return isMinMax() ? CmpInst::makeCmpResultType(LHS->getType()) : nullptr;
- }
-
- /// Creates reduction operation with the current opcode with the IR flags
- /// from \p ReductionOps.
- Value *createOp(IRBuilder<> &Builder, const Twine &Name,
- const ReductionOpsListType &ReductionOps) const {
- assert(isVectorizable() &&
- "Expected add|fadd or min/max reduction operation.");
- auto *Op = createOp(Builder, Name);
- switch (Kind) {
- case RK_Arithmetic:
- propagateIRFlags(Op, ReductionOps[0]);
- return Op;
- case RK_Min:
- case RK_Max:
- case RK_UMin:
- case RK_UMax:
- if (auto *SI = dyn_cast<SelectInst>(Op))
- propagateIRFlags(SI->getCondition(), ReductionOps[0]);
- propagateIRFlags(Op, ReductionOps[1]);
- return Op;
- case RK_None:
- break;
- }
- llvm_unreachable("Unknown reduction operation.");
- }
- /// Creates reduction operation with the current opcode with the IR flags
- /// from \p I.
- Value *createOp(IRBuilder<> &Builder, const Twine &Name,
- Instruction *I) const {
- assert(isVectorizable() &&
- "Expected add|fadd or min/max reduction operation.");
- auto *Op = createOp(Builder, Name);
- switch (Kind) {
- case RK_Arithmetic:
- propagateIRFlags(Op, I);
- return Op;
- case RK_Min:
- case RK_Max:
- case RK_UMin:
- case RK_UMax:
- if (auto *SI = dyn_cast<SelectInst>(Op)) {
- propagateIRFlags(SI->getCondition(),
- cast<SelectInst>(I)->getCondition());
- }
- propagateIRFlags(Op, I);
- return Op;
- case RK_None:
- break;
- }
- llvm_unreachable("Unknown reduction operation.");
- }
-
- TargetTransformInfo::ReductionFlags getFlags() const {
- TargetTransformInfo::ReductionFlags Flags;
- Flags.NoNaN = NoNaN;
- switch (Kind) {
- case RK_Arithmetic:
- break;
- case RK_Min:
- Flags.IsSigned = Opcode == Instruction::ICmp;
- Flags.IsMaxOp = false;
- break;
- case RK_Max:
- Flags.IsSigned = Opcode == Instruction::ICmp;
- Flags.IsMaxOp = true;
- break;
- case RK_UMin:
- Flags.IsSigned = false;
- Flags.IsMaxOp = false;
- break;
- case RK_UMax:
- Flags.IsSigned = false;
- Flags.IsMaxOp = true;
- break;
- case RK_None:
- llvm_unreachable("Reduction kind is not set");
- }
- return Flags;
- }
- };
-
WeakTrackingVH ReductionRoot;
+ /// The type of reduction operation.
+ RecurKind RdxKind;
- /// The operation data of the reduction operation.
- OperationData ReductionData;
+ /// Checks if instruction is associative and can be vectorized.
+ static bool isVectorizable(RecurKind Kind, Instruction *I) {
+ if (Kind == RecurKind::None)
+ return false;
+ if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
+ return true;
- /// The operation data of the values we perform a reduction on.
- OperationData ReducedValueData;
+ if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
+ // FP min/max are associative except for NaN and -0.0. We do not
+ // have to rule out -0.0 here because the intrinsic semantics do not
+ // specify a fixed result for it.
+ return I->getFastMathFlags().noNaNs();
+ }
- /// Should we model this reduction as a pairwise reduction tree or a tree that
- /// splits the vector in halves and adds those halves.
- bool IsPairwiseReduction = false;
+ return I->isAssociative();
+ }
/// Checks if the ParentStackElem.first should be marked as a reduction
/// operation with an extra argument or as extra argument itself.
@@ -6564,7 +6440,8 @@ class HorizontalReduction {
// in this case.
// Do not perform analysis of remaining operands of ParentStackElem.first
// instruction, this whole instruction is an extra argument.
- ParentStackElem.second = ParentStackElem.first->getNumOperands();
+ RecurKind ParentRdxKind = getRdxKind(ParentStackElem.first);
+ ParentStackElem.second = getNumberOfOperands(ParentRdxKind);
} else {
// We ran into something like:
// ParentStackElem.first += ... + ExtraArg + ...
@@ -6572,110 +6449,238 @@ class HorizontalReduction {
}
}
- static OperationData getOperationData(Value *V) {
- if (!V)
- return OperationData();
-
- Value *LHS;
- Value *RHS;
- if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
- return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
- RK_Arithmetic);
- }
- if (auto *Select = dyn_cast<SelectInst>(V)) {
- // Look for a min/max pattern.
- if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
- return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
- } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
- return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
- } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
- m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
- return OperationData(
- Instruction::FCmp, LHS, RHS, RK_Min,
- cast<Instruction>(Select->getCondition())->hasNoNaNs());
- } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
- return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
- } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
- return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
- } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
- m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
- return OperationData(
- Instruction::FCmp, LHS, RHS, RK_Max,
- cast<Instruction>(Select->getCondition())->hasNoNaNs());
- } else {
- // Try harder: look for min/max pattern based on instructions producing
- // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
- // During the intermediate stages of SLP, it's very common to have
- // pattern like this (since optimizeGatherSequence is run only once
- // at the end):
- // %1 = extractelement <2 x i32> %a, i32 0
- // %2 = extractelement <2 x i32> %a, i32 1
- // %cond = icmp sgt i32 %1, %2
- // %3 = extractelement <2 x i32> %a, i32 0
- // %4 = extractelement <2 x i32> %a, i32 1
- // %select = select i1 %cond, i32 %3, i32 %4
- CmpInst::Predicate Pred;
- Instruction *L1;
- Instruction *L2;
-
- LHS = Select->getTrueValue();
- RHS = Select->getFalseValue();
- Value *Cond = Select->getCondition();
-
- // TODO: Support inverse predicates.
- if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
- if (!isa<ExtractElementInst>(RHS) ||
- !L2->isIdenticalTo(cast<Instruction>(RHS)))
- return OperationData(V);
- } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
- if (!isa<ExtractElementInst>(LHS) ||
- !L1->isIdenticalTo(cast<Instruction>(LHS)))
- return OperationData(V);
- } else {
- if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
- return OperationData(V);
- if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
- !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
- !L2->isIdenticalTo(cast<Instruction>(RHS)))
- return OperationData(V);
- }
- switch (Pred) {
- default:
- return OperationData(V);
-
- case CmpInst::ICMP_ULT:
- case CmpInst::ICMP_ULE:
- return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
-
- case CmpInst::ICMP_SLT:
- case CmpInst::ICMP_SLE:
- return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
-
- case CmpInst::FCMP_OLT:
- case CmpInst::FCMP_OLE:
- case CmpInst::FCMP_ULT:
- case CmpInst::FCMP_ULE:
- return OperationData(Instruction::FCmp, LHS, RHS, RK_Min,
- cast<Instruction>(Cond)->hasNoNaNs());
-
- case CmpInst::ICMP_UGT:
- case CmpInst::ICMP_UGE:
- return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
-
- case CmpInst::ICMP_SGT:
- case CmpInst::ICMP_SGE:
- return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
-
- case CmpInst::FCMP_OGT:
- case CmpInst::FCMP_OGE:
- case CmpInst::FCMP_UGT:
- case CmpInst::FCMP_UGE:
- return OperationData(Instruction::FCmp, LHS, RHS, RK_Max,
- cast<Instruction>(Cond)->hasNoNaNs());
- }
+ /// Creates reduction operation with the current opcode.
+ static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
+ Value *RHS, const Twine &Name) {
+ unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
+ switch (Kind) {
+ case RecurKind::Add:
+ case RecurKind::Mul:
+ case RecurKind::Or:
+ case RecurKind::And:
+ case RecurKind::Xor:
+ case RecurKind::FAdd:
+ case RecurKind::FMul:
+ return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
+ Name);
+ case RecurKind::FMax:
+ return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
+ case RecurKind::FMin:
+ return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
+
+ case RecurKind::SMax: {
+ Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ }
+ case RecurKind::SMin: {
+ Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ }
+ case RecurKind::UMax: {
+ Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ }
+ case RecurKind::UMin: {
+ Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
+ return Builder.CreateSelect(Cmp, LHS, RHS, Name);
+ }
+ default:
+ llvm_unreachable("Unknown reduction operation.");
+ }
+ }
+
+ /// Creates reduction operation with the current opcode with the IR flags
+ /// from \p ReductionOps.
+ static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
+ Value *RHS, const Twine &Name,
+ const ReductionOpsListType &ReductionOps) {
+ Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name);
+ if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
+ if (auto *Sel = dyn_cast<SelectInst>(Op))
+ propagateIRFlags(Sel->getCondition(), ReductionOps[0]);
+ propagateIRFlags(Op, ReductionOps[1]);
+ return Op;
+ }
+ propagateIRFlags(Op, ReductionOps[0]);
+ return Op;
+ }
+ /// Creates reduction operation with the current opcode with the IR flags
+ /// from \p I.
+ static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
+ Value *RHS, const Twine &Name, Instruction *I) {
+ Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name);
+ if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
+ if (auto *Sel = dyn_cast<SelectInst>(Op)) {
+ propagateIRFlags(Sel->getCondition(),
+ cast<SelectInst>(I)->getCondition());
}
}
- return OperationData(V);
+ propagateIRFlags(Op, I);
+ return Op;
+ }
+
+ static RecurKind getRdxKind(Instruction *I) {
+ assert(I && "Expected instruction for reduction matching");
+ TargetTransformInfo::ReductionFlags RdxFlags;
+ if (match(I, m_Add(m_Value(), m_Value())))
+ return RecurKind::Add;
+ if (match(I, m_Mul(m_Value(), m_Value())))
+ return RecurKind::Mul;
+ if (match(I, m_And(m_Value(), m_Value())))
+ return RecurKind::And;
+ if (match(I, m_Or(m_Value(), m_Value())))
+ return RecurKind::Or;
+ if (match(I, m_Xor(m_Value(), m_Value())))
+ return RecurKind::Xor;
+ if (match(I, m_FAdd(m_Value(), m_Value())))
+ return RecurKind::FAdd;
+ if (match(I, m_FMul(m_Value(), m_Value())))
+ return RecurKind::FMul;
+
+ if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
+ return RecurKind::FMax;
+ if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
+ return RecurKind::FMin;
+
+ if (match(I, m_SMax(m_Value(), m_Value())))
+ return RecurKind::SMax;
+ if (match(I, m_SMin(m_Value(), m_Value())))
+ return RecurKind::SMin;
+ if (match(I, m_UMax(m_Value(), m_Value())))
+ return RecurKind::UMax;
+ if (match(I, m_UMin(m_Value(), m_Value())))
+ return RecurKind::UMin;
+
+ if (auto *Select = dyn_cast<SelectInst>(I)) {
+ // Try harder: look for min/max pattern based on instructions producing
+ // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
+      // During the intermediate stages of SLP, it's very common to have a
+      // pattern like this (since optimizeGatherSequence is run only once
+      // at the end):
+ // %1 = extractelement <2 x i32> %a, i32 0
+ // %2 = extractelement <2 x i32> %a, i32 1
+ // %cond = icmp sgt i32 %1, %2
+ // %3 = extractelement <2 x i32> %a, i32 0
+ // %4 = extractelement <2 x i32> %a, i32 1
+ // %select = select i1 %cond, i32 %3, i32 %4
+ CmpInst::Predicate Pred;
+ Instruction *L1;
+ Instruction *L2;
+
+ Value *LHS = Select->getTrueValue();
+ Value *RHS = Select->getFalseValue();
+ Value *Cond = Select->getCondition();
+
+ // TODO: Support inverse predicates.
+ if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
+ if (!isa<ExtractElementInst>(RHS) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return RecurKind::None;
+ } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
+ if (!isa<ExtractElementInst>(LHS) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)))
+ return RecurKind::None;
+ } else {
+ if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
+ return RecurKind::None;
+ if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
+ !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
+ !L2->isIdenticalTo(cast<Instruction>(RHS)))
+ return RecurKind::None;
+ }
+
+ TargetTransformInfo::ReductionFlags RdxFlags;
+ switch (Pred) {
+ default:
+ return RecurKind::None;
+ case CmpInst::ICMP_SGT:
+ case CmpInst::ICMP_SGE:
+ return RecurKind::SMax;
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SLE:
+ return RecurKind::SMin;
+ case CmpInst::ICMP_UGT:
+ case CmpInst::ICMP_UGE:
+ return RecurKind::UMax;
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_ULE:
+ return RecurKind::UMin;
+ }
+ }
+ return RecurKind::None;
+ }
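A hedged source-level example (not taken from the patch) of code that typically lowers to the cmp+select idiom classified above as RecurKind::SMax:

    // Minimal sketch: each ternary usually becomes an 'icmp sgt' feeding a
    // 'select', which is the m_SMax shape matched by getRdxKind().
    int max4(const int *a) {
      int m01 = a[0] > a[1] ? a[0] : a[1];
      int m23 = a[2] > a[3] ? a[2] : a[3];
      return m01 > m23 ? m01 : m23;
    }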
+
+ /// Return true if this operation is a cmp+select idiom.
+ static bool isCmpSel(RecurKind Kind) {
+ return RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind);
+ }
+
+ /// Get the index of the first operand.
+ static unsigned getFirstOperandIndex(RecurKind Kind) {
+ // We allow calling this before 'Kind' is set, so handle that specially.
+ if (Kind == RecurKind::None)
+ return 0;
+ return isCmpSel(Kind) ? 1 : 0;
+ }
+
+ /// Total number of operands in the reduction operation.
+ static unsigned getNumberOfOperands(RecurKind Kind) {
+ return isCmpSel(Kind) ? 3 : 2;
+ }
+
+ /// Checks if the instruction is in basic block \p BB.
+ /// For a min/max reduction check that both compare and select are in \p BB.
+ static bool hasSameParent(RecurKind Kind, Instruction *I, BasicBlock *BB,
+ bool IsRedOp) {
+ if (IsRedOp && isCmpSel(Kind)) {
+ auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
+ return I->getParent() == BB && Cmp && Cmp->getParent() == BB;
+ }
+ return I->getParent() == BB;
+ }
+
+ /// Expected number of uses for reduction operations/reduced values.
+ static bool hasRequiredNumberOfUses(RecurKind Kind, Instruction *I,
+ bool IsReductionOp) {
+    // SelectInst must be used twice while the condition op must have a
+    // single use.
+ if (isCmpSel(Kind))
+ return I->hasNUses(2) &&
+ (!IsReductionOp ||
+ cast<SelectInst>(I)->getCondition()->hasOneUse());
+
+ // Arithmetic reduction operation must be used once only.
+ return I->hasOneUse();
+ }
+
+ /// Initializes the list of reduction operations.
+ void initReductionOps(RecurKind Kind) {
+ if (isCmpSel(Kind))
+ ReductionOps.assign(2, ReductionOpsType());
+ else
+ ReductionOps.assign(1, ReductionOpsType());
+ }
+
+ /// Add all reduction operations for the reduction instruction \p I.
+ void addReductionOps(RecurKind Kind, Instruction *I) {
+ assert(Kind != RecurKind::None && "Expected reduction operation.");
+ if (isCmpSel(Kind)) {
+ ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
+ ReductionOps[1].emplace_back(I);
+ } else {
+ ReductionOps[0].emplace_back(I);
+ }
+ }
+
+ static Value *getLHS(RecurKind Kind, Instruction *I) {
+ if (Kind == RecurKind::None)
+ return nullptr;
+ return I->getOperand(getFirstOperandIndex(Kind));
+ }
+ static Value *getRHS(RecurKind Kind, Instruction *I) {
+ if (Kind == RecurKind::None)
+ return nullptr;
+ return I->getOperand(getFirstOperandIndex(Kind) + 1);
}
public:
@@ -6684,50 +6689,59 @@ public:
/// Try to find a reduction tree.
bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
- "Thi phi needs to use the binary operator");
+ "Phi needs to use the binary operator");
- ReductionData = getOperationData(B);
+ RdxKind = getRdxKind(B);
    // We could have an initial reduction that is not an add.
// r *= v1 + v2 + v3 + v4
// In such a case start looking for a tree rooted in the first '+'.
if (Phi) {
- if (ReductionData.getLHS() == Phi) {
+ if (getLHS(RdxKind, B) == Phi) {
Phi = nullptr;
- B = dyn_cast<Instruction>(ReductionData.getRHS());
- ReductionData = getOperationData(B);
- } else if (ReductionData.getRHS() == Phi) {
+ B = dyn_cast<Instruction>(getRHS(RdxKind, B));
+ if (!B)
+ return false;
+ RdxKind = getRdxKind(B);
+ } else if (getRHS(RdxKind, B) == Phi) {
Phi = nullptr;
- B = dyn_cast<Instruction>(ReductionData.getLHS());
- ReductionData = getOperationData(B);
+ B = dyn_cast<Instruction>(getLHS(RdxKind, B));
+ if (!B)
+ return false;
+ RdxKind = getRdxKind(B);
}
}
- if (!ReductionData.isVectorizable(B))
+ if (!isVectorizable(RdxKind, B))
return false;
+ // Analyze "regular" integer/FP types for reductions - no target-specific
+ // types or pointers.
Type *Ty = B->getType();
- if (!isValidElementType(Ty))
- return false;
- if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy())
+ if (!isValidElementType(Ty) || Ty->isPointerTy())
return false;
- ReducedValueData.clear();
ReductionRoot = B;
+ // The opcode for leaf values that we perform a reduction on.
+ // For example: load(x) + load(y) + load(z) + fptoui(w)
+ // The leaf opcode for 'w' does not match, so we don't include it as a
+ // potential candidate for the reduction.
+ unsigned LeafOpcode = 0;
+
// Post order traverse the reduction tree starting at B. We only handle true
// trees containing only binary operators.
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
- Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
- ReductionData.initReductionOps(ReductionOps);
+ Stack.push_back(std::make_pair(B, getFirstOperandIndex(RdxKind)));
+ initReductionOps(RdxKind);
while (!Stack.empty()) {
Instruction *TreeN = Stack.back().first;
- unsigned EdgeToVist = Stack.back().second++;
- OperationData OpData = getOperationData(TreeN);
- bool IsReducedValue = OpData != ReductionData;
+ unsigned EdgeToVisit = Stack.back().second++;
+ const RecurKind TreeRdxKind = getRdxKind(TreeN);
+ bool IsReducedValue = TreeRdxKind != RdxKind;
- // Postorder vist.
- if (IsReducedValue || EdgeToVist == OpData.getNumberOfOperands()) {
+ // Postorder visit.
+ if (IsReducedValue || EdgeToVisit == getNumberOfOperands(TreeRdxKind)) {
if (IsReducedValue)
ReducedVals.push_back(TreeN);
else {
@@ -6745,7 +6759,7 @@ public:
markExtraArg(Stack[Stack.size() - 2], TreeN);
ExtraArgs.erase(TreeN);
} else
- ReductionData.addReductionOps(TreeN, ReductionOps);
+ addReductionOps(RdxKind, TreeN);
}
// Retract.
Stack.pop_back();
@@ -6753,91 +6767,72 @@ public:
}
// Visit left or right.
- Value *NextV = TreeN->getOperand(EdgeToVist);
- if (NextV != Phi) {
- auto *I = dyn_cast<Instruction>(NextV);
- OpData = getOperationData(I);
- // Continue analysis if the next operand is a reduction operation or
- // (possibly) a reduced value. If the reduced value opcode is not set,
- // the first met operation != reduction operation is considered as the
- // reduced value class.
- if (I && (!ReducedValueData || OpData == ReducedValueData ||
- OpData == ReductionData)) {
- const bool IsReductionOperation = OpData == ReductionData;
- // Only handle trees in the current basic block.
- if (!ReductionData.hasSameParent(I, B->getParent(),
- IsReductionOperation)) {
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), I);
- continue;
- }
-
- // Each tree node needs to have minimal number of users except for the
- // ultimate reduction.
- if (!ReductionData.hasRequiredNumberOfUses(I,
- OpData == ReductionData) &&
- I != B) {
+ Value *EdgeVal = TreeN->getOperand(EdgeToVisit);
+ auto *I = dyn_cast<Instruction>(EdgeVal);
+ if (!I) {
+ // Edge value is not a reduction instruction or a leaf instruction.
+ // (It may be a constant, function argument, or something else.)
+ markExtraArg(Stack.back(), EdgeVal);
+ continue;
+ }
+ RecurKind EdgeRdxKind = getRdxKind(I);
+      // Continue analysis if the next operand is a reduction operation or
+      // (possibly) a leaf value. If the leaf opcode is not set yet, the first
+      // operation encountered that differs from the reduction kind is taken
+      // as the leaf opcode.
+ // Only handle trees in the current basic block.
+ // Each tree node needs to have minimal number of users except for the
+ // ultimate reduction.
+ const bool IsRdxInst = EdgeRdxKind == RdxKind;
+ if (I != Phi && I != B &&
+ hasSameParent(RdxKind, I, B->getParent(), IsRdxInst) &&
+ hasRequiredNumberOfUses(RdxKind, I, IsRdxInst) &&
+ (!LeafOpcode || LeafOpcode == I->getOpcode() || IsRdxInst)) {
+ if (IsRdxInst) {
+ // We need to be able to reassociate the reduction operations.
+ if (!isVectorizable(EdgeRdxKind, I)) {
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
continue;
}
-
- if (IsReductionOperation) {
- // We need to be able to reassociate the reduction operations.
- if (!OpData.isAssociative(I)) {
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), I);
- continue;
- }
- } else if (ReducedValueData &&
- ReducedValueData != OpData) {
- // Make sure that the opcodes of the operations that we are going to
- // reduce match.
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), I);
- continue;
- } else if (!ReducedValueData)
- ReducedValueData = OpData;
-
- Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
- continue;
+ } else if (!LeafOpcode) {
+ LeafOpcode = I->getOpcode();
}
+ Stack.push_back(std::make_pair(I, getFirstOperandIndex(EdgeRdxKind)));
+ continue;
}
- // NextV is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), NextV);
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
}
return true;
}
- /// Attempt to vectorize the tree found by
- /// matchAssociativeReduction.
+ /// Attempt to vectorize the tree found by matchAssociativeReduction.
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
- if (ReducedVals.empty())
- return false;
-
- // If there is a sufficient number of reduction values, reduce
- // to a nearby power-of-2. Can safely generate oversized
+ // If there are a sufficient number of reduction values, reduce
+ // to a nearby power-of-2. We can safely generate oversized
// vectors and rely on the backend to split them to legal sizes.
unsigned NumReducedVals = ReducedVals.size();
if (NumReducedVals < 4)
return false;
- unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
-
- Value *VectorizedTree = nullptr;
+ // Intersect the fast-math-flags from all reduction operations.
+ FastMathFlags RdxFMF;
+ RdxFMF.set();
+ for (ReductionOpsType &RdxOp : ReductionOps) {
+ for (Value *RdxVal : RdxOp) {
+ if (auto *FPMO = dyn_cast<FPMathOperator>(RdxVal))
+ RdxFMF &= FPMO->getFastMathFlags();
+ }
+ }
- // FIXME: Fast-math-flags should be set based on the instructions in the
- // reduction (not all of 'fast' are required).
IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
- FastMathFlags Unsafe;
- Unsafe.setFast();
- Builder.setFastMathFlags(Unsafe);
- unsigned i = 0;
+ Builder.setFastMathFlags(RdxFMF);
BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
- // The same extra argument may be used several time, so log each attempt
+ // The same extra argument may be used several times, so log each attempt
// to use it.
- for (auto &Pair : ExtraArgs) {
+ for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
assert(Pair.first && "DebugLoc must be set.");
ExternallyUsedValues[Pair.second].push_back(Pair.first);
}
@@ -6857,14 +6852,48 @@ public:
// so set it as externally used to prevent it from being deleted.
ExternallyUsedValues[ReductionRoot];
SmallVector<Value *, 16> IgnoreList;
- for (auto &V : ReductionOps)
- IgnoreList.append(V.begin(), V.end());
+ for (ReductionOpsType &RdxOp : ReductionOps)
+ IgnoreList.append(RdxOp.begin(), RdxOp.end());
+
+ unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
+ if (NumReducedVals > ReduxWidth) {
+ // In the loop below, we are building a tree based on a window of
+ // 'ReduxWidth' values.
+ // If the operands of those values have common traits (compare predicate,
+ // constant operand, etc), then we want to group those together to
+ // minimize the cost of the reduction.
+
+ // TODO: This should be extended to count common operands for
+ // compares and binops.
+
+ // Step 1: Count the number of times each compare predicate occurs.
+ SmallDenseMap<unsigned, unsigned> PredCountMap;
+ for (Value *RdxVal : ReducedVals) {
+ CmpInst::Predicate Pred;
+ if (match(RdxVal, m_Cmp(Pred, m_Value(), m_Value())))
+ ++PredCountMap[Pred];
+ }
+ // Step 2: Sort the values so the most common predicates come first.
+ stable_sort(ReducedVals, [&PredCountMap](Value *A, Value *B) {
+ CmpInst::Predicate PredA, PredB;
+ if (match(A, m_Cmp(PredA, m_Value(), m_Value())) &&
+ match(B, m_Cmp(PredB, m_Value(), m_Value()))) {
+ return PredCountMap[PredA] > PredCountMap[PredB];
+ }
+ return false;
+ });
+ }
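The grouping step above is easier to see in isolation. A minimal standalone sketch of the same idea (ordinary C++ with a hypothetical helper name, not LLVM code): count how often each kind occurs in the window, then stable-sort so the most common kind comes first while the original order is kept within each kind.

    #include <algorithm>
    #include <map>
    #include <vector>

    // groupByFrequency({1, 2, 1, 3, 1, 2}) yields {1, 1, 1, 2, 2, 3}.
    std::vector<int> groupByFrequency(std::vector<int> Kinds) {
      std::map<int, unsigned> Count;
      for (int K : Kinds)
        ++Count[K];
      std::stable_sort(Kinds.begin(), Kinds.end(),
                       [&Count](int A, int B) { return Count[A] > Count[B]; });
      return Kinds;
    }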
+
+ Value *VectorizedTree = nullptr;
+ unsigned i = 0;
while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
- auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
+ ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
V.buildTree(VL, ExternallyUsedValues, IgnoreList);
Optional<ArrayRef<unsigned>> Order = V.bestOrder();
- // TODO: Handle orders of size less than number of elements in the vector.
- if (Order && Order->size() == VL.size()) {
+ if (Order) {
+ assert(Order->size() == VL.size() &&
+ "Order size must be the same as number of vectorized "
+ "instructions.");
// TODO: reorder tree nodes without tree rebuilding.
SmallVector<Value *, 4> ReorderedOps(VL.size());
llvm::transform(*Order, ReorderedOps.begin(),
@@ -6873,60 +6902,66 @@ public:
}
if (V.isTreeTinyAndNotFullyVectorizable())
break;
- if (V.isLoadCombineReductionCandidate(ReductionData.getOpcode()))
+ if (V.isLoadCombineReductionCandidate(RdxKind))
break;
V.computeMinimumValueSizes();
// Estimate cost.
- int TreeCost = V.getTreeCost();
- int ReductionCost = getReductionCost(TTI, ReducedVals[i], ReduxWidth);
- int Cost = TreeCost + ReductionCost;
+ InstructionCost TreeCost = V.getTreeCost();
+ InstructionCost ReductionCost =
+ getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+ InstructionCost Cost = TreeCost + ReductionCost;
+ if (!Cost.isValid()) {
+ LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
+ return false;
+ }
if (Cost >= -SLPCostThreshold) {
- V.getORE()->emit([&]() {
- return OptimizationRemarkMissed(
- SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
- << "Vectorizing horizontal reduction is possible"
- << "but not beneficial with cost "
- << ore::NV("Cost", Cost) << " and threshold "
- << ore::NV("Threshold", -SLPCostThreshold);
- });
- break;
+ V.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
+ cast<Instruction>(VL[0]))
+                 << "Vectorizing horizontal reduction is possible"
+                 << " but not beneficial with cost " << ore::NV("Cost", Cost)
+ << " and threshold "
+ << ore::NV("Threshold", -SLPCostThreshold);
+ });
+ break;
}
LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
<< Cost << ". (HorRdx)\n");
V.getORE()->emit([&]() {
- return OptimizationRemark(
- SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
- << "Vectorized horizontal reduction with cost "
- << ore::NV("Cost", Cost) << " and with tree size "
- << ore::NV("TreeSize", V.getTreeSize());
+ return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
+ cast<Instruction>(VL[0]))
+ << "Vectorized horizontal reduction with cost "
+ << ore::NV("Cost", Cost) << " and with tree size "
+ << ore::NV("TreeSize", V.getTreeSize());
});
// Vectorize a tree.
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
- // Emit a reduction. For min/max, the root is a select, but the insertion
+ // Emit a reduction. If the root is a select (min/max idiom), the insert
// point is the compare condition of that select.
Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
- if (ReductionData.isMinMax())
+ if (isCmpSel(RdxKind))
Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
else
Builder.SetInsertPoint(RdxRootInst);
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
- if (VectorizedTree) {
- Builder.SetCurrentDebugLocation(Loc);
- OperationData VectReductionData(ReductionData.getOpcode(),
- VectorizedTree, ReducedSubTree,
- ReductionData.getKind());
- VectorizedTree =
- VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
- } else
+
+ if (!VectorizedTree) {
+ // Initialize the final value in the reduction.
VectorizedTree = ReducedSubTree;
+ } else {
+ // Update the final value in the reduction.
+ Builder.SetCurrentDebugLocation(Loc);
+ VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+ ReducedSubTree, "op.rdx", ReductionOps);
+ }
i += ReduxWidth;
ReduxWidth = PowerOf2Floor(NumReducedVals - i);
}
@@ -6936,19 +6971,15 @@ public:
for (; i < NumReducedVals; ++i) {
auto *I = cast<Instruction>(ReducedVals[i]);
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- OperationData VectReductionData(ReductionData.getOpcode(),
- VectorizedTree, I,
- ReductionData.getKind());
- VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
+ VectorizedTree =
+ createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps);
}
for (auto &Pair : ExternallyUsedValues) {
// Add each externally used value to the final reduction.
for (auto *I : Pair.second) {
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- OperationData VectReductionData(ReductionData.getOpcode(),
- VectorizedTree, Pair.first,
- ReductionData.getKind());
- VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
+ VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
+ Pair.first, "op.extra", I);
}
}
@@ -6956,7 +6987,7 @@ public:
// select, we also have to RAUW for the compare instruction feeding the
// reduction root. That's because the original compare may have extra uses
// besides the final select of the reduction.
- if (ReductionData.isMinMax()) {
+ if (isCmpSel(RdxKind)) {
if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) {
Instruction *ScalarCmp =
getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot));
@@ -6972,77 +7003,68 @@ public:
return VectorizedTree != nullptr;
}
- unsigned numReductionValues() const {
- return ReducedVals.size();
- }
+ unsigned numReductionValues() const { return ReducedVals.size(); }
private:
/// Calculate the cost of a reduction.
- int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
- unsigned ReduxWidth) {
+ InstructionCost getReductionCost(TargetTransformInfo *TTI,
+ Value *FirstReducedVal,
+ unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
- auto *VecTy = FixedVectorType::get(ScalarTy, ReduxWidth);
-
- int PairwiseRdxCost;
- int SplittingRdxCost;
- switch (ReductionData.getKind()) {
- case RK_Arithmetic:
- PairwiseRdxCost =
- TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
- /*IsPairwiseForm=*/true);
- SplittingRdxCost =
- TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
- /*IsPairwiseForm=*/false);
- break;
- case RK_Min:
- case RK_Max:
- case RK_UMin:
- case RK_UMax: {
- auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy));
- bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
- ReductionData.getKind() == RK_UMax;
- PairwiseRdxCost =
- TTI->getMinMaxReductionCost(VecTy, VecCondTy,
- /*IsPairwiseForm=*/true, IsUnsigned);
- SplittingRdxCost =
- TTI->getMinMaxReductionCost(VecTy, VecCondTy,
- /*IsPairwiseForm=*/false, IsUnsigned);
+ FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
+ InstructionCost VectorCost, ScalarCost;
+ switch (RdxKind) {
+ case RecurKind::Add:
+ case RecurKind::Mul:
+ case RecurKind::Or:
+ case RecurKind::And:
+ case RecurKind::Xor:
+ case RecurKind::FAdd:
+ case RecurKind::FMul: {
+ unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
+ VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
+ /*IsPairwiseForm=*/false);
+ ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
break;
}
- case RK_None:
- llvm_unreachable("Expected arithmetic or min/max reduction operation");
- }
-
- IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
- int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
-
- int ScalarReduxCost = 0;
- switch (ReductionData.getKind()) {
- case RK_Arithmetic:
- ScalarReduxCost =
- TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
+ case RecurKind::FMax:
+ case RecurKind::FMin: {
+ auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+ VectorCost =
+ TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+ /*pairwise=*/false, /*unsigned=*/false);
+ ScalarCost =
+ TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
+ TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy));
break;
- case RK_Min:
- case RK_Max:
- case RK_UMin:
- case RK_UMax:
- ScalarReduxCost =
- TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
+ }
+ case RecurKind::SMax:
+ case RecurKind::SMin:
+ case RecurKind::UMax:
+ case RecurKind::UMin: {
+ auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+ bool IsUnsigned =
+ RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
+ VectorCost =
+ TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+ /*IsPairwiseForm=*/false, IsUnsigned);
+ ScalarCost =
+ TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
CmpInst::makeCmpResultType(ScalarTy));
break;
- case RK_None:
+ }
+ default:
llvm_unreachable("Expected arithmetic or min/max reduction operation");
}
- ScalarReduxCost *= (ReduxWidth - 1);
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+ // Scalar cost is repeated for N-1 elements.
+ ScalarCost *= (ReduxWidth - 1);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
<< " for reduction that starts with " << *FirstReducedVal
- << " (It is a "
- << (IsPairwiseReduction ? "pairwise" : "splitting")
- << " reduction)\n");
-
- return VecReduxCost - ScalarReduxCost;
+ << " (It is a splitting reduction)\n");
+ return VectorCost - ScalarCost;
}
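
The rewritten cost query comes down to comparing one splitting vector reduction against ReduxWidth - 1 scalar operations. A minimal sketch of that comparison, using invented costs rather than real TargetTransformInfo numbers (the helper name below is hypothetical):

    // Illustrative only: the costs below are made up, not real TTI queries.
    // For an 8-wide integer add reduction:
    //   VectorCost = cost of one vector reduce-add          (say 3)
    //   ScalarCost = cost of one scalar add * (8 - 1)       (say 1 * 7 = 7)
    //   Delta      = VectorCost - ScalarCost                = -4, profitable
    static int reductionCostDelta(int VectorReductionCost, int ScalarOpCost,
                                  unsigned ReduxWidth) {
      return VectorReductionCost - ScalarOpCost * int(ReduxWidth - 1);
    }
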
/// Emit a horizontal reduction of the vectorized value.
@@ -7052,92 +7074,142 @@ private:
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
- if (!IsPairwiseReduction) {
- // FIXME: The builder should use an FMF guard. It should not be hard-coded
- // to 'fast'.
- assert(Builder.getFastMathFlags().isFast() && "Expected 'fast' FMF");
- return createSimpleTargetReduction(
- Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
- ReductionData.getFlags(), ReductionOps.back());
- }
+ return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
+ ReductionOps.back());
+ }
+};
- Value *TmpVec = VectorizedValue;
- for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
- auto LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true);
- auto RightMask = createRdxShuffleMask(ReduxWidth, i, true, false);
+} // end anonymous namespace
+
+static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
+ if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
+ return cast<FixedVectorType>(IE->getType())->getNumElements();
- Value *LeftShuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
- Value *RightShuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
- "rdx.shuf.r");
- OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
- RightShuf, ReductionData.getKind());
- TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
+ unsigned AggregateSize = 1;
+ auto *IV = cast<InsertValueInst>(InsertInst);
+ Type *CurrentType = IV->getType();
+ do {
+ if (auto *ST = dyn_cast<StructType>(CurrentType)) {
+ for (auto *Elt : ST->elements())
+ if (Elt != ST->getElementType(0)) // check homogeneity
+ return None;
+ AggregateSize *= ST->getNumElements();
+ CurrentType = ST->getElementType(0);
+ } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
+ AggregateSize *= AT->getNumElements();
+ CurrentType = AT->getElementType();
+ } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
+ AggregateSize *= VT->getNumElements();
+ return AggregateSize;
+ } else if (CurrentType->isSingleValueType()) {
+ return AggregateSize;
+ } else {
+ return None;
}
+ } while (true);
+}
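
getAggregateSize flattens a nested, homogeneous aggregate into its total number of scalar leaves. A worked example of the traversal, on an illustrative type rather than one taken from a test case:

    // Type: [2 x {<2 x float>, <2 x float>}]
    //   ArrayType  [2 x ...]         -> AggregateSize = 1 * 2 = 2
    //   StructType {..., ...}        -> homogeneous, AggregateSize = 2 * 2 = 4
    //   FixedVectorType <2 x float>  -> AggregateSize = 4 * 2 = 8, returned
    // A mixed struct such as {float, <2 x float>} fails the homogeneity check
    // and the function returns None.
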
- // The result is in the first element of the vector.
- return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+static Optional<unsigned> getOperandIndex(Instruction *InsertInst,
+ unsigned OperandOffset) {
+ unsigned OperandIndex = OperandOffset;
+ if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
+ if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
+ auto *VT = cast<FixedVectorType>(IE->getType());
+ OperandIndex *= VT->getNumElements();
+ OperandIndex += CI->getZExtValue();
+ return OperandIndex;
+ }
+ return None;
}
-};
-} // end anonymous namespace
+ auto *IV = cast<InsertValueInst>(InsertInst);
+ Type *CurrentType = IV->getType();
+ for (unsigned int Index : IV->indices()) {
+ if (auto *ST = dyn_cast<StructType>(CurrentType)) {
+ OperandIndex *= ST->getNumElements();
+ CurrentType = ST->getElementType(Index);
+ } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
+ OperandIndex *= AT->getNumElements();
+ CurrentType = AT->getElementType();
+ } else {
+ return None;
+ }
+ OperandIndex += Index;
+ }
+ return OperandIndex;
+}
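
getOperandIndex maps the position of an insertelement/insertvalue onto a flat index within the aggregate, scaled by whatever offset the enclosing recursion has already accumulated. A worked example on hypothetical IR, reusing the {<2 x float>, <2 x float>} shape from above:

    // %agg1 = insertvalue {<2 x float>, <2 x float>} %agg0, <2 x float> %v, 1
    //   OperandOffset = 0, struct of 2 elements -> 0 * 2 + 1 = 1
    // %v1 = insertelement <2 x float> poison, float %s, i32 0
    //   OperandOffset = 1, vector of 2 elements -> 1 * 2 + 0 = 2
    // i.e. %s lands in slot 2 of the 4-element flattened build vector.
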
+
+static bool findBuildAggregate_rec(Instruction *LastInsertInst,
+ TargetTransformInfo *TTI,
+ SmallVectorImpl<Value *> &BuildVectorOpds,
+ SmallVectorImpl<Value *> &InsertElts,
+ unsigned OperandOffset) {
+ do {
+ Value *InsertedOperand = LastInsertInst->getOperand(1);
+ Optional<unsigned> OperandIndex =
+ getOperandIndex(LastInsertInst, OperandOffset);
+ if (!OperandIndex)
+ return false;
+ if (isa<InsertElementInst>(InsertedOperand) ||
+ isa<InsertValueInst>(InsertedOperand)) {
+ if (!findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
+ BuildVectorOpds, InsertElts, *OperandIndex))
+ return false;
+ } else {
+ BuildVectorOpds[*OperandIndex] = InsertedOperand;
+ InsertElts[*OperandIndex] = LastInsertInst;
+ }
+ if (isa<UndefValue>(LastInsertInst->getOperand(0)))
+ return true;
+ LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
+ } while (LastInsertInst != nullptr &&
+ (isa<InsertValueInst>(LastInsertInst) ||
+ isa<InsertElementInst>(LastInsertInst)) &&
+ LastInsertInst->hasOneUse());
+ return false;
+}
/// Recognize construction of vectors like
-/// %ra = insertelement <4 x float> undef, float %s0, i32 0
+/// %ra = insertelement <4 x float> poison, float %s0, i32 0
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
/// starting from the last insertelement or insertvalue instruction.
///
-/// Also recognize aggregates like {<2 x float>, <2 x float>},
+/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
///
/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
///
/// \return true if it matches.
-static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
+static bool findBuildAggregate(Instruction *LastInsertInst,
+ TargetTransformInfo *TTI,
SmallVectorImpl<Value *> &BuildVectorOpds,
SmallVectorImpl<Value *> &InsertElts) {
+
assert((isa<InsertElementInst>(LastInsertInst) ||
isa<InsertValueInst>(LastInsertInst)) &&
"Expected insertelement or insertvalue instruction!");
- do {
- Value *InsertedOperand;
- auto *IE = dyn_cast<InsertElementInst>(LastInsertInst);
- if (IE) {
- InsertedOperand = IE->getOperand(1);
- LastInsertInst = IE->getOperand(0);
- } else {
- auto *IV = cast<InsertValueInst>(LastInsertInst);
- InsertedOperand = IV->getInsertedValueOperand();
- LastInsertInst = IV->getAggregateOperand();
- }
- if (isa<InsertElementInst>(InsertedOperand) ||
- isa<InsertValueInst>(InsertedOperand)) {
- SmallVector<Value *, 8> TmpBuildVectorOpds;
- SmallVector<Value *, 8> TmpInsertElts;
- if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
- TmpInsertElts))
- return false;
- BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
- TmpBuildVectorOpds.rend());
- InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend());
- } else {
- BuildVectorOpds.push_back(InsertedOperand);
- InsertElts.push_back(IE);
- }
- if (isa<UndefValue>(LastInsertInst))
- break;
- if ((!isa<InsertValueInst>(LastInsertInst) &&
- !isa<InsertElementInst>(LastInsertInst)) ||
- !LastInsertInst->hasOneUse())
- return false;
- } while (true);
- std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
- std::reverse(InsertElts.begin(), InsertElts.end());
- return true;
+
+ assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
+ "Expected empty result vectors!");
+
+ Optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
+ if (!AggregateSize)
+ return false;
+ BuildVectorOpds.resize(*AggregateSize);
+ InsertElts.resize(*AggregateSize);
+
+ if (findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts,
+ 0)) {
+ llvm::erase_value(BuildVectorOpds, nullptr);
+ llvm::erase_value(InsertElts, nullptr);
+ if (BuildVectorOpds.size() >= 2)
+ return true;
+ }
+
+ return false;
}
static bool PhiTypeSorterFunc(Value *V, Value *V2) {
@@ -7195,6 +7267,16 @@ static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
return nullptr;
}
+static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
+ if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
+ return true;
+ if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
+ return true;
+ if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
+ return true;
+ return false;
+}
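
matchRdxBop widens root matching beyond plain binary operators so that floating-point min/max reductions written with the llvm.maxnum/llvm.minnum intrinsics are also considered. The two accepted shapes, shown on hypothetical IR:

    // Shape 1: any binary operator, e.g.
    //   %sum = fadd fast float %a, %b                      -> V0 = %a, V1 = %b
    // Shape 2: a maxnum/minnum intrinsic call, e.g.
    //   %m = call float @llvm.maxnum.f32(float %a, float %b)
    //                                                      -> V0 = %a, V1 = %b
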
+
/// Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding the phi node \a P
/// with reduction operators \a Root (or one of its operands) in a basic block
@@ -7234,9 +7316,10 @@ static bool tryToVectorizeHorReductionOrInstOperands(
Instruction *Inst;
unsigned Level;
std::tie(Inst, Level) = Stack.pop_back_val();
- auto *BI = dyn_cast<BinaryOperator>(Inst);
- auto *SI = dyn_cast<SelectInst>(Inst);
- if (BI || SI) {
+ Value *B0, *B1;
+ bool IsBinop = matchRdxBop(Inst, B0, B1);
+ bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
+ if (IsBinop || IsSelect) {
HorizontalReduction HorRdx;
if (HorRdx.matchAssociativeReduction(P, Inst)) {
if (HorRdx.tryToReduce(R, TTI)) {
@@ -7247,10 +7330,10 @@ static bool tryToVectorizeHorReductionOrInstOperands(
continue;
}
}
- if (P && BI) {
- Inst = dyn_cast<Instruction>(BI->getOperand(0));
+ if (P && IsBinop) {
+ Inst = dyn_cast<Instruction>(B0);
if (Inst == P)
- Inst = dyn_cast<Instruction>(BI->getOperand(1));
+ Inst = dyn_cast<Instruction>(B1);
if (!Inst) {
// Set P to nullptr to avoid re-analysis of phi node in
// matchAssociativeReduction function unless this is the root node.
@@ -7283,9 +7366,7 @@ static bool tryToVectorizeHorReductionOrInstOperands(
bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
BasicBlock *BB, BoUpSLP &R,
TargetTransformInfo *TTI) {
- if (!V)
- return false;
- auto *I = dyn_cast<Instruction>(V);
+ auto *I = dyn_cast_or_null<Instruction>(V);
if (!I)
return false;
@@ -7307,8 +7388,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
SmallVector<Value *, 16> BuildVectorOpds;
SmallVector<Value *, 16> BuildVectorInsts;
- if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) ||
- BuildVectorOpds.size() < 2)
+ if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
return false;
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
@@ -7323,7 +7403,6 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
SmallVector<Value *, 16> BuildVectorInsts;
SmallVector<Value *, 16> BuildVectorOpds;
if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
- BuildVectorOpds.size() < 2 ||
(llvm::all_of(BuildVectorOpds,
[](Value *V) { return isa<ExtractElementInst>(V); }) &&
isShuffle(BuildVectorOpds)))
@@ -7369,7 +7448,6 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
SmallPtrSet<Value *, 16> VisitedInstrs;
- unsigned MaxVecRegSize = R.getMaxVecRegSize();
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
@@ -7396,27 +7474,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Look for the next elements with the same type.
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
- Type *EltTy = (*IncIt)->getType();
-
- assert(EltTy->isSized() &&
- "Instructions should all be sized at this point");
- TypeSize EltTS = DL->getTypeSizeInBits(EltTy);
- if (EltTS.isScalable()) {
- // For now, just ignore vectorizing scalable types.
- ++IncIt;
- continue;
- }
-
- unsigned EltSize = EltTS.getFixedSize();
- unsigned MaxNumElts = MaxVecRegSize / EltSize;
- if (MaxNumElts < 2) {
- ++IncIt;
- continue;
- }
-
while (SameTypeIt != E &&
- (*SameTypeIt)->getType() == EltTy &&
- (SameTypeIt - IncIt) < MaxNumElts) {
+ (*SameTypeIt)->getType() == (*IncIt)->getType()) {
VisitedInstrs.insert(*SameTypeIt);
++SameTypeIt;
}
@@ -7448,12 +7507,17 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
SmallVector<Instruction *, 8> PostProcessInstructions;
SmallDenseSet<Instruction *, 4> KeyNodes;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    // Skip instructions with scalable vector type. The number of elements is
+    // unknown at compile time for scalable types.
+ if (isa<ScalableVectorType>(it->getType()))
+ continue;
+
// Skip instructions marked for the deletion.
if (R.isDeleted(&*it))
continue;
// We may go through BB multiple times so skip the one we have checked.
if (!VisitedInstrs.insert(&*it).second) {
- if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
+ if (it->use_empty() && KeyNodes.contains(&*it) &&
vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
// We would like to start over since some instructions are deleted
// and the iterator may become invalid value.
@@ -7470,16 +7534,29 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Try to vectorize reductions that use PHINodes.
if (PHINode *P = dyn_cast<PHINode>(it)) {
// Check that the PHI is a reduction PHI.
- if (P->getNumIncomingValues() != 2)
- return Changed;
+ if (P->getNumIncomingValues() == 2) {
+ // Try to match and vectorize a horizontal reduction.
+ if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
+ TTI)) {
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+ // Try to vectorize the incoming values of the PHI, to catch reductions
+ // that feed into PHIs.
+ for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
+ // Skip if the incoming block is the current BB for now. Also, bypass
+ // unreachable IR for efficiency and to avoid crashing.
+ // TODO: Collect the skipped incoming values and try to vectorize them
+ // after processing BB.
+ if (BB == P->getIncomingBlock(I) ||
+ !DT->isReachableFromEntry(P->getIncomingBlock(I)))
+ continue;
- // Try to match and vectorize a horizontal reduction.
- if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
- TTI)) {
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
+ Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
+ P->getIncomingBlock(I), R, TTI);
}
continue;
}
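
The PHI handling above no longer abandons the rest of the block when a PHI is not a two-input reduction PHI; each incoming value is now tried as a reduction root in its own predecessor block. A simplified, hypothetical shape that benefits:

    // merge:
    //   %phi = phi float [ %r0, %bb0 ], [ %r1, %bb1 ], [ %r2, %bb2 ]
    // Previously the early "return Changed" on a non-two-input PHI skipped
    // everything after it in the block; now each %rN is handed to
    // vectorizeRootInstruction with its incoming block %bbN, so a horizontal
    // reduction computed in a predecessor and only merged by the PHI can
    // still be vectorized.
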
@@ -7543,7 +7620,7 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
unsigned MaxElts = MaxVecRegSize / EltSize;
for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
auto Len = std::min<unsigned>(BE - BI, MaxElts);
- auto GEPList = makeArrayRef(&Entry.second[BI], Len);
+ ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
    // Initialize a set of candidate getelementptrs. Note that we use a
// SetVector here to preserve program order. If the index computations