Diffstat (limited to 'llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp')
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2559 +-
1 file changed, 1846 insertions(+), 713 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0b630197911a..cc3f5c7d4b48 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -217,15 +218,18 @@ static bool allSameBlock(ArrayRef<Value *> VL) { return true; } +/// \returns True if the value is a constant (but not globals/constant +/// expressions). +static bool isConstant(Value *V) { + return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V); +} + /// \returns True if all of the values in \p VL are constants (but not /// globals/constant expressions). static bool allConstant(ArrayRef<Value *> VL) { // Constant expressions and globals can't be vectorized like normal integer/FP // constants. - for (Value *i : VL) - if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i)) - return false; - return true; + return all_of(VL, isConstant); } /// \returns True if all of the values in \p VL are identical. @@ -287,10 +291,11 @@ static bool isCommutative(Instruction *I) { /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3 /// ret <4 x i8> %ins4 /// InstCombiner transforms this into a shuffle and vector mul +/// Mask will return the Shuffle Mask equivalent to the extracted elements. /// TODO: Can we split off and reuse the shuffle mask detection from /// TargetTransformInfo::getInstructionThroughput? static Optional<TargetTransformInfo::ShuffleKind> -isShuffle(ArrayRef<Value *> VL) { +isShuffle(ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) { auto *EI0 = cast<ExtractElementInst>(VL[0]); unsigned Size = cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements(); @@ -308,9 +313,12 @@ isShuffle(ArrayRef<Value *> VL) { if (!Idx) return None; // Undefined behavior if Idx is negative or >= Size. - if (Idx->getValue().uge(Size)) + if (Idx->getValue().uge(Size)) { + Mask.push_back(UndefMaskElem); continue; + } unsigned IntIdx = Idx->getValue().getZExtValue(); + Mask.push_back(IntIdx); // We can extractelement from undef or poison vector. if (isa<UndefValue>(Vec)) continue; @@ -538,6 +546,41 @@ static void inversePermutation(ArrayRef<unsigned> Indices, Mask[Indices[I]] = I; } +/// \returns inserting index of InsertElement or InsertValue instruction, +/// using Offset as base offset for index. 
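// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// The isShuffle change above adds a Mask out-parameter, so callers also get the
// shuffle mask implied by a run of extractelement instructions. A minimal plain
// C++ mirror of that mask construction; kUndefMaskElem and buildExtractMask are
// hypothetical stand-ins, not the LLVM API:
#include <cstdio>
#include <optional>
#include <vector>

constexpr int kUndefMaskElem = -1; // stand-in for LLVM's UndefMaskElem

// One entry per extractelement: the constant lane, or nullopt when the index
// operand is undef or not a valid ConstantInt.
std::vector<int> buildExtractMask(const std::vector<std::optional<unsigned>> &Lanes,
                                  unsigned SrcNumElts) {
  std::vector<int> Mask;
  for (const auto &L : Lanes) {
    if (!L || *L >= SrcNumElts)
      Mask.push_back(kUndefMaskElem); // out-of-range/undef index -> undef lane
    else
      Mask.push_back(static_cast<int>(*L));
  }
  return Mask;
}

int main() {
  // Extracts of lanes 0, 1, undef, 3 from a <4 x i8> source.
  for (int M : buildExtractMask({0u, 1u, std::nullopt, 3u}, 4))
    std::printf("%d ", M); // prints: 0 1 -1 3
}
// ---- [end note] ----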
+static Optional<int> getInsertIndex(Value *InsertInst, unsigned Offset) { + int Index = Offset; + if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) { + if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) { + auto *VT = cast<FixedVectorType>(IE->getType()); + if (CI->getValue().uge(VT->getNumElements())) + return UndefMaskElem; + Index *= VT->getNumElements(); + Index += CI->getZExtValue(); + return Index; + } + if (isa<UndefValue>(IE->getOperand(2))) + return UndefMaskElem; + return None; + } + + auto *IV = cast<InsertValueInst>(InsertInst); + Type *CurrentType = IV->getType(); + for (unsigned I : IV->indices()) { + if (auto *ST = dyn_cast<StructType>(CurrentType)) { + Index *= ST->getNumElements(); + CurrentType = ST->getElementType(I); + } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) { + Index *= AT->getNumElements(); + CurrentType = AT->getElementType(); + } else { + return None; + } + Index += I; + } + return Index; +} + namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -570,7 +613,9 @@ public: if (MaxVectorRegSizeOption.getNumOccurrences()) MaxVecRegSize = MaxVectorRegSizeOption; else - MaxVecRegSize = TTI->getRegisterBitWidth(true); + MaxVecRegSize = + TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) + .getFixedSize(); if (MinVectorRegSizeOption.getNumOccurrences()) MinVecRegSize = MinVectorRegSizeOption; @@ -593,7 +638,7 @@ public: /// \returns the vectorization cost of the subtree that starts at \p VL. /// A negative number means that this is profitable. - InstructionCost getTreeCost(); + InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = None); /// Construct a vectorizable tree that starts at \p Roots, ignoring users for /// the purpose of scheduling and extraction in the \p UserIgnoreLst. @@ -621,6 +666,7 @@ public: BS->clear(); } MinBWs.clear(); + InstrElementSize.clear(); } unsigned getTreeSize() const { return VectorizableTree.size(); } @@ -937,10 +983,16 @@ public: ScalarEvolution &SE) { auto *LI1 = dyn_cast<LoadInst>(V1); auto *LI2 = dyn_cast<LoadInst>(V2); - if (LI1 && LI2) - return isConsecutiveAccess(LI1, LI2, DL, SE) - ? VLOperands::ScoreConsecutiveLoads - : VLOperands::ScoreFail; + if (LI1 && LI2) { + if (LI1->getParent() != LI2->getParent()) + return VLOperands::ScoreFail; + + Optional<int> Dist = getPointersDiff( + LI1->getType(), LI1->getPointerOperand(), LI2->getType(), + LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true); + return (Dist && *Dist == 1) ? VLOperands::ScoreConsecutiveLoads + : VLOperands::ScoreFail; + } auto *C1 = dyn_cast<Constant>(V1); auto *C2 = dyn_cast<Constant>(V2); @@ -1500,10 +1552,12 @@ public: private: /// Checks if all users of \p I are the part of the vectorization tree. - bool areAllUsersVectorized(Instruction *I) const; + bool areAllUsersVectorized(Instruction *I, + ArrayRef<Value *> VectorizedVals) const; /// \returns the cost of the vectorizable entry. - InstructionCost getEntryCost(TreeEntry *E); + InstructionCost getEntryCost(const TreeEntry *E, + ArrayRef<Value *> VectorizedVals); /// This is the recursive part of buildTree. void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, @@ -1529,6 +1583,14 @@ private: getGatherCost(FixedVectorType *Ty, const DenseSet<unsigned> &ShuffledIndices) const; + /// Checks if the gathered \p VL can be represented as shuffle(s) of previous + /// tree entries. + /// \returns ShuffleKind, if gathered values can be represented as shuffles of + /// previous tree entries. \p Mask is filled with the shuffle mask. 
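// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// The insertvalue half of getInsertIndex linearizes nested aggregate indices:
// at each level the running index is multiplied by that level's element count
// and the level's index is added, i.e. a mixed-radix positional number. A
// simplified mirror over plain element counts (no LLVM types):
#include <cassert>
#include <cstdio>
#include <vector>

// LevelSizes: elements per nesting level, outermost first,
// e.g. [4 x {float, float}] -> {4, 2}.
int linearInsertIndex(const std::vector<unsigned> &LevelSizes,
                      const std::vector<unsigned> &Indices, int Offset = 0) {
  assert(LevelSizes.size() == Indices.size());
  int Index = Offset;
  for (size_t I = 0; I < Indices.size(); ++I) {
    Index *= static_cast<int>(LevelSizes[I]); // descend one aggregate level
    Index += static_cast<int>(Indices[I]);
  }
  return Index;
}

int main() {
  // insertvalue into element [3][1] of [4 x {float, float}]:
  std::printf("%d\n", linearInsertIndex({4, 2}, {3, 1})); // prints 7
}
// ---- [end note] ----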
+ Optional<TargetTransformInfo::ShuffleKind> + isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, + SmallVectorImpl<const TreeEntry *> &Entries); + /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. @@ -1536,7 +1598,7 @@ private: /// Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(TreeEntry *E); + void setInsertPointAfterBundle(const TreeEntry *E); /// \returns a vector from a collection of scalars in \p VL. Value *gather(ArrayRef<Value *> VL); @@ -1615,7 +1677,7 @@ private: void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) { if (Operands.size() < OpIdx + 1) Operands.resize(OpIdx + 1); - assert(Operands[OpIdx].size() == 0 && "Already resized?"); + assert(Operands[OpIdx].empty() && "Already resized?"); Operands[OpIdx].resize(Scalars.size()); for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane) Operands[OpIdx][Lane] = OpVL[Lane]; @@ -1706,6 +1768,17 @@ private: setOperations(S); return true; } + /// When ReuseShuffleIndices is empty it just returns position of \p V + /// within vector of Scalars. Otherwise, try to remap on its reuse index. + int findLaneForValue(Value *V) const { + unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V)); + assert(FoundLane < Scalars.size() && "Couldn't find extract lane"); + if (!ReuseShuffleIndices.empty()) { + FoundLane = std::distance(ReuseShuffleIndices.begin(), + find(ReuseShuffleIndices, FoundLane)); + } + return FoundLane; + } #ifndef NDEBUG /// Debug printer. @@ -1766,7 +1839,7 @@ private: }; #ifndef NDEBUG - void dumpTreeCosts(TreeEntry *E, InstructionCost ReuseShuffleCost, + void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost, InstructionCost VecCost, InstructionCost ScalarCost) const { dbgs() << "SLP: Calculated costs for Tree:\n"; E->dump(); @@ -1896,7 +1969,7 @@ private: bool aliased = true; if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) { // Do the alias check. - aliased = AA->alias(Loc1, Loc2); + aliased = !AA->isNoAlias(Loc1, Loc2); } // Store the result in the cache. result = aliased; @@ -2547,12 +2620,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - int FoundLane = Lane; - if (!Entry->ReuseShuffleIndices.empty()) { - FoundLane = - std::distance(Entry->ReuseShuffleIndices.begin(), - llvm::find(Entry->ReuseShuffleIndices, FoundLane)); - } + int FoundLane = Entry->findLaneForValue(Scalar); // Check if the scalar is externally used as an extra arg. auto ExtI = ExternallyUsedValues.find(Scalar); @@ -2575,6 +2643,7 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, // instructions. If that is the case, the one in Lane 0 will // be used. 
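// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// The new TreeEntry::findLaneForValue helper, mirrored in isolation: find the
// scalar's position among the unique Scalars, then, when reuse indices exist,
// remap that position to the first output lane that reuses it. Types here are
// simplified stand-ins:
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

int findLaneForValue(const std::vector<std::string> &Scalars,
                     const std::vector<int> &ReuseShuffleIndices,
                     const std::string &V) {
  int FoundLane = static_cast<int>(
      std::find(Scalars.begin(), Scalars.end(), V) - Scalars.begin());
  assert(FoundLane < (int)Scalars.size() && "Couldn't find extract lane");
  if (!ReuseShuffleIndices.empty())
    FoundLane = static_cast<int>(
        std::find(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end(),
                  FoundLane) -
        ReuseShuffleIndices.begin());
  return FoundLane;
}

int main() {
  // Unique scalars {a, b} reused as <a, b, a, b>: "b" lives in lane 1.
  std::printf("%d\n", findLaneForValue({"a", "b"}, {0, 1, 0, 1}, "b"));
}
// ---- [end note] ----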
if (UseScalar != U || + UseEntry->State == TreeEntry::ScatterVectorize || !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); @@ -2606,8 +2675,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, return; } + // Don't handle scalable vectors + if (S.getOpcode() == Instruction::ExtractElement && + isa<ScalableVectorType>( + cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); + return; + } + // Don't handle vectors. - if (S.OpValue->getType()->isVectorTy()) { + if (S.OpValue->getType()->isVectorTy() && + !isa<InsertElementInst>(S.OpValue)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; @@ -2764,6 +2843,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, // Keeps the reordered operands to avoid code duplication. SmallVector<ValueList, 2> OperandsVec; for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { + if (!DT->isReachableFromEntry(PH->getIncomingBlock(I))) { + ValueList Operands(VL.size(), PoisonValue::get(PH->getType())); + TE->setOperand(I, Operands); + OperandsVec.push_back(Operands); + continue; + } ValueList Operands; // Prepare the operand vector. for (Value *V : VL) @@ -2819,6 +2904,41 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, BS.cancelScheduling(VL, VL0); return; } + case Instruction::InsertElement: { + assert(ReuseShuffleIndicies.empty() && "All inserts should be unique"); + + // Check that we have a buildvector and not a shuffle of 2 or more + // different vectors. + ValueSet SourceVectors; + for (Value *V : VL) + SourceVectors.insert(cast<Instruction>(V)->getOperand(0)); + + if (count_if(VL, [&SourceVectors](Value *V) { + return !SourceVectors.contains(V); + }) >= 2) { + // Found 2nd source vector - cancel. + LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with " + "different source vectors.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + BS.cancelScheduling(VL, VL0); + return; + } + + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx); + LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n"); + + constexpr int NumOps = 2; + ValueList VectorOperands[NumOps]; + for (int I = 0; I < NumOps; ++I) { + for (Value *V : VL) + VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I)); + + TE->setOperand(I, VectorOperands[I]); + } + buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, 0}); + return; + } case Instruction::Load: { // Check that a vectorized load would load the same memory as a scalar // load. For example, we don't want to vectorize loads that are smaller @@ -2856,7 +2976,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, OrdersType CurrentOrder; // Check the order of pointer operands. 
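// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// The new InsertElement bundle check above collects every insert's source
// operand and rejects the bundle when two or more members are not themselves
// the source of another member - that means the inserts form >= 2 independent
// chains, i.e. at least two distinct source vectors. A plain C++ mirror of
// that counting (Insert/isSingleBuildVector are hypothetical stand-ins):
#include <algorithm>
#include <cstdio>
#include <set>
#include <vector>

struct Insert { int Id; int SourceId; }; // SourceId: operand 0 of the insert

bool isSingleBuildVector(const std::vector<Insert> &Bundle) {
  std::set<int> Sources;
  for (const Insert &I : Bundle)
    Sources.insert(I.SourceId);
  // Inserts that are nobody's source operand terminate a chain.
  long ChainHeads = std::count_if(
      Bundle.begin(), Bundle.end(),
      [&](const Insert &I) { return !Sources.count(I.Id); });
  return ChainHeads < 2; // >= 2 heads means >= 2 distinct source vectors
}

int main() {
  // One chain: %1 = insert %0; %2 = insert %1; %3 = insert %2.
  std::printf("%d\n", isSingleBuildVector({{1, 0}, {2, 1}, {3, 2}})); // 1
  // Two independent chains (rooted at %0 and %9) -> rejected.
  std::printf("%d\n", isSingleBuildVector({{1, 0}, {2, 1}, {8, 9}, {7, 8}})); // 0
}
// ---- [end note] ----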
- if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { + if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { Value *Ptr0; Value *PtrN; if (CurrentOrder.empty()) { @@ -2866,13 +2986,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Ptr0 = PointerOps[CurrentOrder.front()]; PtrN = PointerOps[CurrentOrder.back()]; } - const SCEV *Scev0 = SE->getSCEV(Ptr0); - const SCEV *ScevN = SE->getSCEV(PtrN); - const auto *Diff = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); - uint64_t Size = DL->getTypeAllocSize(ScalarTy); + Optional<int> Diff = getPointersDiff( + ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { + if (static_cast<unsigned>(*Diff) == VL.size() - 1) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; @@ -2892,13 +3009,21 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, } return; } - // Vectorizing non-consecutive loads with `llvm.masked.gather`. - TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, - UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - buildTree_rec(PointerOps, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); - return; + Align CommonAlignment = cast<LoadInst>(VL0)->getAlign(); + for (Value *V : VL) + CommonAlignment = + commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); + if (TTI->isLegalMaskedGather(FixedVectorType::get(ScalarTy, VL.size()), + CommonAlignment)) { + // Vectorizing non-consecutive loads with `llvm.masked.gather`. + TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, + S, UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(); + buildTree_rec(PointerOps, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() + << "SLP: added a vector of non-consecutive loads.\n"); + return; + } } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); @@ -3135,7 +3260,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, OrdersType CurrentOrder; // Check the order of pointer operands. - if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { + if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) { Value *Ptr0; Value *PtrN; if (CurrentOrder.empty()) { @@ -3145,13 +3270,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth, Ptr0 = PointerOps[CurrentOrder.front()]; PtrN = PointerOps[CurrentOrder.back()]; } - const SCEV *Scev0 = SE->getSCEV(Ptr0); - const SCEV *ScevN = SE->getSCEV(PtrN); - const auto *Diff = - dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0)); - uint64_t Size = DL->getTypeAllocSize(ScalarTy); + Optional<int> Dist = + getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE); // Check that the sorted pointer operands are consecutive. - if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { + if (static_cast<unsigned>(*Dist) == VL.size() - 1) { if (CurrentOrder.empty()) { // Original stores are consecutive and does not require reordering. 
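// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// The old code compared a SCEV byte distance against (VL.size()-1) * alloc
// size; getPointersDiff returns the distance already in elements, so the
// consecutive check becomes simply *Diff == VL.size() - 1. A mirror with raw
// addresses standing in for the pointer/SCEV analysis:
#include <cstdint>
#include <cstdio>
#include <optional>

std::optional<int> pointersDiffInElems(uint64_t Ptr0, uint64_t PtrN,
                                       uint64_t EltSize) {
  uint64_t ByteDiff = PtrN - Ptr0;
  if (ByteDiff % EltSize != 0) // not a whole number of elements apart
    return std::nullopt;
  return static_cast<int>(ByteDiff / EltSize);
}

int main() {
  const unsigned NumLoads = 4;
  const uint64_t EltSize = 4; // e.g. i32
  uint64_t Base = 0x1000;
  auto Diff = pointersDiffInElems(Base, Base + 12, EltSize);
  // Consecutive: first and last sorted pointers are N-1 elements apart.
  std::printf("%d\n", Diff && *Diff == (int)(NumLoads - 1)); // prints 1
}
// ---- [end note] ----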
++NumOpsWantToKeepOriginalOrder; @@ -3405,8 +3527,10 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue, return ShouldKeepOrder; } -bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { - return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) { +bool BoUpSLP::areAllUsersVectorized(Instruction *I, + ArrayRef<Value *> VectorizedVals) const { + return (I->hasOneUse() && is_contained(VectorizedVals, I)) || + llvm::all_of(I->users(), [this](User *U) { return ScalarToTreeEntry.count(U) > 0; }); } @@ -3417,7 +3541,16 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. - IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount()); + SmallVector<Type *, 4> VecTys; + for (Use &Arg : CI->args()) + VecTys.push_back( + FixedVectorType::get(Arg->getType(), VecTy->getNumElements())); + FastMathFlags FMF; + if (auto *FPCI = dyn_cast<FPMathOperator>(CI)) + FMF = FPCI->getFastMathFlags(); + SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end()); + IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys, FMF, + dyn_cast<IntrinsicInst>(CI)); auto IntrinsicCost = TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput); @@ -3428,11 +3561,6 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, auto LibCost = IntrinsicCost; if (!CI->isNoBuiltin() && VecFunc) { // Calculate the cost of the vector library call. - SmallVector<Type *, 4> VecTys; - for (Use &Arg : CI->args()) - VecTys.push_back( - FixedVectorType::get(Arg->getType(), VecTy->getNumElements())); - // If the corresponding vector call is cheaper, return its cost. LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys, TTI::TCK_RecipThroughput); @@ -3440,7 +3568,82 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, return {IntrinsicCost, LibCost}; } -InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { +/// Compute the cost of creating a vector of type \p VecTy containing the +/// extracted values from \p VL. +static InstructionCost +computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, + TargetTransformInfo::ShuffleKind ShuffleKind, + ArrayRef<int> Mask, TargetTransformInfo &TTI) { + unsigned NumOfParts = TTI.getNumberOfParts(VecTy); + + if (ShuffleKind != TargetTransformInfo::SK_PermuteSingleSrc || !NumOfParts || + VecTy->getNumElements() < NumOfParts) + return TTI.getShuffleCost(ShuffleKind, VecTy, Mask); + + bool AllConsecutive = true; + unsigned EltsPerVector = VecTy->getNumElements() / NumOfParts; + unsigned Idx = -1; + InstructionCost Cost = 0; + + // Process extracts in blocks of EltsPerVector to check if the source vector + // operand can be re-used directly. If not, add the cost of creating a shuffle + // to extract the values into a vector register. + for (auto *V : VL) { + ++Idx; + + // Reached the start of a new vector registers. + if (Idx % EltsPerVector == 0) { + AllConsecutive = true; + continue; + } + + // Check all extracts for a vector register on the target directly + // extract values in order. + unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); + unsigned PrevIdx = *getExtractIndex(cast<Instruction>(VL[Idx - 1])); + AllConsecutive &= PrevIdx + 1 == CurrentIdx && + CurrentIdx % EltsPerVector == Idx % EltsPerVector; + + if (AllConsecutive) + continue; + + // Skip all indices, except for the last index per vector block. 
+ if ((Idx + 1) % EltsPerVector != 0 && Idx + 1 != VL.size()) + continue; + + // If we have a series of extracts which are not consecutive and hence + // cannot re-use the source vector register directly, compute the shuffle + // cost to extract the a vector with EltsPerVector elements. + Cost += TTI.getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, + FixedVectorType::get(VecTy->getElementType(), EltsPerVector)); + } + return Cost; +} + +/// Shuffles \p Mask in accordance with the given \p SubMask. +static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask) { + if (SubMask.empty()) + return; + if (Mask.empty()) { + Mask.append(SubMask.begin(), SubMask.end()); + return; + } + SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size()); + int TermValue = std::min(Mask.size(), SubMask.size()); + for (int I = 0, E = SubMask.size(); I < E; ++I) { + if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || + Mask[SubMask[I]] >= TermValue) { + NewMask[I] = UndefMaskElem; + continue; + } + NewMask[I] = Mask[SubMask[I]]; + } + Mask.swap(NewMask); +} + +InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, + ArrayRef<Value *> VectorizedVals) { ArrayRef<Value*> VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); @@ -3448,6 +3651,8 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { ScalarTy = SI->getValueOperand()->getType(); else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0])) ScalarTy = CI->getOperand(0)->getType(); + else if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) + ScalarTy = IE->getOperand(1)->getType(); auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -3456,45 +3661,168 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { if (MinBWs.count(VL[0])) VecTy = FixedVectorType::get( IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + auto *FinalVecTy = VecTy; unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); - InstructionCost ReuseShuffleCost = 0; - if (NeedToShuffleReuses) { - ReuseShuffleCost = - TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); - } + if (NeedToShuffleReuses) + FinalVecTy = + FixedVectorType::get(VecTy->getElementType(), ReuseShuffleNumbers); + // FIXME: it tries to fix a problem with MSVC buildbots. + TargetTransformInfo &TTIRef = *TTI; + auto &&AdjustExtractsCost = [this, &TTIRef, CostKind, VL, VecTy, + VectorizedVals](InstructionCost &Cost, + bool IsGather) { + DenseMap<Value *, int> ExtractVectorsTys; + for (auto *V : VL) { + // If all users of instruction are going to be vectorized and this + // instruction itself is not going to be vectorized, consider this + // instruction as dead and remove its cost from the final cost of the + // vectorized tree. + if (!areAllUsersVectorized(cast<Instruction>(V), VectorizedVals) || + (IsGather && ScalarToTreeEntry.count(V))) + continue; + auto *EE = cast<ExtractElementInst>(V); + unsigned Idx = *getExtractIndex(EE); + if (TTIRef.getNumberOfParts(VecTy) != + TTIRef.getNumberOfParts(EE->getVectorOperandType())) { + auto It = + ExtractVectorsTys.try_emplace(EE->getVectorOperand(), Idx).first; + It->getSecond() = std::min<int>(It->second, Idx); + } + // Take credit for instruction that will become dead. 
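// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// A standalone mirror of the new addMask helper defined just below this point:
// composing an existing mask with a sub-mask so lane I of the result reads
// Mask[SubMask[I]], with undef or out-of-range entries collapsing to undef.
// This is the composition used for ReorderIndices followed by
// ReuseShuffleIndices:
#include <algorithm>
#include <cstdio>
#include <vector>

constexpr int kUndefMaskElem = -1; // stand-in for LLVM's UndefMaskElem

void addMask(std::vector<int> &Mask, const std::vector<int> &SubMask) {
  if (SubMask.empty())
    return;
  if (Mask.empty()) {
    Mask = SubMask;
    return;
  }
  std::vector<int> NewMask(SubMask.size(), kUndefMaskElem);
  int TermValue = static_cast<int>(std::min(Mask.size(), SubMask.size()));
  for (int I = 0, E = (int)SubMask.size(); I < E; ++I) {
    if (SubMask[I] == kUndefMaskElem || SubMask[I] >= TermValue ||
        Mask[SubMask[I]] >= TermValue)
      continue; // lane stays undef
    NewMask[I] = Mask[SubMask[I]];
  }
  Mask.swap(NewMask);
}

int main() {
  std::vector<int> Mask = {2, 0, 3, 1};     // first permutation
  addMask(Mask, {1, 1, kUndefMaskElem, 2}); // then the reuse mask
  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 0 -1 3
}
// ---- [end note] ----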
+ if (EE->hasOneUse()) { + Instruction *Ext = EE->user_back(); + if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) && + all_of(Ext->users(), + [](User *U) { return isa<GetElementPtrInst>(U); })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + Cost -= + TTIRef.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(), + EE->getVectorOperandType(), Idx); + // Add back the cost of s|zext which is subtracted separately. + Cost += TTIRef.getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EE->getType(), + TTI::getCastContextHint(Ext), CostKind, Ext); + continue; + } + } + Cost -= TTIRef.getVectorInstrCost(Instruction::ExtractElement, + EE->getVectorOperandType(), Idx); + } + // Add a cost for subvector extracts/inserts if required. + for (const auto &Data : ExtractVectorsTys) { + auto *EEVTy = cast<FixedVectorType>(Data.first->getType()); + unsigned NumElts = VecTy->getNumElements(); + if (TTIRef.getNumberOfParts(EEVTy) > TTIRef.getNumberOfParts(VecTy)) { + unsigned Idx = (Data.second / NumElts) * NumElts; + unsigned EENumElts = EEVTy->getNumElements(); + if (Idx + NumElts <= EENumElts) { + Cost += + TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, None, Idx, VecTy); + } else { + // Need to round up the subvector type vectorization factor to avoid a + // crash in cost model functions. Make SubVT so that Idx + VF of SubVT + // <= EENumElts. + auto *SubVT = + FixedVectorType::get(VecTy->getElementType(), EENumElts - Idx); + Cost += + TTIRef.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector, + EEVTy, None, Idx, SubVT); + } + } else { + Cost += TTIRef.getShuffleCost(TargetTransformInfo::SK_InsertSubvector, + VecTy, None, 0, EEVTy); + } + } + }; if (E->State == TreeEntry::NeedToGather) { if (allConstant(VL)) return 0; - if (isSplat(VL)) { - return ReuseShuffleCost + - TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + if (isa<InsertElementInst>(VL[0])) + return InstructionCost::getInvalid(); + SmallVector<int> Mask; + SmallVector<const TreeEntry *> Entries; + Optional<TargetTransformInfo::ShuffleKind> Shuffle = + isGatherShuffledEntry(E, Mask, Entries); + if (Shuffle.hasValue()) { + InstructionCost GatherCost = 0; + if (ShuffleVectorInst::isIdentityMask(Mask)) { + // Perfect match in the graph, will reuse the previously vectorized + // node. Cost is 0. + LLVM_DEBUG( + dbgs() + << "SLP: perfect diamond match for gather bundle that starts with " + << *VL.front() << ".\n"); + if (NeedToShuffleReuses) + GatherCost = + TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + FinalVecTy, E->ReuseShuffleIndices); + } else { + LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() + << " entries for bundle that starts with " + << *VL.front() << ".\n"); + // Detected that instead of gather we can emit a shuffle of single/two + // previously vectorized nodes. Add the cost of the permutation rather + // than gather. + ::addMask(Mask, E->ReuseShuffleIndices); + GatherCost = TTI->getShuffleCost(*Shuffle, FinalVecTy, Mask); + } + return GatherCost; } - if (E->getOpcode() == Instruction::ExtractElement && - allSameType(VL) && allSameBlock(VL)) { - Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL); + if (isSplat(VL)) { + // Found the broadcasting of the single scalar, calculate the cost as the + // broadcast. 
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy); + } + if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && + allSameBlock(VL) && + !isa<ScalableVectorType>( + cast<ExtractElementInst>(E->getMainOp())->getVectorOperandType())) { + // Check that gather of extractelements can be represented as just a + // shuffle of a single/two vectors the scalars are extracted from. + SmallVector<int> Mask; + Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = + isShuffle(VL, Mask); if (ShuffleKind.hasValue()) { + // Found the bunch of extractelement instructions that must be gathered + // into a vector and can be represented as a permutation elements in a + // single input vector or of 2 input vectors. InstructionCost Cost = - TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); - for (auto *V : VL) { - // If all users of instruction are going to be vectorized and this - // instruction itself is not going to be vectorized, consider this - // instruction as dead and remove its cost from the final cost of the - // vectorized tree. - if (areAllUsersVectorized(cast<Instruction>(V)) && - !ScalarToTreeEntry.count(V)) { - auto *IO = cast<ConstantInt>( - cast<ExtractElementInst>(V)->getIndexOperand()); - Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, - IO->getZExtValue()); - } - } - return ReuseShuffleCost + Cost; - } - } + computeExtractCost(VL, VecTy, *ShuffleKind, Mask, *TTI); + AdjustExtractsCost(Cost, /*IsGather=*/true); + if (NeedToShuffleReuses) + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + FinalVecTy, E->ReuseShuffleIndices); + return Cost; + } + } + InstructionCost ReuseShuffleCost = 0; + if (NeedToShuffleReuses) + ReuseShuffleCost = TTI->getShuffleCost( + TTI::SK_PermuteSingleSrc, FinalVecTy, E->ReuseShuffleIndices); return ReuseShuffleCost + getGatherCost(VL); } + InstructionCost CommonCost = 0; + SmallVector<int> Mask; + if (!E->ReorderIndices.empty()) { + SmallVector<int> NewMask; + if (E->getOpcode() == Instruction::Store) { + // For stores the order is actually a mask. + NewMask.resize(E->ReorderIndices.size()); + copy(E->ReorderIndices, NewMask.begin()); + } else { + inversePermutation(E->ReorderIndices, NewMask); + } + ::addMask(Mask, NewMask); + } + if (NeedToShuffleReuses) + ::addMask(Mask, E->ReuseShuffleIndices); + if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask)) + CommonCost = + TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask); assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && "Unhandled state"); @@ -3508,46 +3836,39 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { case Instruction::ExtractValue: case Instruction::ExtractElement: { + // The common cost of removal ExtractElement/ExtractValue instructions + + // the cost of shuffles, if required to resuffle the original vector. 
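// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// The per-entry cost above now funnels both ReorderIndices and
// ReuseShuffleIndices into one composed mask and charges a single
// SK_PermuteSingleSrc only when the result is not the identity. A mirror of
// inversePermutation plus the identity test:
#include <cstdio>
#include <vector>

constexpr int kUndefMaskElem = -1;

std::vector<int> inversePermutation(const std::vector<unsigned> &Indices) {
  std::vector<int> Mask(Indices.size(), kUndefMaskElem);
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    Mask[Indices[I]] = static_cast<int>(I); // invert: source lane -> dest lane
  return Mask;
}

bool isIdentityMask(const std::vector<int> &Mask) {
  for (int I = 0, E = (int)Mask.size(); I < E; ++I)
    if (Mask[I] != kUndefMaskElem && Mask[I] != I)
      return false;
  return true;
}

int main() {
  std::vector<int> Mask = inversePermutation({2, 0, 1});
  // Only a non-identity composed mask incurs the shuffle cost.
  std::printf("identity=%d mask=%d,%d,%d\n", isIdentityMask(Mask), Mask[0],
              Mask[1], Mask[2]); // identity=0 mask=1,2,0
}
// ---- [end note] ----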
if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { if (ShuffleOrOp == Instruction::ExtractElement) { - auto *IO = cast<ConstantInt>( - cast<ExtractElementInst>(VL[I])->getIndexOperand()); - Idx = IO->getZExtValue(); - ReuseShuffleCost -= TTI->getVectorInstrCost( - Instruction::ExtractElement, VecTy, Idx); + auto *EE = cast<ExtractElementInst>(VL[I]); + CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, + EE->getVectorOperandType(), + *getExtractIndex(EE)); } else { - ReuseShuffleCost -= TTI->getVectorInstrCost( - Instruction::ExtractElement, VecTy, Idx); + CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, + VecTy, Idx); ++Idx; } } Idx = ReuseShuffleNumbers; for (Value *V : VL) { if (ShuffleOrOp == Instruction::ExtractElement) { - auto *IO = cast<ConstantInt>( - cast<ExtractElementInst>(V)->getIndexOperand()); - Idx = IO->getZExtValue(); + auto *EE = cast<ExtractElementInst>(V); + CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, + EE->getVectorOperandType(), + *getExtractIndex(EE)); } else { --Idx; + CommonCost += TTI->getVectorInstrCost(Instruction::ExtractElement, + VecTy, Idx); } - ReuseShuffleCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - InstructionCost DeadCost = ReuseShuffleCost; - if (!E->ReorderIndices.empty()) { - // TODO: Merge this shuffle with the ReuseShuffleCost. - DeadCost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, VecTy); - } - for (unsigned I = 0, E = VL.size(); I < E; ++I) { - Instruction *EI = cast<Instruction>(VL[I]); - // If all users are going to be vectorized, instruction can be - // considered as dead. - // The same, if have only one user, it will be vectorized for sure. - if (areAllUsersVectorized(EI)) { + if (ShuffleOrOp == Instruction::ExtractValue) { + for (unsigned I = 0, E = VL.size(); I < E; ++I) { + auto *EI = cast<Instruction>(VL[I]); // Take credit for instruction that will become dead. if (EI->hasOneUse()) { Instruction *Ext = EI->user_back(); @@ -3556,20 +3877,66 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { [](User *U) { return isa<GetElementPtrInst>(U); })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. - DeadCost -= TTI->getExtractWithExtendCost( + CommonCost -= TTI->getExtractWithExtendCost( Ext->getOpcode(), Ext->getType(), VecTy, I); // Add back the cost of s|zext which is subtracted separately. - DeadCost += TTI->getCastInstrCost( + CommonCost += TTI->getCastInstrCost( Ext->getOpcode(), Ext->getType(), EI->getType(), TTI::getCastContextHint(Ext), CostKind, Ext); continue; } } - DeadCost -= + CommonCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } + } else { + AdjustExtractsCost(CommonCost, /*IsGather=*/false); + } + return CommonCost; + } + case Instruction::InsertElement: { + auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); + + unsigned const NumElts = SrcVecTy->getNumElements(); + unsigned const NumScalars = VL.size(); + APInt DemandedElts = APInt::getNullValue(NumElts); + // TODO: Add support for Instruction::InsertValue. 
+ unsigned Offset = UINT_MAX; + bool IsIdentity = true; + SmallVector<int> ShuffleMask(NumElts, UndefMaskElem); + for (unsigned I = 0; I < NumScalars; ++I) { + Optional<int> InsertIdx = getInsertIndex(VL[I], 0); + if (!InsertIdx || *InsertIdx == UndefMaskElem) + continue; + unsigned Idx = *InsertIdx; + DemandedElts.setBit(Idx); + if (Idx < Offset) { + Offset = Idx; + IsIdentity &= I == 0; + } else { + assert(Idx >= Offset && "Failed to find vector index offset"); + IsIdentity &= Idx - Offset == I; + } + ShuffleMask[Idx] = I; + } + assert(Offset < NumElts && "Failed to find vector index offset"); + + InstructionCost Cost = 0; + Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, + /*Insert*/ true, /*Extract*/ false); + + if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) { + // FIXME: Replace with SK_InsertSubvector once it is properly supported. + unsigned Sz = PowerOf2Ceil(Offset + NumScalars); + Cost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, + FixedVectorType::get(SrcVecTy->getElementType(), Sz)); + } else if (!IsIdentity) { + Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, + ShuffleMask); } - return DeadCost; + + return Cost; } case Instruction::ZExt: case Instruction::SExt: @@ -3588,7 +3955,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, TTI::getCastContextHint(VL0), CostKind, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } // Calculate the cost of this instruction. @@ -3598,12 +3965,11 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { InstructionCost VecCost = 0; // Check if the values are candidates to demote. 
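// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// The InsertElement cost case above scans the insert indices once, recording
// demanded lanes, the lowest lane (Offset), whether the inserts form an
// identity run, and the dest-lane -> scalar shuffle mask. A simplified mirror
// with a bool vector standing in for the APInt DemandedElts:
#include <cstdio>
#include <vector>

constexpr int kUndefMaskElem = -1;

struct BuildVecInfo {
  std::vector<int> ShuffleMask; // dest lane -> scalar index
  std::vector<bool> Demanded;   // stand-in for APInt DemandedElts
  bool IsIdentity = true;
  unsigned Offset = ~0u;
};

BuildVecInfo analyzeInserts(const std::vector<int> &InsertIdxs,
                            unsigned NumElts) {
  BuildVecInfo R;
  R.ShuffleMask.assign(NumElts, kUndefMaskElem);
  R.Demanded.assign(NumElts, false);
  for (unsigned I = 0, E = (unsigned)InsertIdxs.size(); I < E; ++I) {
    unsigned Idx = (unsigned)InsertIdxs[I];
    R.Demanded[Idx] = true;
    if (Idx < R.Offset) {
      R.Offset = Idx;
      R.IsIdentity &= I == 0; // a lower lane later in VL breaks identity
    } else {
      R.IsIdentity &= Idx - R.Offset == I;
    }
    R.ShuffleMask[Idx] = static_cast<int>(I);
  }
  return R;
}

int main() {
  // Scalars inserted at lanes 4..7 of an 8-wide vector: an identity run at
  // offset 4, so only a subvector-style move is costed, not a full permute.
  BuildVecInfo Info = analyzeInserts({4, 5, 6, 7}, 8);
  std::printf("identity=%d offset=%u\n", Info.IsIdentity, Info.Offset);
}
// ---- [end note] ----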
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { - VecCost = - ReuseShuffleCost + - TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, - TTI::getCastContextHint(VL0), CostKind, VL0); + VecCost = CommonCost + TTI->getCastInstrCost( + E->getOpcode(), VecTy, SrcVecTy, + TTI::getCastContextHint(VL0), CostKind, VL0); } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); return VecCost - ScalarCost; } case Instruction::FCmp: @@ -3614,7 +3980,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; @@ -3655,8 +4021,8 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { CmpInst::BAD_ICMP_PREDICATE, CostKind); VecCost = std::min(VecCost, IntrinsicCost); } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); - return ReuseShuffleCost + VecCost - ScalarCost; + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); + return CommonCost + VecCost - ScalarCost; } case Instruction::FNeg: case Instruction::Add: @@ -3719,14 +4085,14 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); - return ReuseShuffleCost + VecCost - ScalarCost; + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); + return CommonCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { TargetTransformInfo::OperandValueKind Op1VK = @@ -3737,40 +4103,39 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost( Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecCost = TTI->getArithmeticInstrCost( Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); - return ReuseShuffleCost + VecCost - ScalarCost; + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); + return CommonCost + VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. 
- Align alignment = cast<LoadInst>(VL0)->getAlign(); + Align Alignment = cast<LoadInst>(VL0)->getAlign(); InstructionCost ScalarEltCost = TTI->getMemoryOpCost( - Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0); + Instruction::Load, ScalarTy, Alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecLdCost; if (E->State == TreeEntry::Vectorize) { - VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, + VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, Alignment, 0, CostKind, VL0); } else { assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); + Align CommonAlignment = Alignment; + for (Value *V : VL) + CommonAlignment = + commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); VecLdCost = TTI->getGatherScatterOpCost( Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(), - /*VariableMask=*/false, alignment, CostKind, VL0); + /*VariableMask=*/false, CommonAlignment, CostKind, VL0); } - if (!E->ReorderIndices.empty()) { - // TODO: Merge this shuffle with the ReuseShuffleCost. - VecLdCost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, VecTy); - } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost)); - return ReuseShuffleCost + VecLdCost - ScalarLdCost; + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost)); + return CommonCost + VecLdCost - ScalarLdCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. @@ -3780,29 +4145,22 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { Align Alignment = SI->getAlign(); InstructionCost ScalarEltCost = TTI->getMemoryOpCost( Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); - if (NeedToShuffleReuses) - ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost; InstructionCost VecStCost = TTI->getMemoryOpCost( Instruction::Store, VecTy, Alignment, 0, CostKind, VL0); - if (IsReorder) { - // TODO: Merge this shuffle with the ReuseShuffleCost. - VecStCost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, VecTy); - } - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost)); - return ReuseShuffleCost + VecStCost - ScalarStCost; + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecStCost, ScalarStCost)); + return CommonCost + VecStCost - ScalarStCost; } case Instruction::Call: { CallInst *CI = cast<CallInst>(VL0); Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); // Calculate the cost of the scalar and vector calls. 
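// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// The gather-load cost above is now computed with the *common* alignment
// across all scalar loads, since one masked gather must be conservative for
// every lane. LLVM's commonAlignment over two Aligns is essentially a min of
// power-of-two alignments; a standalone mirror of the fold:
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

uint64_t commonAlignment(uint64_t A, uint64_t B) { return std::min(A, B); }

int main() {
  std::vector<uint64_t> LoadAligns = {16, 4, 8}; // per-scalar-load alignments
  uint64_t Common = LoadAligns.front();
  for (uint64_t A : LoadAligns)
    Common = commonAlignment(Common, A);
  std::printf("%llu\n", (unsigned long long)Common); // prints 4
}
// ---- [end note] ----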
- IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1); + IntrinsicCostAttributes CostAttrs(ID, *CI, 1); InstructionCost ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + CommonCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; @@ -3814,7 +4172,7 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { << " (" << VecCallCost << "-" << ScalarCallCost << ")" << " for " << *CI << "\n"); - return ReuseShuffleCost + VecCallCost - ScalarCallCost; + return CommonCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { assert(E->isAltShuffle() && @@ -3827,11 +4185,11 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { Instruction *I = cast<Instruction>(VL[Idx]); - ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind); + CommonCost -= TTI->getInstructionCost(I, CostKind); } for (Value *V : VL) { Instruction *I = cast<Instruction>(V); - ReuseShuffleCost += TTI->getInstructionCost(I, CostKind); + CommonCost += TTI->getInstructionCost(I, CostKind); } } for (Value *V : VL) { @@ -3856,9 +4214,17 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) { VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty, TTI::CastContextHint::None, CostKind); } - VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); - LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost)); - return ReuseShuffleCost + VecCost - ScalarCost; + + SmallVector<int> Mask(E->Scalars.size()); + for (unsigned I = 0, End = E->Scalars.size(); I < End; ++I) { + auto *OpInst = cast<Instruction>(E->Scalars[I]); + assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); + Mask[I] = I + (OpInst->getOpcode() == E->getAltOpcode() ? End : 0); + } + VecCost += + TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask, 0); + LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost, ScalarCost)); + return CommonCost + VecCost - ScalarCost; } default: llvm_unreachable("Unknown instruction"); @@ -3877,10 +4243,20 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { if (VectorizableTree.size() != 2) return false; - // Handle splat and all-constants stores. + // Handle splat and all-constants stores. Also try to vectorize tiny trees + // with the second gather nodes if they have less scalar operands rather than + // the initial tree element (may be profitable to shuffle the second gather) + // or they are extractelements, which form shuffle. + SmallVector<int> Mask; if (VectorizableTree[0]->State == TreeEntry::Vectorize && (allConstant(VectorizableTree[1]->Scalars) || - isSplat(VectorizableTree[1]->Scalars))) + isSplat(VectorizableTree[1]->Scalars) || + (VectorizableTree[1]->State == TreeEntry::NeedToGather && + VectorizableTree[1]->Scalars.size() < + VectorizableTree[0]->Scalars.size()) || + (VectorizableTree[1]->State == TreeEntry::NeedToGather && + VectorizableTree[1]->getOpcode() == Instruction::ExtractElement && + isShuffle(VectorizableTree[1]->Scalars, Mask)))) return true; // Gathering cost would be too much for tiny trees. 
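// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// The alternate-opcode case above stops using a zero "mask" for SK_Select and
// builds the real mask: lane I comes from the main-opcode vector (index I) or
// the alternate-opcode vector (index I + VL.size()). A plain mirror:
#include <cstdio>
#include <vector>

// Lanes: false = main opcode (e.g. fadd), true = alternate (e.g. fsub).
std::vector<int> buildAltShuffleMask(const std::vector<bool> &IsAltLane) {
  int End = static_cast<int>(IsAltLane.size());
  std::vector<int> Mask(End);
  for (int I = 0; I < End; ++I)
    Mask[I] = I + (IsAltLane[I] ? End : 0); // pick from vec0 or vec1
  return Mask;
}

int main() {
  // addsub-style bundle: add, sub, add, sub.
  for (int M : buildAltShuffleMask({false, true, false, true}))
    std::printf("%d ", M); // prints: 0 5 2 7
}
// ---- [end note] ----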
@@ -3892,21 +4268,27 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const { } static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, - TargetTransformInfo *TTI) { + TargetTransformInfo *TTI, + bool MustMatchOrInst) { // Look past the root to find a source value. Arbitrarily follow the // path through operand 0 of any 'or'. Also, peek through optional // shift-left-by-multiple-of-8-bits. Value *ZextLoad = Root; const APInt *ShAmtC; + bool FoundOr = false; while (!isa<ConstantExpr>(ZextLoad) && (match(ZextLoad, m_Or(m_Value(), m_Value())) || (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) && - ShAmtC->urem(8) == 0))) - ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0); - + ShAmtC->urem(8) == 0))) { + auto *BinOp = cast<BinaryOperator>(ZextLoad); + ZextLoad = BinOp->getOperand(0); + if (BinOp->getOpcode() == Instruction::Or) + FoundOr = true; + } // Check if the input is an extended load of the required or/shift expression. Value *LoadPtr; - if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) + if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root || + !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr))))) return false; // Require that the total load bit width is a legal integer type. @@ -3931,7 +4313,8 @@ bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const { unsigned NumElts = VectorizableTree[0]->Scalars.size(); Value *FirstReduced = VectorizableTree[0]->Scalars[0]; - return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI); + return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI, + /* MatchOr */ false); } bool BoUpSLP::isLoadCombineCandidate() const { @@ -3941,13 +4324,19 @@ bool BoUpSLP::isLoadCombineCandidate() const { for (Value *Scalar : VectorizableTree[0]->Scalars) { Value *X; if (!match(Scalar, m_Store(m_Value(X), m_Value())) || - !isLoadCombineCandidateImpl(X, NumElts, TTI)) + !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true)) return false; } return true; } bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const { + // No need to vectorize inserts of gathered values. + if (VectorizableTree.size() == 2 && + isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) && + VectorizableTree[1]->State == TreeEntry::NeedToGather) + return true; + // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. 
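// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// isLoadCombineCandidateImpl (now with the MustMatchOrInst flag) walks down
// operand 0 through 'or's and shifts-by-multiples-of-8 until it hits a
// zext(load); with the flag set it additionally insists at least one 'or' was
// seen. The IR shape it recognizes is the classic byte-load recombination,
// shown here in plain C:
#include <cstdint>
#include <cstdio>

// The scalar pattern the candidate check matches: an 'or' tree of byte loads
// shifted by multiples of 8 that really spells one wide (little-endian) load.
uint32_t loadCombinePattern(const uint8_t *P) {
  return (uint32_t)P[0] | ((uint32_t)P[1] << 8) | ((uint32_t)P[2] << 16) |
         ((uint32_t)P[3] << 24);
}

int main() {
  uint8_t Bytes[4] = {0x78, 0x56, 0x34, 0x12};
  std::printf("0x%08x\n", loadCombinePattern(Bytes)); // 0x12345678
}
// ---- [end note] ----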
if (VectorizableTree.size() >= MinTreeSize) @@ -3991,8 +4380,16 @@ InstructionCost BoUpSLP::getSpillCost() const { continue; OrderedScalars.push_back(Inst); } - llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) { - return DT->dominates(B, A); + llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) { + auto *NodeA = DT->getNode(A->getParent()); + auto *NodeB = DT->getNode(B->getParent()); + assert(NodeA && "Should only process reachable instructions"); + assert(NodeB && "Should only process reachable instructions"); + assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeA != NodeB) + return NodeA->getDFSNumIn() < NodeB->getDFSNumIn(); + return B->comesBefore(A); }); for (Instruction *Inst : OrderedScalars) { @@ -4038,8 +4435,12 @@ InstructionCost BoUpSLP::getSpillCost() const { if (NumCalls) { SmallVector<Type*, 4> V; - for (auto *II : LiveValues) - V.push_back(FixedVectorType::get(II->getType(), BundleWidth)); + for (auto *II : LiveValues) { + auto *ScalarTy = II->getType(); + if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy)) + ScalarTy = VectorTy->getElementType(); + V.push_back(FixedVectorType::get(ScalarTy, BundleWidth)); + } Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V); } @@ -4049,7 +4450,7 @@ InstructionCost BoUpSLP::getSpillCost() const { return Cost; } -InstructionCost BoUpSLP::getTreeCost() { +InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) { InstructionCost Cost = 0; LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size " << VectorizableTree.size() << ".\n"); @@ -4059,28 +4460,7 @@ InstructionCost BoUpSLP::getTreeCost() { for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) { TreeEntry &TE = *VectorizableTree[I].get(); - // We create duplicate tree entries for gather sequences that have multiple - // uses. However, we should not compute the cost of duplicate sequences. - // For example, if we have a build vector (i.e., insertelement sequence) - // that is used by more than one vector instruction, we only need to - // compute the cost of the insertelement instructions once. The redundant - // instructions will be eliminated by CSE. - // - // We should consider not creating duplicate tree entries for gather - // sequences, and instead add additional edges to the tree representing - // their uses. Since such an approach results in fewer total entries, - // existing heuristics based on tree size may yield different results. - // - if (TE.State == TreeEntry::NeedToGather && - std::any_of(std::next(VectorizableTree.begin(), I + 1), - VectorizableTree.end(), - [TE](const std::unique_ptr<TreeEntry> &EntryPtr) { - return EntryPtr->State == TreeEntry::NeedToGather && - EntryPtr->isSame(TE.Scalars); - })) - continue; - - InstructionCost C = getEntryCost(&TE); + InstructionCost C = getEntryCost(&TE, VectorizedVals); Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " << *TE.Scalars[0] @@ -4090,6 +4470,10 @@ InstructionCost BoUpSLP::getTreeCost() { SmallPtrSet<Value *, 16> ExtractCostCalculated; InstructionCost ExtractCost = 0; + SmallVector<unsigned> VF; + SmallVector<SmallVector<int>> ShuffleMask; + SmallVector<Value *> FirstUsers; + SmallVector<APInt> DemandedElts; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. 
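// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// getSpillCost above replaces the dominance-based stable_sort with a strict
// weak order: compare parent blocks by dominator-tree DFS-in number, and
// within one block fall back to B->comesBefore(A), i.e. reverse program
// order. A mirror over a toy instruction type:
#include <algorithm>
#include <cstdio>
#include <vector>

struct Inst {
  int BlockDFSIn; // DFS-in number of the parent block in the dom tree
  int PosInBlock; // position within the block
  const char *Name;
};

int main() {
  std::vector<Inst> Scalars = {{2, 5, "c"}, {1, 7, "b"}, {1, 3, "a"}};
  std::sort(Scalars.begin(), Scalars.end(), [](const Inst &A, const Inst &B) {
    if (A.BlockDFSIn != B.BlockDFSIn)
      return A.BlockDFSIn < B.BlockDFSIn;
    return B.PosInBlock < A.PosInBlock; // mirrors B->comesBefore(A)
  });
  for (const Inst &I : Scalars)
    std::printf("%s ", I.Name); // prints: b a c
}
// ---- [end note] ----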
if (!ExtractCostCalculated.insert(EU.Scalar).second) @@ -4101,6 +4485,57 @@ InstructionCost BoUpSLP::getTreeCost() { if (EphValues.count(EU.User)) continue; + // No extract cost for vector "scalar" + if (isa<FixedVectorType>(EU.Scalar->getType())) + continue; + + // Already counted the cost for external uses when tried to adjust the cost + // for extractelements, no need to add it again. + if (isa<ExtractElementInst>(EU.Scalar)) + continue; + + // If found user is an insertelement, do not calculate extract cost but try + // to detect it as a final shuffled/identity match. + if (EU.User && isa<InsertElementInst>(EU.User)) { + if (auto *FTy = dyn_cast<FixedVectorType>(EU.User->getType())) { + Optional<int> InsertIdx = getInsertIndex(EU.User, 0); + if (!InsertIdx || *InsertIdx == UndefMaskElem) + continue; + Value *VU = EU.User; + auto *It = find_if(FirstUsers, [VU](Value *V) { + // Checks if 2 insertelements are from the same buildvector. + if (VU->getType() != V->getType()) + return false; + auto *IE1 = cast<InsertElementInst>(VU); + auto *IE2 = cast<InsertElementInst>(V); + // Go though of insertelement instructions trying to find either VU as + // the original vector for IE2 or V as the original vector for IE1. + do { + if (IE1 == VU || IE2 == V) + return true; + if (IE1) + IE1 = dyn_cast<InsertElementInst>(IE1->getOperand(0)); + if (IE2) + IE2 = dyn_cast<InsertElementInst>(IE2->getOperand(0)); + } while (IE1 || IE2); + return false; + }); + int VecId = -1; + if (It == FirstUsers.end()) { + VF.push_back(FTy->getNumElements()); + ShuffleMask.emplace_back(VF.back(), UndefMaskElem); + FirstUsers.push_back(EU.User); + DemandedElts.push_back(APInt::getNullValue(VF.back())); + VecId = FirstUsers.size() - 1; + } else { + VecId = std::distance(FirstUsers.begin(), It); + } + int Idx = *InsertIdx; + ShuffleMask[VecId][Idx] = EU.Lane; + DemandedElts[VecId].setBit(Idx); + } + } + // If we plan to rewrite the tree in a smaller type, we will need to sign // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. @@ -4121,6 +4556,48 @@ InstructionCost BoUpSLP::getTreeCost() { InstructionCost SpillCost = getSpillCost(); Cost += SpillCost + ExtractCost; + for (int I = 0, E = FirstUsers.size(); I < E; ++I) { + // For the very first element - simple shuffle of the source vector. + int Limit = ShuffleMask[I].size() * 2; + if (I == 0 && + all_of(ShuffleMask[I], [Limit](int Idx) { return Idx < Limit; }) && + !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) { + InstructionCost C = TTI->getShuffleCost( + TTI::SK_PermuteSingleSrc, + cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]); + LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement external users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + continue; + } + // Other elements - permutation of 2 vectors (the initial one and the next + // Ith incoming vector). + unsigned VF = ShuffleMask[I].size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) { + int &Mask = ShuffleMask[I][Idx]; + Mask = Mask == UndefMaskElem ? 
Idx : VF + Mask; + } + InstructionCost C = TTI->getShuffleCost( + TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()), + ShuffleMask[I]); + LLVM_DEBUG( + dbgs() + << "SLP: Adding cost " << C + << " for final shuffle of vector node and external insertelement users " + << *VectorizableTree.front()->Scalars.front() << ".\n" + << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + InstructionCost InsertCost = TTI->getScalarizationOverhead( + cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I], + /*Insert*/ true, + /*Extract*/ false); + Cost -= InsertCost; + LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost + << " for insertelements gather.\n" + << "SLP: Current total cost = " << Cost << "\n"); + } #ifndef NDEBUG SmallString<256> Str; @@ -4138,6 +4615,146 @@ InstructionCost BoUpSLP::getTreeCost() { return Cost; } +Optional<TargetTransformInfo::ShuffleKind> +BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, SmallVectorImpl<int> &Mask, + SmallVectorImpl<const TreeEntry *> &Entries) { + // TODO: currently checking only for Scalars in the tree entry, need to count + // reused elements too for better cost estimation. + Mask.assign(TE->Scalars.size(), UndefMaskElem); + Entries.clear(); + // Build a lists of values to tree entries. + DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>> ValueToTEs; + for (const std::unique_ptr<TreeEntry> &EntryPtr : VectorizableTree) { + if (EntryPtr.get() == TE) + break; + if (EntryPtr->State != TreeEntry::NeedToGather) + continue; + for (Value *V : EntryPtr->Scalars) + ValueToTEs.try_emplace(V).first->getSecond().insert(EntryPtr.get()); + } + // Find all tree entries used by the gathered values. If no common entries + // found - not a shuffle. + // Here we build a set of tree nodes for each gathered value and trying to + // find the intersection between these sets. If we have at least one common + // tree node for each gathered value - we have just a permutation of the + // single vector. If we have 2 different sets, we're in situation where we + // have a permutation of 2 input vectors. + SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs; + DenseMap<Value *, int> UsedValuesEntry; + for (Value *V : TE->Scalars) { + if (isa<UndefValue>(V)) + continue; + // Build a list of tree entries where V is used. + SmallPtrSet<const TreeEntry *, 4> VToTEs; + auto It = ValueToTEs.find(V); + if (It != ValueToTEs.end()) + VToTEs = It->second; + if (const TreeEntry *VTE = getTreeEntry(V)) + VToTEs.insert(VTE); + if (VToTEs.empty()) + return None; + if (UsedTEs.empty()) { + // The first iteration, just insert the list of nodes to vector. + UsedTEs.push_back(VToTEs); + } else { + // Need to check if there are any previously used tree nodes which use V. + // If there are no such nodes, consider that we have another one input + // vector. + SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs); + unsigned Idx = 0; + for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) { + // Do we have a non-empty intersection of previously listed tree entries + // and tree entries using current V? + set_intersect(VToTEs, Set); + if (!VToTEs.empty()) { + // Yes, write the new subset and continue analysis for the next + // scalar. + Set.swap(VToTEs); + break; + } + VToTEs = SavedVToTEs; + ++Idx; + } + // No non-empty intersection found - need to add a second set of possible + // source vectors. + if (Idx == UsedTEs.size()) { + // If the number of input vectors is greater than 2 - not a permutation, + // fallback to the regular gather. 
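// ---- [editor's note: illustrative standalone sketch, not part of this patch] ----
// isGatherShuffledEntry (below) decides whether a gather can instead be a
// permutation of one or two already-built nodes: for each gathered scalar it
// collects the set of tree entries containing it, then keeps intersecting
// those sets. One surviving set = single-source shuffle; two disjoint sets =
// two-source shuffle; more = give up. A set-based mirror of that core idea
// (countSourceVectors is a hypothetical stand-in):
#include <cstdio>
#include <set>
#include <vector>

int countSourceVectors(const std::vector<std::set<int>> &EntriesPerScalar) {
  std::vector<std::set<int>> UsedTEs;
  for (const std::set<int> &VToTEs : EntriesPerScalar) {
    bool Merged = false;
    for (std::set<int> &Set : UsedTEs) {
      std::set<int> Common;
      for (int TE : VToTEs)
        if (Set.count(TE))
          Common.insert(TE);
      if (!Common.empty()) { // non-empty intersection: narrow this set
        Set = Common;
        Merged = true;
        break;
      }
    }
    if (!Merged)
      UsedTEs.push_back(VToTEs); // a new candidate source vector
    if (UsedTEs.size() > 2)
      return -1; // more than two sources: fall back to a regular gather
  }
  return static_cast<int>(UsedTEs.size());
}

int main() {
  // Scalars found in entries {1}, {1,2}, {2}: the third scalar forces a
  // second source, so this becomes a two-source permutation.
  std::printf("%d\n", countSourceVectors({{1}, {1, 2}, {2}})); // prints 2
}
// ---- [end note] ----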
+ if (UsedTEs.size() == 2) + return None; + UsedTEs.push_back(SavedVToTEs); + Idx = UsedTEs.size() - 1; + } + UsedValuesEntry.try_emplace(V, Idx); + } + } + + unsigned VF = 0; + if (UsedTEs.size() == 1) { + // Try to find the perfect match in another gather node at first. + auto It = find_if(UsedTEs.front(), [TE](const TreeEntry *EntryPtr) { + return EntryPtr->isSame(TE->Scalars); + }); + if (It != UsedTEs.front().end()) { + Entries.push_back(*It); + std::iota(Mask.begin(), Mask.end(), 0); + return TargetTransformInfo::SK_PermuteSingleSrc; + } + // No perfect match, just shuffle, so choose the first tree node. + Entries.push_back(*UsedTEs.front().begin()); + } else { + // Try to find nodes with the same vector factor. + assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries."); + // FIXME: Shall be replaced by GetVF function once non-power-2 patch is + // landed. + auto &&GetVF = [](const TreeEntry *TE) { + if (!TE->ReuseShuffleIndices.empty()) + return TE->ReuseShuffleIndices.size(); + return TE->Scalars.size(); + }; + DenseMap<int, const TreeEntry *> VFToTE; + for (const TreeEntry *TE : UsedTEs.front()) + VFToTE.try_emplace(GetVF(TE), TE); + for (const TreeEntry *TE : UsedTEs.back()) { + auto It = VFToTE.find(GetVF(TE)); + if (It != VFToTE.end()) { + VF = It->first; + Entries.push_back(It->second); + Entries.push_back(TE); + break; + } + } + // No 2 source vectors with the same vector factor - give up and do regular + // gather. + if (Entries.empty()) + return None; + } + + // Build a shuffle mask for better cost estimation and vector emission. + for (int I = 0, E = TE->Scalars.size(); I < E; ++I) { + Value *V = TE->Scalars[I]; + if (isa<UndefValue>(V)) + continue; + unsigned Idx = UsedValuesEntry.lookup(V); + const TreeEntry *VTE = Entries[Idx]; + int FoundLane = VTE->findLaneForValue(V); + Mask[I] = Idx * VF + FoundLane; + // Extra check required by isSingleSourceMaskImpl function (called by + // ShuffleVectorInst::isSingleSourceMask). + if (Mask[I] >= 2 * E) + return None; + } + switch (Entries.size()) { + case 1: + return TargetTransformInfo::SK_PermuteSingleSrc; + case 2: + return TargetTransformInfo::SK_PermuteTwoSrc; + default: + break; + } + return None; +} + InstructionCost BoUpSLP::getGatherCost(FixedVectorType *Ty, const DenseSet<unsigned> &ShuffledIndices) const { @@ -4168,6 +4785,8 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const { // Iterate in reverse order to consider insert elements with the high cost. for (unsigned I = VL.size(); I > 0; --I) { unsigned Idx = I - 1; + if (isConstant(VL[Idx])) + continue; if (!UniqueElements.insert(VL[Idx]).second) ShuffledElements.insert(Idx); } @@ -4191,7 +4810,7 @@ void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL, Right = Ops.getVL(1); } -void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { +void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. auto *Front = E->getMainOp(); @@ -4253,83 +4872,224 @@ void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { } Value *BoUpSLP::gather(ArrayRef<Value *> VL) { - Value *Val0 = - isa<StoreInst>(VL[0]) ? 
cast<StoreInst>(VL[0])->getValueOperand() : VL[0]; - FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size()); - Value *Vec = PoisonValue::get(VecTy); - unsigned InsIndex = 0; - for (Value *Val : VL) { - Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++)); + // List of instructions/lanes from the current block and/or the blocks which + // are part of the current loop. These instructions will be inserted at the + // end to make it possible to optimize loops and hoist invariant instructions + // out of the loop body with better chances for success. + SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts; + SmallSet<int, 4> PostponedIndices; + Loop *L = LI->getLoopFor(Builder.GetInsertBlock()); + auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) { + SmallPtrSet<BasicBlock *, 4> Visited; + while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second) + InsertBB = InsertBB->getSinglePredecessor(); + return InsertBB && InsertBB == InstBB; + }; + for (int I = 0, E = VL.size(); I < E; ++I) { + if (auto *Inst = dyn_cast<Instruction>(VL[I])) + if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) || + getTreeEntry(Inst) || (L && (L->contains(Inst)))) && + PostponedIndices.insert(I).second) + PostponedInsts.emplace_back(Inst, I); + } + + auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos) { + Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(Pos)); auto *InsElt = dyn_cast<InsertElementInst>(Vec); if (!InsElt) - continue; + return Vec; GatherSeq.insert(InsElt); CSEBlocks.insert(InsElt->getParent()); // Add to our 'need-to-extract' list. - if (TreeEntry *Entry = getTreeEntry(Val)) { + if (TreeEntry *Entry = getTreeEntry(V)) { // Find which lane we need to extract. - unsigned FoundLane = std::distance(Entry->Scalars.begin(), - find(Entry->Scalars, Val)); - assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane"); - if (!Entry->ReuseShuffleIndices.empty()) { - FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), - find(Entry->ReuseShuffleIndices, FoundLane)); - } - ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane)); + unsigned FoundLane = Entry->findLaneForValue(V); + ExternalUses.emplace_back(V, InsElt, FoundLane); + } + return Vec; + }; + Value *Val0 = + isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0]; + FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size()); + Value *Vec = PoisonValue::get(VecTy); + SmallVector<int> NonConsts; + // Insert the constant values first. + for (int I = 0, E = VL.size(); I < E; ++I) { + if (PostponedIndices.contains(I)) + continue; + if (!isConstant(VL[I])) { + NonConsts.push_back(I); + continue; } + Vec = CreateInsertElement(Vec, VL[I], I); } + // Insert non-constant values. + for (int I : NonConsts) + Vec = CreateInsertElement(Vec, VL[I], I); + // Append the instructions which are/may be part of the loop at the end to + // make it possible to hoist non-loop-based instructions. + for (const std::pair<Value *, unsigned> &Pair : PostponedInsts) + Vec = CreateInsertElement(Vec, Pair.first, Pair.second); return Vec; } +namespace { +/// Merges shuffle masks and emits the final shuffle instruction, if required.
+class ShuffleInstructionBuilder { + IRBuilderBase &Builder; + const unsigned VF = 0; + bool IsFinalized = false; + SmallVector<int, 4> Mask; + +public: + ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF) + : Builder(Builder), VF(VF) {} + + /// Adds a mask, inverting it before applying. + void addInversedMask(ArrayRef<unsigned> SubMask) { + if (SubMask.empty()) + return; + SmallVector<int, 4> NewMask; + inversePermutation(SubMask, NewMask); + addMask(NewMask); + } + + /// Adds masks, merging them into a single one. + void addMask(ArrayRef<unsigned> SubMask) { + SmallVector<int, 4> NewMask(SubMask.begin(), SubMask.end()); + addMask(NewMask); + } + + void addMask(ArrayRef<int> SubMask) { ::addMask(Mask, SubMask); } + + Value *finalize(Value *V) { + IsFinalized = true; + unsigned ValueVF = cast<FixedVectorType>(V->getType())->getNumElements(); + if (VF == ValueVF && Mask.empty()) + return V; + SmallVector<int, 4> NormalizedMask(VF, UndefMaskElem); + std::iota(NormalizedMask.begin(), NormalizedMask.end(), 0); + addMask(NormalizedMask); + + if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask)) + return V; + return Builder.CreateShuffleVector(V, Mask, "shuffle"); + } + + ~ShuffleInstructionBuilder() { + assert((IsFinalized || Mask.empty()) && + "Shuffle construction must be finalized."); + } +}; +} // namespace + Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { + unsigned VF = VL.size(); InstructionsState S = getSameOpcode(VL); if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { + if (TreeEntry *E = getTreeEntry(S.OpValue)) if (E->isSame(VL)) { Value *V = vectorizeTree(E); - if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { + if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) { + if (!E->ReuseShuffleIndices.empty()) { // Reshuffle to get only unique values. - SmallVector<int, 4> UniqueIdxs; + // If some of the scalars are duplicated in the vectorization tree + // entry, we do not vectorize them but instead generate a mask for + // the reuses. But if there are several users of the same entry, + // they may have different vectorization factors. This is especially + // important for PHI nodes. In this case, we need to adapt the + // resulting instruction for the user vectorization factor and have + // to reshuffle it again to take only unique elements of the vector. + // Without this code the function incorrectly returns a reduced + // vector instruction with the same elements, not the unique ones. + + // block: + // %phi = phi <2 x > { .., %entry} {%shuffle, %block} + // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1> + // ...
(use %2) + // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2} + // br %block + SmallVector<int> UniqueIdxs; SmallSet<int, 4> UsedIdxs; - for (int Idx : E->ReuseShuffleIndices) - if (UsedIdxs.insert(Idx).second) - UniqueIdxs.emplace_back(Idx); - V = Builder.CreateShuffleVector(V, UniqueIdxs); + int Pos = 0; + int Sz = VL.size(); + for (int Idx : E->ReuseShuffleIndices) { + if (Idx != Sz && UsedIdxs.insert(Idx).second) + UniqueIdxs.emplace_back(Pos); + ++Pos; + } + assert(VF >= UsedIdxs.size() && "Expected vectorization factor " + "less than original vector size."); + UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem); + V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle"); + } else { + assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() && + "Expected vectorization factor less " + "than original vector size."); + SmallVector<int> UniformMask(VF, 0); + std::iota(UniformMask.begin(), UniformMask.end(), 0); + V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle"); } } return V; } - } } // Check that every instruction appears once in this bundle. - SmallVector<int, 4> ReuseShuffleIndicies; - SmallVector<Value *, 4> UniqueValues; + SmallVector<int> ReuseShuffleIndicies; + SmallVector<Value *> UniqueValues; if (VL.size() > 2) { DenseMap<Value *, unsigned> UniquePositions; - for (Value *V : VL) { + unsigned NumValues = + std::distance(VL.begin(), find_if(reverse(VL), [](Value *V) { + return !isa<UndefValue>(V); + }).base()); + VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues)); + int UniqueVals = 0; + bool HasUndefs = false; + for (Value *V : VL.drop_back(VL.size() - VF)) { + if (isa<UndefValue>(V)) { + ReuseShuffleIndicies.emplace_back(UndefMaskElem); + HasUndefs = true; + continue; + } + if (isConstant(V)) { + ReuseShuffleIndicies.emplace_back(UniqueValues.size()); + UniqueValues.emplace_back(V); + continue; + } auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second || isa<Constant>(V)) + if (Res.second) { UniqueValues.emplace_back(V); + ++UniqueVals; + } } - // Do not shuffle single element or if number of unique values is not power - // of 2. - if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 || - !llvm::isPowerOf2_32(UniqueValues.size())) + if (HasUndefs && UniqueVals == 1 && UniqueValues.size() == 1) { + // Emit pure splat vector. + // FIXME: why it is not identified as an identity. 
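For reference, this is how a reuse mask is consumed once built. A small standalone model, assuming plain std:: types and int-valued lanes (a hypothetical helper, not the patch's code): each mask element names the lane of the deduplicated vector to read from, with -1 standing for UndefMaskElem, mirroring shufflevector mask semantics.

#include <optional>
#include <vector>

std::vector<std::optional<int>>
applyReuseMask(const std::vector<int> &Unique, const std::vector<int> &Mask) {
  std::vector<std::optional<int>> Out;
  for (int Idx : Mask)
    // -1 (UndefMaskElem) leaves the lane undefined; anything else reads a
    // lane of the deduplicated source vector.
    Out.push_back(Idx < 0 ? std::nullopt : std::optional<int>(Unique[Idx]));
  return Out;
}
// E.g. Unique = {5, 7} with Mask = {0, 0, 1, -1} rebuilds {5, 5, 7, undef}:
// a splat-like reuse of lane 0 plus one undef lane.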
+ unsigned NumUndefs = count(ReuseShuffleIndicies, UndefMaskElem); + if (NumUndefs == ReuseShuffleIndicies.size() - 1) + ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(), + UndefMaskElem); + else + ReuseShuffleIndicies.assign(VF, 0); + } else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) { ReuseShuffleIndicies.clear(); - else - VL = UniqueValues; + UniqueValues.clear(); + UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues)); + } + UniqueValues.append(VF - UniqueValues.size(), + PoisonValue::get(VL[0]->getType())); + VL = UniqueValues; } + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); Value *Vec = gather(VL); if (!ReuseShuffleIndicies.empty()) { - Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle"); + ShuffleBuilder.addMask(ReuseShuffleIndicies); + Vec = ShuffleBuilder.finalize(Vec); if (auto *I = dyn_cast<Instruction>(Vec)) { GatherSeq.insert(I); CSEBlocks.insert(I->getParent()); @@ -4347,11 +5107,28 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); + unsigned VF = E->Scalars.size(); + if (NeedToShuffleReuses) + VF = E->ReuseShuffleIndices.size(); + ShuffleInstructionBuilder ShuffleBuilder(Builder, VF); if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); - Value *Vec = gather(E->Scalars); + Value *Vec; + SmallVector<int> Mask; + SmallVector<const TreeEntry *> Entries; + Optional<TargetTransformInfo::ShuffleKind> Shuffle = + isGatherShuffledEntry(E, Mask, Entries); + if (Shuffle.hasValue()) { + assert((Entries.size() == 1 || Entries.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + Vec = Builder.CreateShuffleVector(Entries.front()->VectorizedValue, + Entries.back()->VectorizedValue, Mask); + } else { + Vec = gather(E->Scalars); + } if (NeedToShuffleReuses) { - Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + Vec = ShuffleBuilder.finalize(Vec); if (auto *I = dyn_cast<Instruction>(Vec)) { GatherSeq.insert(I); CSEBlocks.insert(I->getParent()); @@ -4370,6 +5147,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Type *ScalarTy = VL0->getType(); if (auto *Store = dyn_cast<StoreInst>(VL0)) ScalarTy = Store->getValueOperand()->getType(); + else if (auto *IE = dyn_cast<InsertElementInst>(VL0)) + ScalarTy = IE->getOperand(1)->getType(); auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { @@ -4409,18 +5188,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { case Instruction::ExtractElement: { Value *V = E->getSingleOperand(0); - if (!E->ReorderIndices.empty()) { - SmallVector<int, 4> Mask; - inversePermutation(E->ReorderIndices, Mask); - Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); - } - if (NeedToShuffleReuses) { - // TODO: Merge this shuffle with the ReorderShuffleMask. 
- if (E->ReorderIndices.empty()) - Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - } + Builder.SetInsertPoint(VL0); + ShuffleBuilder.addInversedMask(E->ReorderIndices); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; return V; } @@ -4431,19 +5202,76 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); Value *NewV = propagateMetadata(V, E->Scalars); - if (!E->ReorderIndices.empty()) { - SmallVector<int, 4> Mask; - inversePermutation(E->ReorderIndices, Mask); - NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle"); - } - if (NeedToShuffleReuses) { - // TODO: Merge this shuffle with the ReorderShuffleMask. - NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices, - "shuffle"); - } + ShuffleBuilder.addInversedMask(E->ReorderIndices); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + NewV = ShuffleBuilder.finalize(NewV); E->VectorizedValue = NewV; return NewV; } + case Instruction::InsertElement: { + Builder.SetInsertPoint(VL0); + Value *V = vectorizeTree(E->getOperand(1)); + + const unsigned NumElts = + cast<FixedVectorType>(VL0->getType())->getNumElements(); + const unsigned NumScalars = E->Scalars.size(); + + // Create InsertVector shuffle if necessary + Instruction *FirstInsert = nullptr; + bool IsIdentity = true; + unsigned Offset = UINT_MAX; + for (unsigned I = 0; I < NumScalars; ++I) { + Value *Scalar = E->Scalars[I]; + if (!FirstInsert && + !is_contained(E->Scalars, cast<Instruction>(Scalar)->getOperand(0))) + FirstInsert = cast<Instruction>(Scalar); + Optional<int> InsertIdx = getInsertIndex(Scalar, 0); + if (!InsertIdx || *InsertIdx == UndefMaskElem) + continue; + unsigned Idx = *InsertIdx; + if (Idx < Offset) { + Offset = Idx; + IsIdentity &= I == 0; + } else { + assert(Idx >= Offset && "Failed to find vector index offset"); + IsIdentity &= Idx - Offset == I; + } + } + assert(Offset < NumElts && "Failed to find vector index offset"); + + // Create shuffle to resize vector + SmallVector<int> Mask(NumElts, UndefMaskElem); + if (!IsIdentity) { + for (unsigned I = 0; I < NumScalars; ++I) { + Value *Scalar = E->Scalars[I]; + Optional<int> InsertIdx = getInsertIndex(Scalar, 0); + if (!InsertIdx || *InsertIdx == UndefMaskElem) + continue; + Mask[*InsertIdx - Offset] = I; + } + } else { + std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + } + if (!IsIdentity || NumElts != NumScalars) + V = Builder.CreateShuffleVector(V, Mask); + + if (NumElts != NumScalars) { + SmallVector<int> InsertMask(NumElts); + std::iota(InsertMask.begin(), InsertMask.end(), 0); + for (unsigned I = 0; I < NumElts; I++) { + if (Mask[I] != UndefMaskElem) + InsertMask[Offset + I] = NumElts + I; + } + + V = Builder.CreateShuffleVector( + FirstInsert->getOperand(0), V, InsertMask, + cast<Instruction>(E->Scalars.back())->getName()); + } + + ++NumVectorInstructions; + E->VectorizedValue = V; + return V; + } case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -4467,8 +5295,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *CI = cast<CastInst>(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; 
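The InsertElement case above rebases all constant insert positions by the smallest one (Offset) and scatters scalar lane I to position InsertIdx[I] - Offset. Here is a standalone sketch of just that mask construction, under the simplifying assumption that every insert index is a known constant (names are illustrative, not the patch's):

#include <algorithm>
#include <vector>

std::vector<int> buildInsertMask(const std::vector<unsigned> &InsertIdx,
                                 unsigned NumElts) {
  unsigned Offset = *std::min_element(InsertIdx.begin(), InsertIdx.end());
  std::vector<int> Mask(NumElts, -1); // -1 ~ UndefMaskElem
  for (unsigned I = 0; I < InsertIdx.size(); ++I)
    Mask[InsertIdx[I] - Offset] = static_cast<int>(I); // scatter lane I
  return Mask;
}
// InsertIdx = {2, 3, 4, 5}, NumElts = 8 -> Offset = 2 and
// Mask = {0, 1, 2, 3, -1, -1, -1, -1}; the follow-up two-source shuffle in
// the patch then blends this resized vector into the destination vector.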
++NumVectorInstructions; @@ -4489,8 +5317,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate(); Value *V = Builder.CreateCmp(P0, L, R); propagateIRFlags(V, E->Scalars, VL0); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4509,8 +5337,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Value *V = Builder.CreateSelect(Cond, True, False); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4532,8 +5360,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4575,8 +5403,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (auto *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4614,19 +5442,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { for (Value *V : E->Scalars) CommonAlignment = commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign()); - NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment); + NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment); } Value *V = propagateMetadata(NewLI, E->Scalars); - if (IsReorder) { - SmallVector<int, 4> Mask; - inversePermutation(E->ReorderIndices, Mask); - V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle"); - } - if (NeedToShuffleReuses) { - // TODO: Merge this shuffle with the ReorderShuffleMask. 
- V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); - } + ShuffleBuilder.addInversedMask(E->ReorderIndices); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -4640,11 +5462,9 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { setInsertPointAfterBundle(E); Value *VecValue = vectorizeTree(E->getOperand(0)); - if (IsReorder) { - SmallVector<int, 4> Mask(E->ReorderIndices.begin(), - E->ReorderIndices.end()); - VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf"); - } + ShuffleBuilder.addMask(E->ReorderIndices); + VecValue = ShuffleBuilder.finalize(VecValue); + Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast( ScalarPtr, VecValue->getType()->getPointerTo(AS)); @@ -4658,8 +5478,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0)); Value *V = propagateMetadata(ST, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4697,8 +5515,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4721,6 +5539,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *ScalarArg = nullptr; std::vector<Value *> OpVecs; + SmallVector<Type *, 2> TysForDecl = + {FixedVectorType::get(CI->getType(), E->Scalars.size())}; for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) { ValueList OpVL; // Some intrinsics have scalar arguments. This argument should not be @@ -4729,6 +5549,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { CallInst *CEI = cast<CallInst>(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); + if (hasVectorInstrinsicOverloadedScalarOpd(IID, j)) + TysForDecl.push_back(ScalarArg->getType()); continue; } @@ -4745,8 +5567,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { - Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())}; - CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl); } SmallVector<OperandBundleDef, 1> OpBundles; @@ -4760,8 +5581,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0)); propagateIRFlags(V, E->Scalars, VL0); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4807,17 +5628,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { // Also, gather up main and alt scalar ops to propagate IR flags to // each vector operation. 
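The loop that follows assigns Mask[I] = I for main-opcode lanes and Mask[I] = Sz + I for alternate-opcode lanes; it relies on the two-source shufflevector convention modeled in this small standalone sketch (hypothetical helper over std:: types):

#include <vector>

std::vector<int> blend(const std::vector<int> &V0, const std::vector<int> &V1,
                       const std::vector<int> &Mask) {
  const int Sz = static_cast<int>(V0.size());
  std::vector<int> Out;
  for (int M : Mask)
    // Elements below Sz select from the first source, elements >= Sz select
    // lane (M - Sz) from the second source.
    Out.push_back(M < Sz ? V0[M] : V1[M - Sz]);
  return Out;
}
// With Sz = 4 and Mask = {0, 5, 2, 7}, lanes 0 and 2 come from the main
// (e.g. add) vector and lanes 1 and 3 from the alternate (e.g. sub) vector.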
ValueList OpScalars, AltScalars; - unsigned e = E->Scalars.size(); - SmallVector<int, 8> Mask(e); - for (unsigned i = 0; i < e; ++i) { - auto *OpInst = cast<Instruction>(E->Scalars[i]); + unsigned Sz = E->Scalars.size(); + SmallVector<int> Mask(Sz); + for (unsigned I = 0; I < Sz; ++I) { + auto *OpInst = cast<Instruction>(E->Scalars[I]); assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); if (OpInst->getOpcode() == E->getAltOpcode()) { - Mask[i] = e + i; - AltScalars.push_back(E->Scalars[i]); + Mask[I] = Sz + I; + AltScalars.push_back(E->Scalars[I]); } else { - Mask[i] = i; - OpScalars.push_back(E->Scalars[i]); + Mask[I] = I; + OpScalars.push_back(E->Scalars[I]); } } @@ -4827,8 +5648,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast<Instruction>(V)) V = propagateMetadata(I, E->Scalars); - if (NeedToShuffleReuses) - V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle"); + ShuffleBuilder.addMask(E->ReuseShuffleIndices); + V = ShuffleBuilder.finalize(V); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4861,8 +5682,14 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { // sign extend the extracted values below. auto *ScalarRoot = VectorizableTree[0]->Scalars[0]; if (MinBWs.count(ScalarRoot)) { - if (auto *I = dyn_cast<Instruction>(VectorRoot)) - Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); + if (auto *I = dyn_cast<Instruction>(VectorRoot)) { + // If current instr is a phi and not the last phi, insert it after the + // last phi node. + if (isa<PHINode>(I)) + Builder.SetInsertPoint(&*I->getParent()->getFirstInsertionPt()); + else + Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); + } auto BundleWidth = VectorizableTree[0]->Scalars.size(); auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto *VecTy = FixedVectorType::get(MinTy, BundleWidth); @@ -4873,16 +5700,6 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n"); - // If necessary, sign-extend or zero-extend ScalarRoot to the larger type - // specified by ScalarType. - auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) { - if (!MinBWs.count(ScalarRoot)) - return Ex; - if (MinBWs[ScalarRoot].second) - return Builder.CreateSExt(Ex, ScalarType); - return Builder.CreateZExt(Ex, ScalarType); - }; - // Extract all of the elements with the external uses. for (const auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; @@ -4901,6 +5718,29 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { assert(Vec && "Can't find vectorizable value"); Value *Lane = Builder.getInt32(ExternalUse.Lane); + auto ExtractAndExtendIfNeeded = [&](Value *Vec) { + if (Scalar->getType() != Vec->getType()) { + Value *Ex; + // "Reuse" the existing extract to improve final codegen. + if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) { + Ex = Builder.CreateExtractElement(ES->getOperand(0), + ES->getOperand(1)); + } else { + Ex = Builder.CreateExtractElement(Vec, Lane); + } + // If necessary, sign-extend or zero-extend ScalarRoot + // to the larger type. 
+ if (!MinBWs.count(ScalarRoot)) + return Ex; + if (MinBWs[ScalarRoot].second) + return Builder.CreateSExt(Ex, Scalar->getType()); + return Builder.CreateZExt(Ex, Scalar->getType()); + } + assert(isa<FixedVectorType>(Scalar->getType()) && + isa<InsertElementInst>(Scalar) && + "In-tree scalar of vector type is not insertelement?"); + return Vec; + }; // If User == nullptr, the Scalar is used as extra arg. Generate // ExtractElement instruction and update the record for this scalar in // ExternallyUsedValues. @@ -4914,14 +5754,16 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } else { Builder.SetInsertPoint(&F->getEntryBlock().front()); } - Value *Ex = Builder.CreateExtractElement(Vec, Lane); - Ex = extend(ScalarRoot, Ex, Scalar->getType()); + Value *NewInst = ExtractAndExtendIfNeeded(Vec); CSEBlocks.insert(cast<Instruction>(Scalar)->getParent()); - auto &Locs = ExternallyUsedValues[Scalar]; - ExternallyUsedValues.insert({Ex, Locs}); + auto &NewInstLocs = ExternallyUsedValues[NewInst]; + auto It = ExternallyUsedValues.find(Scalar); + assert(It != ExternallyUsedValues.end() && + "Externally used scalar is not found in ExternallyUsedValues"); + NewInstLocs.append(It->second); ExternallyUsedValues.erase(Scalar); // Required to update internally referenced instructions. - Scalar->replaceAllUsesWith(Ex); + Scalar->replaceAllUsesWith(NewInst); continue; } @@ -4939,25 +5781,22 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { } else { Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator()); } - Value *Ex = Builder.CreateExtractElement(Vec, Lane); - Ex = extend(ScalarRoot, Ex, Scalar->getType()); + Value *NewInst = ExtractAndExtendIfNeeded(Vec); CSEBlocks.insert(PH->getIncomingBlock(i)); - PH->setOperand(i, Ex); + PH->setOperand(i, NewInst); } } } else { Builder.SetInsertPoint(cast<Instruction>(User)); - Value *Ex = Builder.CreateExtractElement(Vec, Lane); - Ex = extend(ScalarRoot, Ex, Scalar->getType()); + Value *NewInst = ExtractAndExtendIfNeeded(Vec); CSEBlocks.insert(cast<Instruction>(User)->getParent()); - User->replaceUsesOfWith(Scalar, Ex); + User->replaceUsesOfWith(Scalar, NewInst); } } else { Builder.SetInsertPoint(&F->getEntryBlock().front()); - Value *Ex = Builder.CreateExtractElement(Vec, Lane); - Ex = extend(ScalarRoot, Ex, Scalar->getType()); + Value *NewInst = ExtractAndExtendIfNeeded(Vec); CSEBlocks.insert(&F->getEntryBlock()); - User->replaceUsesOfWith(Scalar, Ex); + User->replaceUsesOfWith(Scalar, NewInst); } LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n"); @@ -5043,10 +5882,11 @@ void BoUpSLP::optimizeGatherSequence() { // Sort blocks by domination. This ensures we visit a block after all blocks // dominating it are visited. - llvm::stable_sort(CSEWorkList, - [this](const DomTreeNode *A, const DomTreeNode *B) { - return DT->properlyDominates(A, B); - }); + llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) { + assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + return A->getDFSNumIn() < B->getDFSNumIn(); + }); // Perform O(N^2) search over the gather sequences and merge identical // instructions. 
TODO: We can further optimize this scan if we split the @@ -5091,7 +5931,7 @@ void BoUpSLP::optimizeGatherSequence() { Optional<BoUpSLP::ScheduleData *> BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, const InstructionsState &S) { - if (isa<PHINode>(S.OpValue)) + if (isa<PHINode>(S.OpValue) || isa<InsertElementInst>(S.OpValue)) return nullptr; // Initialize the instruction bundle. @@ -5101,11 +5941,53 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, bool ReSchedule = false; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); + auto &&TryScheduleBundle = [this, OldScheduleEnd, SLP](bool ReSchedule, + ScheduleData *Bundle) { + // The scheduling region got new instructions at the lower end (or it is a + // new region for the first bundle). This makes it necessary to + // recalculate all dependencies. + // It is seldom that this needs to be done a second time after adding the + // initial bundle to the region. + if (ScheduleEnd != OldScheduleEnd) { + for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) + doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); }); + ReSchedule = true; + } + if (ReSchedule) { + resetSchedule(); + initialFillReadyList(ReadyInsts); + } + if (Bundle) { + LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle + << " in block " << BB->getName() << "\n"); + calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP); + } + + // Now try to schedule the new bundle or (if no bundle) just calculate + // dependencies. As soon as the bundle is "ready" it means that there are no + // cyclic dependencies and we can schedule it. Note that it's important that + // we don't "schedule" the bundle yet (see cancelScheduling). + while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) && + !ReadyInsts.empty()) { + ScheduleData *Picked = ReadyInsts.pop_back_val(); + if (Picked->isSchedulingEntity() && Picked->isReady()) + schedule(Picked, ReadyInsts); + } + }; + // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { - if (!extendSchedulingRegion(V, S)) + if (!extendSchedulingRegion(V, S)) { + // If the scheduling region got new instructions at the lower end (or it + // is a new region for the first bundle), it is necessary to recalculate + // all dependencies. Otherwise the compiler may crash trying to + // incorrectly calculate dependencies and emit instructions in the wrong + // order at the actual scheduling. + TryScheduleBundle(/*ReSchedule=*/false, nullptr); return None; + } } for (Value *V : VL) { @@ -5134,42 +6016,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, BundleMember->FirstInBundle = Bundle; PrevInBundle = BundleMember; } - if (ScheduleEnd != OldScheduleEnd) { - // The scheduling region got new instructions at the lower end (or it is a - // new region for the first bundle). This makes it necessary to - // recalculate all dependencies. - // It is seldom that this needs to be done a second time after adding the - // initial bundle to the region.
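The ready-list loop factored into the TryScheduleBundle lambda above can be summarized in isolation. A compact standalone sketch with simplified stand-ins for ScheduleData and the ready list (the ReSchedule handling is elided): keep popping ready items and scheduling them until the new bundle itself becomes ready or nothing is left.

#include <functional>
#include <vector>

struct Item {
  bool Ready = false;
  bool Scheduled = false;
};

void drainReadyList(std::vector<Item *> &ReadyList, Item *Bundle,
                    const std::function<void(Item *)> &Schedule) {
  // With no bundle, just drain the list; otherwise stop as soon as the
  // bundle becomes ready (meaning it has no cyclic dependencies).
  while ((!Bundle || !Bundle->Ready) && !ReadyList.empty()) {
    Item *Picked = ReadyList.back();
    ReadyList.pop_back();
    if (Picked->Ready && !Picked->Scheduled)
      Schedule(Picked); // may mark further items (incl. Bundle) ready
  }
}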
- for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - doForAllOpcodes(I, [](ScheduleData *SD) { - SD->clearDependencies(); - }); - } - ReSchedule = true; - } - if (ReSchedule) { - resetSchedule(); - initialFillReadyList(ReadyInsts); - } assert(Bundle && "Failed to find schedule bundle"); - - LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " - << BB->getName() << "\n"); - - calculateDependencies(Bundle, true, SLP); - - // Now try to schedule the new bundle. As soon as the bundle is "ready" it - // means that there are no cyclic dependencies and we can schedule it. - // Note that's important that we don't "schedule" the bundle yet (see - // cancelScheduling). - while (!Bundle->isReady() && !ReadyInsts.empty()) { - - ScheduleData *pickedSD = ReadyInsts.pop_back_val(); - - if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { - schedule(pickedSD, ReadyInsts); - } - } + TryScheduleBundle(ReSchedule, Bundle); if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); return None; @@ -5179,7 +6027,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL, Value *OpValue) { - if (isa<PHINode>(OpValue)) + if (isa<PHINode>(OpValue) || isa<InsertElementInst>(OpValue)) return; ScheduleData *Bundle = getScheduleData(OpValue); @@ -5219,7 +6067,8 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, return true; Instruction *I = dyn_cast<Instruction>(V); assert(I && "bundle member must be an instruction"); - assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled"); + assert(!isa<PHINode>(I) && !isa<InsertElementInst>(I) && + "phi nodes/insertelements don't need to be scheduled"); auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { ScheduleData *ISD = getScheduleData(I); if (!ISD) @@ -5252,41 +6101,39 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, BasicBlock::reverse_iterator UpperEnd = BB->rend(); BasicBlock::iterator DownIter = ScheduleEnd->getIterator(); BasicBlock::iterator LowerEnd = BB->end(); - while (true) { + while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I && + &*DownIter != I) { if (++ScheduleRegionSize > ScheduleRegionSizeLimit) { LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n"); return false; } - if (UpIter != UpperEnd) { - if (&*UpIter == I) { - initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); - ScheduleStart = I; - if (isOneOf(S, I) != I) - CheckSheduleForI(I); - LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I - << "\n"); - return true; - } - ++UpIter; - } - if (DownIter != LowerEnd) { - if (&*DownIter == I) { - initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, - nullptr); - ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); - assert(ScheduleEnd && "tried to vectorize a terminator?"); - LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I - << "\n"); - return true; - } - ++DownIter; - } - assert((UpIter != UpperEnd || DownIter != LowerEnd) && - "instruction not found in block"); + ++UpIter; + ++DownIter; } + if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) { + assert(I->getParent() == ScheduleStart->getParent() && + "Instruction is in wrong basic block."); + initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); + ScheduleStart = I; + if (isOneOf(S, I) != I) + CheckSheduleForI(I); + LLVM_DEBUG(dbgs() << "SLP: extend 
schedule region start to " << *I + << "\n"); + return true; + } + assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) && + "Expected to reach top of the basic block or instruction down the " + "lower end."); + assert(I->getParent() == ScheduleEnd->getParent() && + "Instruction is in wrong basic block."); + initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, + nullptr); + ScheduleEnd = I->getNextNode(); + if (isOneOf(S, I) != I) + CheckSheduleForI(I); + assert(ScheduleEnd && "tried to vectorize a terminator?"); + LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); return true; } @@ -5491,8 +6338,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { - assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + assert((isa<InsertElementInst>(SD->Inst) || + SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { @@ -5515,7 +6362,7 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { ScheduleData *BundleMember = picked; while (BundleMember) { Instruction *pickedInst = BundleMember->Inst; - if (LastScheduledInst->getNextNode() != pickedInst) { + if (pickedInst->getNextNode() != LastScheduledInst) { BS->BB->getInstList().remove(pickedInst); BS->BB->getInstList().insert(LastScheduledInst->getIterator(), pickedInst); @@ -5540,10 +6387,12 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { if (auto *Store = dyn_cast<StoreInst>(V)) { if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand())) return DL->getTypeSizeInBits(Trunc->getSrcTy()); - else - return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); + return DL->getTypeSizeInBits(Store->getValueOperand()->getType()); } + if (auto *IEI = dyn_cast<InsertElementInst>(V)) + return getVectorElementSize(IEI->getOperand(1)); + auto E = InstrElementSize.find(V); if (E != InstrElementSize.end()) return E->second; @@ -5552,53 +6401,58 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) { // that feed it. The type of the loaded value may indicate a more suitable // width than V's type. We want to base the vector element size on the width // of memory operations where possible. - SmallVector<Instruction *, 16> Worklist; + SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist; SmallPtrSet<Instruction *, 16> Visited; if (auto *I = dyn_cast<Instruction>(V)) { - Worklist.push_back(I); + Worklist.emplace_back(I, I->getParent()); Visited.insert(I); } // Traverse the expression tree in bottom-up order looking for loads. If we // encounter an instruction we don't yet handle, we give up. - auto MaxWidth = 0u; - auto FoundUnknownInst = false; - while (!Worklist.empty() && !FoundUnknownInst) { - auto *I = Worklist.pop_back_val(); + auto Width = 0u; + while (!Worklist.empty()) { + Instruction *I; + BasicBlock *Parent; + std::tie(I, Parent) = Worklist.pop_back_val(); // We should only be looking at scalar instructions here. If the current - // instruction has a vector type, give up. + // instruction has a vector type, skip. auto *Ty = I->getType(); if (isa<VectorType>(Ty)) - FoundUnknownInst = true; + continue; // If the current instruction is a load, update MaxWidth to reflect the // width of the loaded value. 
- else if (isa<LoadInst>(I)) - MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty)); + if (isa<LoadInst>(I) || isa<ExtractElementInst>(I) || + isa<ExtractValueInst>(I)) + Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty)); // Otherwise, we need to visit the operands of the instruction. We only // handle the interesting cases from buildTree here. If an operand is an - // instruction we haven't yet visited, we add it to the worklist. + // instruction we haven't yet visited and from the same basic block as the + // user or the use is a PHI node, we add it to the worklist. else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) || - isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) { + isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I) || + isa<UnaryOperator>(I)) { for (Use &U : I->operands()) if (auto *J = dyn_cast<Instruction>(U.get())) - if (Visited.insert(J).second) - Worklist.push_back(J); + if (Visited.insert(J).second && + (isa<PHINode>(I) || J->getParent() == Parent)) + Worklist.emplace_back(J, J->getParent()); + } else { + break; } - - // If we don't yet handle the instruction, give up. - else - FoundUnknownInst = true; } - int Width = MaxWidth; // If we didn't encounter a memory access in the expression tree, or if we // gave up for some reason, just return the width of V. Otherwise, return the // maximum width we found. - if (!MaxWidth || FoundUnknownInst) + if (!Width) { + if (auto *CI = dyn_cast<CmpInst>(V)) + V = CI->getOperand(0); Width = DL->getTypeSizeInBits(V->getType()); + } for (Instruction *I : Visited) InstrElementSize[I] = Width; @@ -5633,6 +6487,9 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr, break; case Instruction::ZExt: case Instruction::SExt: + if (isa<ExtractElementInst>(I->getOperand(0)) || + isa<InsertElementInst>(I->getOperand(0))) + return false; break; // We can demote certain binary operations if we can demote both of their @@ -5884,8 +6741,6 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A PreservedAnalyses PA; PA.preserveSet<CFGAnalyses>(); - PA.preserve<AAManager>(); - PA.preserve<GlobalsAA>(); return PA; } @@ -5929,6 +6784,9 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to // delete instructions. + // Update DFS numbers now so that we can use them for ordering. + DT->updateDFSNumbers(); + // Scan the blocks in the function in post order. for (auto BB : post_order(&F.getEntryBlock())) { collectSeedInstructions(BB); @@ -5960,6 +6818,44 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, return Changed; } +/// Order may have elements assigned a special value (the vector size) which is +/// out of bounds. Such indices only appear at positions which correspond to +/// undef values (see canReuseExtract for details) and are used to keep undef +/// values from affecting the operand ordering. +/// The first loop below simply finds all unused indices, and the next loop +/// nest assigns these indices to the positions of the undef values. +/// In the example below, Order has two undef positions, which are assigned the +/// values 3 and 7 respectively: +/// before: 6 9 5 4 9 2 1 0 +/// after: 6 3 5 4 7 2 1 0 +/// \returns Fixed ordering.
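Before the patch's implementation below, here is a compact standalone model of what fixupOrderingIndices computes, using std:: containers in place of the LLVM ADTs (illustrative only): out-of-bounds entries mark undef positions, and those positions receive, in increasing order, the indices that no real element claimed.

#include <vector>

std::vector<unsigned> fixupOrdering(std::vector<unsigned> Order) {
  const unsigned Sz = Order.size();
  std::vector<bool> Used(Sz, false);
  std::vector<unsigned> Masked; // positions holding out-of-bounds markers
  for (unsigned I = 0; I < Sz; ++I) {
    if (Order[I] < Sz)
      Used[Order[I]] = true;
    else
      Masked.push_back(I);
  }
  unsigned Next = 0;
  for (unsigned Pos : Masked) {
    while (Used[Next])
      ++Next; // advance to the next unclaimed index
    Order[Pos] = Next++;
  }
  return Order;
}
// {6, 9, 5, 4, 9, 2, 1, 0} -> {6, 3, 5, 4, 7, 2, 1, 0}, matching the
// before/after example in the doc comment above.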
+static BoUpSLP::OrdersType fixupOrderingIndices(ArrayRef<unsigned> Order) { + BoUpSLP::OrdersType NewOrder(Order.begin(), Order.end()); + const unsigned Sz = NewOrder.size(); + SmallBitVector UsedIndices(Sz); + SmallVector<int> MaskedIndices; + for (int I = 0, E = NewOrder.size(); I < E; ++I) { + if (NewOrder[I] < Sz) + UsedIndices.set(NewOrder[I]); + else + MaskedIndices.push_back(I); + } + if (MaskedIndices.empty()) + return NewOrder; + SmallVector<int> AvailableIndices(MaskedIndices.size()); + unsigned Cnt = 0; + int Idx = UsedIndices.find_first_unset(); + do { + AvailableIndices[Cnt] = Idx; + Idx = UsedIndices.find_next_unset(Idx); + ++Cnt; + } while (Idx > 0); + assert(Cnt == MaskedIndices.size() && "Non-synced masked/available indices."); + for (int I = 0, E = MaskedIndices.size(); I < E; ++I) + NewOrder[MaskedIndices[I]] = AvailableIndices[I]; + return NewOrder; +} + bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, unsigned Idx) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() @@ -5979,9 +6875,9 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R, // TODO: Handle orders of size less than number of elements in the vector. if (Order && Order->size() == Chain.size()) { // TODO: reorder tree nodes without tree rebuilding. - SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend()); - llvm::transform(*Order, ReorderedOps.begin(), - [Chain](const unsigned Idx) { return Chain[Idx]; }); + SmallVector<Value *, 4> ReorderedOps(Chain.size()); + transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), + [Chain](const unsigned Idx) { return Chain[Idx]; }); R.buildTree(ReorderedOps); } if (R.isTreeTinyAndNotFullyVectorizable()) @@ -6021,20 +6917,42 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, int E = Stores.size(); SmallBitVector Tails(E, false); - SmallVector<int, 16> ConsecutiveChain(E, E + 1); int MaxIter = MaxStoreLookup.getValue(); + SmallVector<std::pair<int, int>, 16> ConsecutiveChain( + E, std::make_pair(E, INT_MAX)); + SmallVector<SmallBitVector, 4> CheckedPairs(E, SmallBitVector(E, false)); int IterCnt; auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter, + &CheckedPairs, &ConsecutiveChain](int K, int Idx) { if (IterCnt >= MaxIter) return true; + if (CheckedPairs[Idx].test(K)) + return ConsecutiveChain[K].second == 1 && + ConsecutiveChain[K].first == Idx; ++IterCnt; + CheckedPairs[Idx].set(K); + CheckedPairs[K].set(Idx); + Optional<int> Diff = getPointersDiff( + Stores[K]->getValueOperand()->getType(), Stores[K]->getPointerOperand(), + Stores[Idx]->getValueOperand()->getType(), + Stores[Idx]->getPointerOperand(), *DL, *SE, /*StrictCheck=*/true); + if (!Diff || *Diff == 0) + return false; + int Val = *Diff; + if (Val < 0) { + if (ConsecutiveChain[Idx].second > -Val) { + Tails.set(K); + ConsecutiveChain[Idx] = std::make_pair(K, -Val); + } + return false; + } + if (ConsecutiveChain[K].second <= Val) return false; Tails.set(Idx); - ConsecutiveChain[K] = Idx; - return true; + ConsecutiveChain[K] = std::make_pair(Idx, Val); + return Val == 1; }; // Do a quadratic search on all of the given stores in reverse order and find // all of the pairs of stores that follow each other. @@ -6051,32 +6969,51 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores, break; } + // Tracks if we tried to vectorize stores starting from the given tail + // already.
+ SmallBitVector TriedTails(E, false); // For stores that start but don't end a link in the chain: for (int Cnt = E; Cnt > 0; --Cnt) { int I = Cnt - 1; - if (ConsecutiveChain[I] == E + 1 || Tails.test(I)) + if (ConsecutiveChain[I].first == E || Tails.test(I)) continue; // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. BoUpSLP::ValueList Operands; // Collect the chain into a list. - while (I != E + 1 && !VectorizedStores.count(Stores[I])) { + while (I != E && !VectorizedStores.count(Stores[I])) { Operands.push_back(Stores[I]); + Tails.set(I); + if (ConsecutiveChain[I].second != 1) { + // Mark the new end in the chain and go back, if required. It might be + // required if the original stores come in reversed order, for example. + if (ConsecutiveChain[I].first != E && + Tails.test(ConsecutiveChain[I].first) && !TriedTails.test(I) && + !VectorizedStores.count(Stores[ConsecutiveChain[I].first])) { + TriedTails.set(I); + Tails.reset(ConsecutiveChain[I].first); + if (Cnt < ConsecutiveChain[I].first + 2) + Cnt = ConsecutiveChain[I].first + 2; + } + break; + } // Move to the next value in the chain. - I = ConsecutiveChain[I]; + I = ConsecutiveChain[I].first; } + assert(!Operands.empty() && "Expected non-empty list of stores."); - // If a vector register can't hold 1 element, we are done. unsigned MaxVecRegSize = R.getMaxVecRegSize(); unsigned EltSize = R.getVectorElementSize(Operands[0]); - if (MaxVecRegSize % EltSize != 0) - continue; + unsigned MaxElts = llvm::PowerOf2Floor(MaxVecRegSize / EltSize); + + unsigned MinVF = std::max(2U, R.getMinVecRegSize() / EltSize); + unsigned MaxVF = std::min(R.getMaximumVF(EltSize, Instruction::Store), + MaxElts); - unsigned MaxElts = MaxVecRegSize / EltSize; // FIXME: Is division-by-2 the correct step? Should we assert that the // register size is a power-of-2? unsigned StartIdx = 0; - for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) { + for (unsigned Size = MaxVF; Size >= MinVF; Size /= 2) { for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size); if (!VectorizedStores.count(Slice.front()) && @@ -6146,8 +7083,7 @@ bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { } bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, - bool AllowReorder, - ArrayRef<Value *> InsertUses) { + bool AllowReorder) { if (VL.size() < 2) return false; @@ -6165,7 +7101,7 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, // determining vectorization factor for scalar instructions. for (Value *V : VL) { Type *Ty = V->getType(); - if (!isValidElementType(Ty)) { + if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) { // NOTE: the following will give user internal llvm type name, which may // not be useful. 
R.getORE()->emit([&]() { @@ -6196,20 +7132,16 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool Changed = false; bool CandidateFound = false; InstructionCost MinCost = SLPCostThreshold.getValue(); - - bool CompensateUseCost = - !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) { - return V && isa<InsertElementInst>(V); - }); - assert((!CompensateUseCost || InsertUses.size() == VL.size()) && - "Each scalar expected to have an associated InsertElement user."); + Type *ScalarTy = VL[0]->getType(); + if (auto *IE = dyn_cast<InsertElementInst>(VL[0])) + ScalarTy = IE->getOperand(1)->getType(); unsigned NextInst = 0, MaxInst = VL.size(); for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). - auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF); + auto *VecTy = FixedVectorType::get(ScalarTy, VF); if (TTI->getNumberOfParts(VecTy) == VF) continue; for (unsigned I = NextInst; I < MaxInst; ++I) { @@ -6220,7 +7152,10 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, else OpsWidth = VF; - if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) + if (!isPowerOf2_32(OpsWidth)) + continue; + + if ((VF > MinVF && OpsWidth <= VF / 2) || (VF == MinVF && OpsWidth < 2)) break; ArrayRef<Value *> Ops = VL.slice(I, OpsWidth); @@ -6235,17 +7170,15 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, << "\n"); R.buildTree(Ops); - Optional<ArrayRef<unsigned>> Order = R.bestOrder(); - // TODO: check if we can allow reordering for more cases. - if (AllowReorder && Order) { - // TODO: reorder tree nodes without tree rebuilding. - // Conceptually, there is nothing actually preventing us from trying to - // reorder a larger list. In fact, we do exactly this when vectorizing - // reductions. However, at this point, we only expect to get here when - // there are exactly two operations. - assert(Ops.size() == 2); - Value *ReorderedOps[] = {Ops[1], Ops[0]}; - R.buildTree(ReorderedOps, None); + if (AllowReorder) { + Optional<ArrayRef<unsigned>> Order = R.bestOrder(); + if (Order) { + // TODO: reorder tree nodes without tree rebuilding. + SmallVector<Value *, 4> ReorderedOps(Ops.size()); + transform(fixupOrderingIndices(*Order), ReorderedOps.begin(), + [Ops](const unsigned Idx) { return Ops[Idx]; }); + R.buildTree(ReorderedOps); + } } if (R.isTreeTinyAndNotFullyVectorizable()) continue; @@ -6253,46 +7186,6 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, R.computeMinimumValueSizes(); InstructionCost Cost = R.getTreeCost(); CandidateFound = true; - if (CompensateUseCost) { - // TODO: Use TTI's getScalarizationOverhead for sequence of inserts - // rather than sum of single inserts as the latter may overestimate - // cost. This work should imply improving cost estimation for extracts - // that added in for external (for vectorization tree) users,i.e. that - // part should also switch to same interface. 
- // For example, the following case is projected code after SLP: - // %4 = extractelement <4 x i64> %3, i32 0 - // %v0 = insertelement <4 x i64> poison, i64 %4, i32 0 - // %5 = extractelement <4 x i64> %3, i32 1 - // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1 - // %6 = extractelement <4 x i64> %3, i32 2 - // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2 - // %7 = extractelement <4 x i64> %3, i32 3 - // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3 - // - // Extracts here added by SLP in order to feed users (the inserts) of - // original scalars and contribute to "ExtractCost" at cost evaluation. - // The inserts in turn form sequence to build an aggregate that - // detected by findBuildAggregate routine. - // SLP makes an assumption that such sequence will be optimized away - // later (instcombine) so it tries to compensate ExctractCost with - // cost of insert sequence. - // Current per element cost calculation approach is not quite accurate - // and tends to create bias toward favoring vectorization. - // Switching to the TTI interface might help a bit. - // Alternative solution could be pattern-match to detect a no-op or - // shuffle. - InstructionCost UserCost = 0; - for (unsigned Lane = 0; Lane < OpsWidth; Lane++) { - auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]); - if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) - UserCost += TTI->getVectorInstrCost( - Instruction::InsertElement, IE->getType(), CI->getZExtValue()); - } - LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost - << ".\n"); - Cost -= UserCost; - } - MinCost = std::min(MinCost, Cost); if (Cost < -SLPCostThreshold) { @@ -6411,11 +7304,29 @@ class HorizontalReduction { /// The type of reduction operation. RecurKind RdxKind; + const unsigned INVALID_OPERAND_INDEX = std::numeric_limits<unsigned>::max(); + + static bool isCmpSelMinMax(Instruction *I) { + return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) && + RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I)); + } + + // And/or are potentially poison-safe logical patterns like: + // select x, y, false + // select x, true, y + static bool isBoolLogicOp(Instruction *I) { + return match(I, m_LogicalAnd(m_Value(), m_Value())) || + match(I, m_LogicalOr(m_Value(), m_Value())); + } + /// Checks if instruction is associative and can be vectorized. static bool isVectorizable(RecurKind Kind, Instruction *I) { if (Kind == RecurKind::None) return false; - if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind)) + + // Integer ops that map to select instructions or intrinsics are fine. + if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) || + isBoolLogicOp(I)) return true; if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) { @@ -6428,6 +7339,16 @@ class HorizontalReduction { return I->isAssociative(); } + static Value *getRdxOperand(Instruction *I, unsigned Index) { + // Poison-safe 'or' takes the form: select X, true, Y + // To make that work with the normal operand processing, we skip the + // true value operand. + // TODO: Change the code and data structures to handle this without a hack. + if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1) + return I->getOperand(2); + return I->getOperand(Index); + } + /// Checks if the ParentStackElem.first should be marked as a reduction /// operation with an extra argument or as extra argument itself. void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem, @@ -6440,8 +7361,7 @@ class HorizontalReduction { // in this case. 
// Do not perform analysis of remaining operands of ParentStackElem.first // instruction, this whole instruction is an extra argument. - RecurKind ParentRdxKind = getRdxKind(ParentStackElem.first); - ParentStackElem.second = getNumberOfOperands(ParentRdxKind); + ParentStackElem.second = INVALID_OPERAND_INDEX; } else { // We ran into something like: // ParentStackElem.first += ... + ExtraArg + ... @@ -6451,7 +7371,7 @@ class HorizontalReduction { /// Creates reduction operation with the current opcode. static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS, - Value *RHS, const Twine &Name) { + Value *RHS, const Twine &Name, bool UseSelect) { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind); switch (Kind) { case RecurKind::Add: @@ -6467,23 +7387,30 @@ class HorizontalReduction { return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS); case RecurKind::FMin: return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS); - - case RecurKind::SMax: { - Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - case RecurKind::SMin: { - Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - case RecurKind::UMax: { - Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } - case RecurKind::UMin: { - Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); - return Builder.CreateSelect(Cmp, LHS, RHS, Name); - } + case RecurKind::SMax: + if (UseSelect) { + Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS); + case RecurKind::SMin: + if (UseSelect) { + Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS); + case RecurKind::UMax: + if (UseSelect) { + Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS); + case RecurKind::UMin: + if (UseSelect) { + Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name); + return Builder.CreateSelect(Cmp, LHS, RHS, Name); + } + return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS); default: llvm_unreachable("Unknown reduction operation."); } @@ -6494,26 +7421,30 @@ class HorizontalReduction { static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS, Value *RHS, const Twine &Name, const ReductionOpsListType &ReductionOps) { - Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name); + bool UseSelect = ReductionOps.size() == 2; + assert((!UseSelect || isa<SelectInst>(ReductionOps[1][0])) && + "Expected cmp + select pairs for reduction"); + Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect); if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) { - if (auto *Sel = dyn_cast<SelectInst>(Op)) + if (auto *Sel = dyn_cast<SelectInst>(Op)) { propagateIRFlags(Sel->getCondition(), ReductionOps[0]); - propagateIRFlags(Op, ReductionOps[1]); - return Op; + propagateIRFlags(Op, ReductionOps[1]); + return Op; + } } propagateIRFlags(Op, ReductionOps[0]); return Op; } + /// Creates reduction operation with the current opcode with the IR flags /// from \p I. 
   static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
                          Value *RHS, const Twine &Name, Instruction *I) {
-    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name);
-    if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
-      if (auto *Sel = dyn_cast<SelectInst>(Op)) {
-        propagateIRFlags(Sel->getCondition(),
-                         cast<SelectInst>(I)->getCondition());
-      }
+    auto *SelI = dyn_cast<SelectInst>(I);
+    Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, SelI != nullptr);
+    if (SelI && RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
+      if (auto *Sel = dyn_cast<SelectInst>(Op))
+        propagateIRFlags(Sel->getCondition(), SelI->getCondition());
     }
     propagateIRFlags(Op, I);
     return Op;
@@ -6526,9 +7457,11 @@ class HorizontalReduction {
       return RecurKind::Add;
     if (match(I, m_Mul(m_Value(), m_Value())))
       return RecurKind::Mul;
-    if (match(I, m_And(m_Value(), m_Value())))
+    if (match(I, m_And(m_Value(), m_Value())) ||
+        match(I, m_LogicalAnd(m_Value(), m_Value())))
       return RecurKind::And;
-    if (match(I, m_Or(m_Value(), m_Value())))
+    if (match(I, m_Or(m_Value(), m_Value())) ||
+        match(I, m_LogicalOr(m_Value(), m_Value())))
       return RecurKind::Or;
     if (match(I, m_Xor(m_Value(), m_Value())))
       return RecurKind::Xor;
@@ -6542,6 +7475,10 @@ class HorizontalReduction {
     if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
       return RecurKind::FMin;
 
+    // This matches either cmp+select or intrinsics. SLP is expected to handle
+    // either form.
+    // TODO: If we are canonicalizing to intrinsics, we can remove several
+    // special-case paths that deal with selects.
     if (match(I, m_SMax(m_Value(), m_Value())))
       return RecurKind::SMax;
     if (match(I, m_SMin(m_Value(), m_Value())))
       return RecurKind::SMin;
@@ -6610,61 +7547,52 @@ class HorizontalReduction {
     return RecurKind::None;
   }
 
-  /// Return true if this operation is a cmp+select idiom.
-  static bool isCmpSel(RecurKind Kind) {
-    return RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind);
-  }
-
   /// Get the index of the first operand.
-  static unsigned getFirstOperandIndex(RecurKind Kind) {
-    // We allow calling this before 'Kind' is set, so handle that specially.
-    if (Kind == RecurKind::None)
-      return 0;
-    return isCmpSel(Kind) ? 1 : 0;
+  static unsigned getFirstOperandIndex(Instruction *I) {
+    return isCmpSelMinMax(I) ? 1 : 0;
   }
 
   /// Total number of operands in the reduction operation.
-  static unsigned getNumberOfOperands(RecurKind Kind) {
-    return isCmpSel(Kind) ? 3 : 2;
+  static unsigned getNumberOfOperands(Instruction *I) {
+    return isCmpSelMinMax(I) ? 3 : 2;
  }
 
   /// Checks if the instruction is in basic block \p BB.
-  /// For a min/max reduction check that both compare and select are in \p BB.
-  static bool hasSameParent(RecurKind Kind, Instruction *I, BasicBlock *BB,
-                            bool IsRedOp) {
-    if (IsRedOp && isCmpSel(Kind)) {
-      auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
-      return I->getParent() == BB && Cmp && Cmp->getParent() == BB;
+  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
+  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
+    if (isCmpSelMinMax(I)) {
+      auto *Sel = cast<SelectInst>(I);
+      auto *Cmp = cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp->getParent() == BB;
     }
     return I->getParent() == BB;
   }
 
   /// Expected number of uses for reduction operations/reduced values.
-  static bool hasRequiredNumberOfUses(RecurKind Kind, Instruction *I,
-                                      bool IsReductionOp) {
-    // SelectInst must be used twice while the condition op must have single
-    // use only.
-    if (isCmpSel(Kind))
-      return I->hasNUses(2) &&
-             (!IsReductionOp ||
-              cast<SelectInst>(I)->getCondition()->hasOneUse());
+  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
+    if (IsCmpSelMinMax) {
+      // The SelectInst must be used twice, while the condition op must have a
+      // single use only.
+      if (auto *Sel = dyn_cast<SelectInst>(I))
+        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
+      return I->hasNUses(2);
+    }
 
     // Arithmetic reduction operation must be used once only.
     return I->hasOneUse();
   }
 
   /// Initializes the list of reduction operations.
-  void initReductionOps(RecurKind Kind) {
-    if (isCmpSel(Kind))
+  void initReductionOps(Instruction *I) {
+    if (isCmpSelMinMax(I))
       ReductionOps.assign(2, ReductionOpsType());
     else
       ReductionOps.assign(1, ReductionOpsType());
   }
 
   /// Add all reduction operations for the reduction instruction \p I.
-  void addReductionOps(RecurKind Kind, Instruction *I) {
-    assert(Kind != RecurKind::None && "Expected reduction operation.");
-    if (isCmpSel(Kind)) {
+  void addReductionOps(Instruction *I) {
+    if (isCmpSelMinMax(I)) {
       ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
       ReductionOps[1].emplace_back(I);
     } else {
@@ -6675,53 +7603,61 @@ class HorizontalReduction {
   static Value *getLHS(RecurKind Kind, Instruction *I) {
     if (Kind == RecurKind::None)
       return nullptr;
-    return I->getOperand(getFirstOperandIndex(Kind));
+    return I->getOperand(getFirstOperandIndex(I));
   }
   static Value *getRHS(RecurKind Kind, Instruction *I) {
     if (Kind == RecurKind::None)
       return nullptr;
-    return I->getOperand(getFirstOperandIndex(Kind) + 1);
+    return I->getOperand(getFirstOperandIndex(I) + 1);
   }
 
 public:
   HorizontalReduction() = default;
 
   /// Try to find a reduction tree.
-  bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
-    assert((!Phi || is_contained(Phi->operands(), B)) &&
+  bool matchAssociativeReduction(PHINode *Phi, Instruction *Inst) {
+    assert((!Phi || is_contained(Phi->operands(), Inst)) &&
            "Phi needs to use the binary operator");
-
-    RdxKind = getRdxKind(B);
+    assert((isa<BinaryOperator>(Inst) || isa<SelectInst>(Inst) ||
+            isa<IntrinsicInst>(Inst)) &&
+           "Expected binop, select, or intrinsic for reduction matching");
+    RdxKind = getRdxKind(Inst);
 
     // We could have an initial reduction that is not an add.
     //  r *= v1 + v2 + v3 + v4
     // In such a case start looking for a tree rooted in the first '+'.
     if (Phi) {
-      if (getLHS(RdxKind, B) == Phi) {
+      if (getLHS(RdxKind, Inst) == Phi) {
         Phi = nullptr;
-        B = dyn_cast<Instruction>(getRHS(RdxKind, B));
-        if (!B)
+        Inst = dyn_cast<Instruction>(getRHS(RdxKind, Inst));
+        if (!Inst)
           return false;
-        RdxKind = getRdxKind(B);
-      } else if (getRHS(RdxKind, B) == Phi) {
+        RdxKind = getRdxKind(Inst);
+      } else if (getRHS(RdxKind, Inst) == Phi) {
         Phi = nullptr;
-        B = dyn_cast<Instruction>(getLHS(RdxKind, B));
-        if (!B)
+        Inst = dyn_cast<Instruction>(getLHS(RdxKind, Inst));
+        if (!Inst)
          return false;
-        RdxKind = getRdxKind(B);
+        RdxKind = getRdxKind(Inst);
       }
     }
 
-    if (!isVectorizable(RdxKind, B))
+    if (!isVectorizable(RdxKind, Inst))
       return false;
 
     // Analyze "regular" integer/FP types for reductions - no target-specific
     // types or pointers.
-    Type *Ty = B->getType();
+    Type *Ty = Inst->getType();
     if (!isValidElementType(Ty) || Ty->isPointerTy())
       return false;
 
-    ReductionRoot = B;
+    // Though the ultimate reduction may have multiple uses, its condition
+    // must have only a single use.
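+    // Otherwise the scalar compare would keep extra users that the
+    // vectorized reduction does not re-wire (the RAUW of the scalar compare
+    // is removed further down).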
+    if (auto *Sel = dyn_cast<SelectInst>(Inst))
+      if (!Sel->getCondition()->hasOneUse())
+        return false;
+
+    ReductionRoot = Inst;
 
     // The opcode for leaf values that we perform a reduction on.
     // For example: load(x) + load(y) + load(z) + fptoui(w)
@@ -6729,11 +7665,11 @@ public:
     // potential candidate for the reduction.
     unsigned LeafOpcode = 0;
 
-    // Post order traverse the reduction tree starting at B. We only handle true
-    // trees containing only binary operators.
+    // Post-order traverse the reduction tree starting at Inst. We only handle
+    // true trees containing binary operators or selects.
     SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
-    Stack.push_back(std::make_pair(B, getFirstOperandIndex(RdxKind)));
-    initReductionOps(RdxKind);
+    Stack.push_back(std::make_pair(Inst, getFirstOperandIndex(Inst)));
+    initReductionOps(Inst);
     while (!Stack.empty()) {
       Instruction *TreeN = Stack.back().first;
       unsigned EdgeToVisit = Stack.back().second++;
       RecurKind TreeRdxKind = getRdxKind(TreeN);
       bool IsReducedValue = TreeRdxKind != RdxKind;
 
       // Postorder visit.
-      if (IsReducedValue || EdgeToVisit == getNumberOfOperands(TreeRdxKind)) {
+      if (IsReducedValue || EdgeToVisit >= getNumberOfOperands(TreeN)) {
         if (IsReducedValue)
           ReducedVals.push_back(TreeN);
         else {
-          auto I = ExtraArgs.find(TreeN);
-          if (I != ExtraArgs.end() && !I->second) {
+          auto ExtraArgsIter = ExtraArgs.find(TreeN);
+          if (ExtraArgsIter != ExtraArgs.end() && !ExtraArgsIter->second) {
             // Check if TreeN is an extra argument of its parent operation.
             if (Stack.size() <= 1) {
               // TreeN can't be an extra argument as it is a root reduction
@@ -6759,23 +7695,23 @@ public:
             markExtraArg(Stack[Stack.size() - 2], TreeN);
             ExtraArgs.erase(TreeN);
           } else
-            addReductionOps(RdxKind, TreeN);
+            addReductionOps(TreeN);
         }
         // Retract.
         Stack.pop_back();
         continue;
       }
 
-      // Visit left or right.
-      Value *EdgeVal = TreeN->getOperand(EdgeToVisit);
-      auto *I = dyn_cast<Instruction>(EdgeVal);
-      if (!I) {
+      // Visit operands.
+      Value *EdgeVal = getRdxOperand(TreeN, EdgeToVisit);
+      auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
+      if (!EdgeInst) {
         // Edge value is not a reduction instruction or a leaf instruction.
         // (It may be a constant, function argument, or something else.)
         markExtraArg(Stack.back(), EdgeVal);
         continue;
       }
-      RecurKind EdgeRdxKind = getRdxKind(I);
+      RecurKind EdgeRdxKind = getRdxKind(EdgeInst);
       // Continue analysis if the next operand is a reduction operation or
       // (possibly) a leaf value. If the leaf value opcode is not set,
       // the first met operation != reduction operation is considered as the
@@ -6784,25 +7720,26 @@ public:
       // Each tree node needs to have a minimal number of users except for the
       // ultimate reduction.
       const bool IsRdxInst = EdgeRdxKind == RdxKind;
-      if (I != Phi && I != B &&
-          hasSameParent(RdxKind, I, B->getParent(), IsRdxInst) &&
-          hasRequiredNumberOfUses(RdxKind, I, IsRdxInst) &&
-          (!LeafOpcode || LeafOpcode == I->getOpcode() || IsRdxInst)) {
+      if (EdgeInst != Phi && EdgeInst != Inst &&
+          hasSameParent(EdgeInst, Inst->getParent()) &&
+          hasRequiredNumberOfUses(isCmpSelMinMax(Inst), EdgeInst) &&
+          (!LeafOpcode || LeafOpcode == EdgeInst->getOpcode() || IsRdxInst)) {
         if (IsRdxInst) {
           // We need to be able to reassociate the reduction operations.
-          if (!isVectorizable(EdgeRdxKind, I)) {
+          if (!isVectorizable(EdgeRdxKind, EdgeInst)) {
            // EdgeInst is an extra argument for TreeN (its parent operation).
-            markExtraArg(Stack.back(), I);
+            markExtraArg(Stack.back(), EdgeInst);
             continue;
           }
         } else if (!LeafOpcode) {
-          LeafOpcode = I->getOpcode();
+          LeafOpcode = EdgeInst->getOpcode();
         }
-        Stack.push_back(std::make_pair(I, getFirstOperandIndex(EdgeRdxKind)));
+        Stack.push_back(
+            std::make_pair(EdgeInst, getFirstOperandIndex(EdgeInst)));
         continue;
       }
       // EdgeInst is an extra argument for TreeN (its parent operation).
-      markExtraArg(Stack.back(), I);
+      markExtraArg(Stack.back(), EdgeInst);
     }
     return true;
   }
@@ -6896,8 +7833,8 @@ public:
                "instructions.");
         // TODO: reorder tree nodes without tree rebuilding.
         SmallVector<Value *, 4> ReorderedOps(VL.size());
-        llvm::transform(*Order, ReorderedOps.begin(),
-                        [VL](const unsigned Idx) { return VL[Idx]; });
+        transform(fixupOrderingIndices(*Order), ReorderedOps.begin(),
+                  [VL](const unsigned Idx) { return VL[Idx]; });
         V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
       }
       if (V.isTreeTinyAndNotFullyVectorizable())
@@ -6905,12 +7842,21 @@ public:
       if (V.isLoadCombineReductionCandidate(RdxKind))
         break;
 
+      // For a poison-safe boolean logic reduction, do not replace select
+      // instructions with logic ops. All reduced values will be frozen (see
+      // below) to prevent leaking poison.
+      if (isa<SelectInst>(ReductionRoot) &&
+          isBoolLogicOp(cast<Instruction>(ReductionRoot)) &&
+          NumReducedVals != ReduxWidth)
+        break;
+
       V.computeMinimumValueSizes();
 
       // Estimate cost.
-      InstructionCost TreeCost = V.getTreeCost();
+      InstructionCost TreeCost =
+          V.getTreeCost(makeArrayRef(&ReducedVals[i], ReduxWidth));
       InstructionCost ReductionCost =
-          getReductionCost(TTI, ReducedVals[i], ReduxWidth);
+          getReductionCost(TTI, ReducedVals[i], ReduxWidth, RdxFMF);
       InstructionCost Cost = TreeCost + ReductionCost;
       if (!Cost.isValid()) {
         LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
@@ -6945,11 +7891,16 @@ public:
       // Emit a reduction. If the root is a select (min/max idiom), the insert
       // point is the compare condition of that select.
       Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
-      if (isCmpSel(RdxKind))
+      if (isCmpSelMinMax(RdxRootInst))
         Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
       else
         Builder.SetInsertPoint(RdxRootInst);
 
+      // To prevent poison from leaking across what used to be sequential, safe,
+      // scalar boolean logic operations, the reduction operand must be frozen.
+      if (isa<SelectInst>(RdxRootInst) && isBoolLogicOp(RdxRootInst))
+        VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
+
       Value *ReducedSubTree =
           emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
@@ -6983,17 +7934,6 @@ public:
        }
      }
 
-    // Update users. For a min/max reduction that ends with a compare and
-    // select, we also have to RAUW for the compare instruction feeding the
-    // reduction root. That's because the original compare may have extra uses
-    // besides the final select of the reduction.
-    if (isCmpSel(RdxKind)) {
-      if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) {
-        Instruction *ScalarCmp =
-            getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot));
-        ScalarCmp->replaceAllUsesWith(VecSelect->getCondition());
-      }
-    }
     ReductionRoot->replaceAllUsesWith(VectorizedTree);
 
     // Mark all scalar reduction ops for deletion, they are replaced by the
@@ -7008,8 +7948,8 @@ private:
   /// Calculate the cost of a reduction.
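+  /// \p ReduxWidth is the number of scalar values being reduced; \p FMF is
+  /// forwarded to the TTI arithmetic-reduction cost query.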
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
-                                   Value *FirstReducedVal,
-                                   unsigned ReduxWidth) {
+                                   Value *FirstReducedVal, unsigned ReduxWidth,
+                                   FastMathFlags FMF) {
     Type *ScalarTy = FirstReducedVal->getType();
     FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
     InstructionCost VectorCost, ScalarCost;
@@ -7022,17 +7962,15 @@ private:
     case RecurKind::FAdd:
     case RecurKind::FMul: {
       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
-      VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
-                                                   /*IsPairwiseForm=*/false);
+      VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF);
       ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
       break;
     }
     case RecurKind::FMax:
     case RecurKind::FMin: {
       auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
-      VectorCost =
-          TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
-                                      /*pairwise=*/false, /*unsigned=*/false);
+      VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
                                                /*unsigned=*/false);
       ScalarCost =
           TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
          TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
@@ -7046,9 +7984,7 @@ private:
       auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
       bool IsUnsigned =
           RdxKind == RecurKind::UMax || RdxKind == RecurKind::UMin;
-      VectorCost =
-          TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
-                                      /*IsPairwiseForm=*/false, IsUnsigned);
+      VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy, IsUnsigned);
       ScalarCost =
           TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
          TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
@@ -7109,36 +8045,6 @@ static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
   } while (true);
 }
 
-static Optional<unsigned> getOperandIndex(Instruction *InsertInst,
-                                          unsigned OperandOffset) {
-  unsigned OperandIndex = OperandOffset;
-  if (auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
-    if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
-      auto *VT = cast<FixedVectorType>(IE->getType());
-      OperandIndex *= VT->getNumElements();
-      OperandIndex += CI->getZExtValue();
-      return OperandIndex;
-    }
-    return None;
-  }
-
-  auto *IV = cast<InsertValueInst>(InsertInst);
-  Type *CurrentType = IV->getType();
-  for (unsigned int Index : IV->indices()) {
-    if (auto *ST = dyn_cast<StructType>(CurrentType)) {
-      OperandIndex *= ST->getNumElements();
-      CurrentType = ST->getElementType(Index);
-    } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
-      OperandIndex *= AT->getNumElements();
-      CurrentType = AT->getElementType();
-    } else {
-      return None;
-    }
-    OperandIndex += Index;
-  }
-  return OperandIndex;
-}
-
 static bool findBuildAggregate_rec(Instruction *LastInsertInst,
                                    TargetTransformInfo *TTI,
                                    SmallVectorImpl<Value *> &BuildVectorOpds,
@@ -7146,8 +8052,7 @@ static bool findBuildAggregate_rec(Instruction *LastInsertInst,
                                    unsigned OperandOffset) {
   do {
     Value *InsertedOperand = LastInsertInst->getOperand(1);
-    Optional<unsigned> OperandIndex =
-        getOperandIndex(LastInsertInst, OperandOffset);
+    Optional<int> OperandIndex = getInsertIndex(LastInsertInst, OperandOffset);
     if (!OperandIndex)
       return false;
     if (isa<InsertElementInst>(InsertedOperand) ||
@@ -7159,14 +8064,12 @@ static bool findBuildAggregate_rec(Instruction *LastInsertInst,
       BuildVectorOpds[*OperandIndex] = InsertedOperand;
       InsertElts[*OperandIndex] = LastInsertInst;
     }
-    if (isa<UndefValue>(LastInsertInst->getOperand(0)))
-      return true;
     LastInsertInst =
        dyn_cast<Instruction>(LastInsertInst->getOperand(0));
   } while (LastInsertInst != nullptr &&
            (isa<InsertValueInst>(LastInsertInst) ||
             isa<InsertElementInst>(LastInsertInst)) &&
            LastInsertInst->hasOneUse());
-  return false;
+  return true;
 }
 
 /// Recognize construction of vectors like
@@ -7212,10 +8115,6 @@ static bool findBuildAggregate(Instruction *LastInsertInst,
   return false;
 }
 
-static bool PhiTypeSorterFunc(Value *V, Value *V2) {
-  return V->getType() < V2->getType();
-}
-
 /// Try and get a reduction value from a phi node.
 ///
 /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
@@ -7274,6 +8173,14 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
     return true;
   if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
     return true;
+  if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
+    return true;
   return false;
 }
 
@@ -7309,6 +8216,9 @@ static bool tryToVectorizeHorReductionOrInstOperands(
   // horizontal reduction.
   // Interrupt the process if the Root instruction itself was vectorized or all
   // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
+  // Skip the analysis of CmpInsts. The compiler implements a post-analysis of
+  // the CmpInsts, so we can skip extra attempts in
+  // tryToVectorizeHorReductionOrInstOperands and save compile time.
   SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
   SmallPtrSet<Value *, 8> VisitedInstrs;
   bool Res = false;
@@ -7316,6 +8226,11 @@ static bool tryToVectorizeHorReductionOrInstOperands(
     Instruction *Inst;
     unsigned Level;
     std::tie(Inst, Level) = Stack.pop_back_val();
+    // Do not try to analyze an instruction that has already been vectorized.
+    // This may happen when we vectorize instruction operands on a previous
+    // iteration, while the stack was populated before that happened.
+    if (R.isDeleted(Inst))
+      continue;
     Value *B0, *B1;
     bool IsBinop = matchRdxBop(Inst, B0, B1);
     bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
@@ -7345,7 +8260,8 @@ static bool tryToVectorizeHorReductionOrInstOperands(
       // Set P to nullptr to avoid re-analysis of phi node in
       // matchAssociativeReduction function unless this is the root node.
       P = nullptr;
-      if (Vectorize(Inst, R)) {
+      // Do not try to vectorize CmpInst operands; this is done separately.
+      if (!isa<CmpInst>(Inst) && Vectorize(Inst, R)) {
         Res = true;
         continue;
       }
@@ -7357,7 +8273,10 @@ static bool tryToVectorizeHorReductionOrInstOperands(
     for (auto *Op : Inst->operand_values())
       if (VisitedInstrs.insert(Op).second)
         if (auto *I = dyn_cast<Instruction>(Op))
-          if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB)
+          // Do not try to vectorize CmpInst operands; this is done
+          // separately.
+          if (!isa<PHINode>(I) && !isa<CmpInst>(I) && !R.isDeleted(I) &&
+              I->getParent() == BB)
             Stack.emplace_back(I, Level);
   }
   return Res;
@@ -7394,42 +8313,29 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
   // Aggregate value is unlikely to be processed in a vector register; we need
   // to extract scalars into scalar registers, so NeedExtraction is set true.
-  return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
-                            BuildVectorInsts);
+  return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false);
 }
 
 bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
                                                    BasicBlock *BB, BoUpSLP &R) {
   SmallVector<Value *, 16> BuildVectorInsts;
   SmallVector<Value *, 16> BuildVectorOpds;
+  SmallVector<int> Mask;
   if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
       (llvm::all_of(BuildVectorOpds,
                     [](Value *V) { return isa<ExtractElementInst>(V); }) &&
-       isShuffle(BuildVectorOpds)))
+       isShuffle(BuildVectorOpds, Mask)))
     return false;
 
-  // Vectorize starting with the build vector operands ignoring the BuildVector
-  // instructions for the purpose of scheduling and user extraction.
-  return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
-                            BuildVectorInsts);
-}
-
-bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
-                                         BoUpSLP &R) {
-  if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
-    return true;
-
-  bool OpsChanged = false;
-  for (int Idx = 0; Idx < 2; ++Idx) {
-    OpsChanged |=
-        vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
-  }
-  return OpsChanged;
+  LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
+  return tryToVectorizeList(BuildVectorInsts, R, /*AllowReorder=*/true);
 }
 
 bool SLPVectorizerPass::vectorizeSimpleInstructions(
-    SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
+    SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R,
+    bool AtTerminator) {
   bool OpsChanged = false;
+  SmallVector<Instruction *, 4> PostponedCmps;
   for (auto *I : reverse(Instructions)) {
     if (R.isDeleted(I))
       continue;
@@ -7437,10 +8343,29 @@ bool SLPVectorizerPass::vectorizeSimpleInstructions(
       OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
     else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
       OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
-    else if (auto *CI = dyn_cast<CmpInst>(I))
-      OpsChanged |= vectorizeCmpInst(CI, BB, R);
+    else if (isa<CmpInst>(I))
+      PostponedCmps.push_back(I);
+  }
+  if (AtTerminator) {
+    // Try to find reductions first.
+    for (Instruction *I : PostponedCmps) {
+      if (R.isDeleted(I))
+        continue;
+      for (Value *Op : I->operands())
+        OpsChanged |= vectorizeRootInstruction(nullptr, Op, BB, R, TTI);
+    }
+    // Try to vectorize operands as vector bundles.
+    for (Instruction *I : PostponedCmps) {
+      if (R.isDeleted(I))
+        continue;
+      OpsChanged |= tryToVectorize(I, R);
+    }
+    Instructions.clear();
+  } else {
+    // Insert in reverse order, since the PostponedCmps vector was filled in
+    // reverse order.
+    Instructions.assign(PostponedCmps.rbegin(), PostponedCmps.rend());
   }
-  Instructions.clear();
   return OpsChanged;
 }
 
@@ -7448,6 +8373,10 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
   SmallVector<Value *, 4> Incoming;
   SmallPtrSet<Value *, 16> VisitedInstrs;
+  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
+  // node. This allows us to better identify the chains that can be
+  // vectorized.
+  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
 
   bool HaveVectorizedPhiNodes = true;
   while (HaveVectorizedPhiNodes) {
@@ -7460,22 +8389,125 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
       if (!P)
         break;
 
-      if (!VisitedInstrs.count(P) && !R.isDeleted(P))
+      // No need to analyze deleted, vectorized, and non-vectorizable
+      // instructions.
+      if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
+          isValidElementType(P->getType()))
         Incoming.push_back(P);
     }
 
-    // Sort by type.
-    llvm::stable_sort(Incoming, PhiTypeSorterFunc);
+    // Find the corresponding non-phi nodes for better matching when trying to
+    // build the tree.
+    for (Value *V : Incoming) {
+      SmallVectorImpl<Value *> &Opcodes =
+          PHIToOpcodes.try_emplace(V).first->getSecond();
+      if (!Opcodes.empty())
+        continue;
+      SmallVector<Value *, 4> Nodes(1, V);
+      SmallPtrSet<Value *, 4> Visited;
+      while (!Nodes.empty()) {
+        auto *PHI = cast<PHINode>(Nodes.pop_back_val());
+        if (!Visited.insert(PHI).second)
+          continue;
+        for (Value *V : PHI->incoming_values()) {
+          if (auto *PHI1 = dyn_cast<PHINode>(V)) {
+            Nodes.push_back(PHI1);
+            continue;
+          }
+          Opcodes.emplace_back(V);
+        }
+      }
+    }
+
+    // Sort by type, parent, and operands.
+    stable_sort(Incoming, [this, &PHIToOpcodes](Value *V1, Value *V2) {
+      assert(isValidElementType(V1->getType()) &&
+             isValidElementType(V2->getType()) &&
+             "Expected vectorizable types only.");
+      // It is fine to compare type IDs here, since we expect only
+      // vectorizable types, like ints, floats, and pointers; we don't care
+      // about other types.
+      if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
+        return true;
+      if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
+        return false;
+      ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+      ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+      if (Opcodes1.size() < Opcodes2.size())
+        return true;
+      if (Opcodes1.size() > Opcodes2.size())
+        return false;
+      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+        // Undefs are compatible with any other value.
+        if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+          continue;
+        if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+          if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+            DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
+            DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
+            if (!NodeI1)
+              return NodeI2 != nullptr;
+            if (!NodeI2)
+              return false;
+            assert((NodeI1 == NodeI2) ==
+                       (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
+                   "Different nodes should have different DFS numbers");
+            if (NodeI1 != NodeI2)
+              return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
+            InstructionsState S = getSameOpcode({I1, I2});
+            if (S.getOpcode())
+              continue;
+            return I1->getOpcode() < I2->getOpcode();
+          }
+        if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+          continue;
+        if (Opcodes1[I]->getValueID() < Opcodes2[I]->getValueID())
+          return true;
+        if (Opcodes1[I]->getValueID() > Opcodes2[I]->getValueID())
+          return false;
+      }
+      return false;
+    });
+
+    auto &&AreCompatiblePHIs = [&PHIToOpcodes](Value *V1, Value *V2) {
+      if (V1 == V2)
+        return true;
+      if (V1->getType() != V2->getType())
+        return false;
+      ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
+      ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
+      if (Opcodes1.size() != Opcodes2.size())
+        return false;
+      for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
+        // Undefs are compatible with any other value.
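+        // (An undef operand matches anything, so it never breaks a group.)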
+        if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
+          continue;
+        if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
+          if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+            if (I1->getParent() != I2->getParent())
+              return false;
+            InstructionsState S = getSameOpcode({I1, I2});
+            if (S.getOpcode())
+              continue;
+            return false;
+          }
+        if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
+          continue;
+        if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
+          return false;
+      }
+      return true;
+    };
 
     // Try to vectorize elements based on their type.
+    SmallVector<Value *, 4> Candidates;
     for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                            E = Incoming.end();
          IncIt != E;) {
 
-      // Look for the next elements with the same type.
+      // Look for the next elements with the same type, parent, and operand
+      // kinds.
       SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
-      while (SameTypeIt != E &&
-             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+      while (SameTypeIt != E && AreCompatiblePHIs(*SameTypeIt, *IncIt)) {
        VisitedInstrs.insert(*SameTypeIt);
        ++SameTypeIt;
      }
@@ -7488,13 +8520,25 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
      // So allow tryToVectorizeList to reorder them if it is beneficial. This
      // is done when there are exactly two elements since tryToVectorizeList
      // asserts that there are only two values when AllowReorder is true.
-      bool AllowReorder = NumElts == 2;
-      if (NumElts > 1 &&
-          tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
+      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
+                                            /*AllowReorder=*/true)) {
        // Success, start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
-        break;
+      } else if (NumElts < 4 &&
+                 (Candidates.empty() ||
+                  Candidates.front()->getType() == (*IncIt)->getType())) {
+        Candidates.append(IncIt, std::next(IncIt, NumElts));
+      }
+      // Final attempt to vectorize phis with the same types.
+      if (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType()) {
+        if (Candidates.size() > 1 &&
+            tryToVectorizeList(Candidates, R, /*AllowReorder=*/true)) {
+          // Success, start over because instructions might have been changed.
+          HaveVectorizedPhiNodes = true;
+          Changed = true;
+        }
+        Candidates.clear();
       }
 
      // Start over at the next instruction of a different type (or the end).
@@ -7518,7 +8562,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
    // We may go through BB multiple times so skip the one we have checked.
    if (!VisitedInstrs.insert(&*it).second) {
      if (it->use_empty() && KeyNodes.contains(&*it) &&
-          vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
+          vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
+                                      it->isTerminator())) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
@@ -7577,7 +8622,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
      // Start vectorization of post-process list of instructions from the
      // top-tree instructions to try to vectorize as many instructions as
      // possible.
-      OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
+      OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R,
+                                                it->isTerminator());
      if (OpsChanged) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
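The phi loop above and the store-chain loop in the next hunk share one pattern: stable-sort the candidates by a compatibility key, then sweep the sorted list once, growing a maximal run of mutually compatible values and handing every run of two or more to the vectorizer. A minimal standalone sketch of that pattern follows; the names vectorizeCompatibleRuns, AreCompatible, and TryToVectorize are illustrative and not part of the patch:

    // Sketch only: sweep a pre-sorted list, grouping maximal runs of
    // compatible elements and trying to vectorize each run.
    template <typename T, typename CompatFn, typename TryFn>
    static bool vectorizeCompatibleRuns(llvm::MutableArrayRef<T *> Vals,
                                        CompatFn AreCompatible,
                                        TryFn TryToVectorize) {
      bool Changed = false;
      auto *It = Vals.begin();
      while (It != Vals.end()) {
        auto *RunEnd = It;
        // Grow the run while the next element is compatible with its head.
        while (RunEnd != Vals.end() && AreCompatible(*RunEnd, *It))
          ++RunEnd;
        if (RunEnd - It > 1)
          Changed |= TryToVectorize(llvm::makeArrayRef(It, RunEnd - It));
        It = RunEnd; // The next run starts at the first incompatible element.
      }
      return Changed;
    }

Sorting first is what makes a single forward sweep sufficient: compatible values become adjacent, so every group is contiguous.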
@@ -7690,16 +8736,103 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
 bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
   bool Changed = false;
+  // Sort by type, base pointer, and value operand. Value operands must be
+  // compatible (have the same opcode, same parent); otherwise it is
+  // definitely not profitable to try to vectorize them.
+  auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
+    if (V->getPointerOperandType()->getTypeID() <
+        V2->getPointerOperandType()->getTypeID())
+      return true;
+    if (V->getPointerOperandType()->getTypeID() >
+        V2->getPointerOperandType()->getTypeID())
+      return false;
+    // UndefValues are compatible with all other values.
+    if (isa<UndefValue>(V->getValueOperand()) ||
+        isa<UndefValue>(V2->getValueOperand()))
+      return false;
+    if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
+      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
+        DomTreeNodeBase<llvm::BasicBlock> *NodeI1 =
+            DT->getNode(I1->getParent());
+        DomTreeNodeBase<llvm::BasicBlock> *NodeI2 =
+            DT->getNode(I2->getParent());
+        assert(NodeI1 && "Should only process reachable instructions");
+        assert(NodeI2 && "Should only process reachable instructions");
+        assert((NodeI1 == NodeI2) ==
+                   (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
+               "Different nodes should have different DFS numbers");
+        if (NodeI1 != NodeI2)
+          return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
+        InstructionsState S = getSameOpcode({I1, I2});
+        if (S.getOpcode())
+          return false;
+        return I1->getOpcode() < I2->getOpcode();
+      }
+    if (isa<Constant>(V->getValueOperand()) &&
+        isa<Constant>(V2->getValueOperand()))
+      return false;
+    return V->getValueOperand()->getValueID() <
+           V2->getValueOperand()->getValueID();
+  };
+
+  auto &&AreCompatibleStores = [](StoreInst *V1, StoreInst *V2) {
+    if (V1 == V2)
+      return true;
+    if (V1->getPointerOperandType() != V2->getPointerOperandType())
+      return false;
+    // Undefs are compatible with any other value.
+    if (isa<UndefValue>(V1->getValueOperand()) ||
+        isa<UndefValue>(V2->getValueOperand()))
+      return true;
+    if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
+      if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
+        if (I1->getParent() != I2->getParent())
+          return false;
+        InstructionsState S = getSameOpcode({I1, I2});
+        return S.getOpcode() > 0;
+      }
+    if (isa<Constant>(V1->getValueOperand()) &&
+        isa<Constant>(V2->getValueOperand()))
+      return true;
+    return V1->getValueOperand()->getValueID() ==
+           V2->getValueOperand()->getValueID();
+  };
+
   // Attempt to sort and vectorize each of the store-groups.
-  for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
-       ++it) {
-    if (it->second.size() < 2)
+  for (auto &Pair : Stores) {
+    if (Pair.second.size() < 2)
       continue;
 
     LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
-                      << it->second.size() << ".\n");
+                      << Pair.second.size() << ".\n");
+
+    stable_sort(Pair.second, StoreSorter);
 
-    Changed |= vectorizeStores(it->second, R);
+    // Try to vectorize elements based on their compatibility.
+    for (ArrayRef<StoreInst *>::iterator IncIt = Pair.second.begin(),
+                                         E = Pair.second.end();
+         IncIt != E;) {
+
+      // Look for the next elements with the same type.
+      ArrayRef<StoreInst *>::iterator SameTypeIt = IncIt;
+      Type *EltTy = (*IncIt)->getPointerOperand()->getType();
+
+      while (SameTypeIt != E && AreCompatibleStores(*SameTypeIt, *IncIt))
+        ++SameTypeIt;
+
+      // Try to vectorize them.
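+      // At this point [IncIt, SameTypeIt) spans a maximal run of compatible
+      // stores.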
+      unsigned NumElts = (SameTypeIt - IncIt);
+      LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at stores ("
+                        << NumElts << ")\n");
+      if (NumElts > 1 && !EltTy->getPointerElementType()->isVectorTy() &&
+          vectorizeStores(makeArrayRef(IncIt, NumElts), R)) {
+        // Success, start over because instructions might have been changed.
+        Changed = true;
+      }
+
+      // Start over at the next instruction of a different type (or the end).
+      IncIt = SameTypeIt;
+    }
   }
 
   return Changed;
 }
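For reference, a simplified standalone version of the store-compatibility test used above; sameKindOfStore is an illustrative name, and it demands an exact opcode match where the patch's AreCompatibleStores consults getSameOpcode (which also accepts alternate opcodes such as add/sub pairs):

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Simplified sketch of the compatibility test; not the patch's exact
    // predicate.
    static bool sameKindOfStore(StoreInst *A, StoreInst *B) {
      if (A->getPointerOperandType() != B->getPointerOperandType())
        return false;
      Value *VA = A->getValueOperand();
      Value *VB = B->getValueOperand();
      // Undef stored values are compatible with anything.
      if (isa<UndefValue>(VA) || isa<UndefValue>(VB))
        return true;
      if (auto *IA = dyn_cast<Instruction>(VA))
        if (auto *IB = dyn_cast<Instruction>(VB))
          // Same block, same opcode; the real predicate is more permissive.
          return IA->getParent() == IB->getParent() &&
                 IA->getOpcode() == IB->getOpcode();
      // Otherwise compare the kind of value (constant, argument, global, ...).
      return VA->getValueID() == VB->getValueID();
    }

Two stores that pass such a test are worth placing in the same candidate group; anything else starts a new run, mirroring what AreCompatibleStores does for vectorizeStoreChains.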