Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VPlan.h')
| -rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h | 1625 |
1 file changed, 1070 insertions, 555 deletions
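The headline change in this diff is the removal of the unroll-factor (UF) dimension from VPTransformState: generated vector values are now cached per VPValue and scalars per VPLane, instead of per (Part, Lane), with unrolling expressed through dedicated operands and the new VPUnrollPartAccessor helper. A minimal sketch of how a recipe's execute() interacts with the reworked state API; MyRecipe and its two operands are hypothetical, purely for illustration:

// Hypothetical recipe body illustrating the reworked VPTransformState API.
void MyRecipe::execute(VPTransformState &State) {
  // One vector value per VPValue; no unroll-part index anymore.
  Value *WideA = State.get(getOperand(0));
  // Scalar values are addressed by lane only; VPLane(0) is the first lane.
  Value *ScalarB = State.get(getOperand(1), VPLane(0));
  Value *Splat = State.Builder.CreateVectorSplat(State.VF, ScalarB);
  Value *Result = State.Builder.CreateAdd(WideA, Splat);
  State.set(this, Result); // previously: State.set(this, Result, Part)
}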
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0b596e7e4f63..a1ff684b2b80 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -28,7 +28,6 @@ #include "VPlanAnalysis.h" #include "VPlanValue.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -38,6 +37,7 @@ #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/FMF.h" @@ -60,6 +60,7 @@ class RecurrenceDescriptor; class SCEV; class Type; class VPBasicBlock; +class VPBuilder; class VPRegionBlock; class VPlan; class VPReplicateRecipe; @@ -83,9 +84,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step); -const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE, - Loop *CurLoop = nullptr); - /// A helper function that returns the reciprocal of the block probability of /// predicated blocks. If we return X, we are assuming the predicated block /// will execute once for every X iterations of the loop header. @@ -175,6 +173,7 @@ private: Kind LaneKind; public: + VPLane(unsigned Lane) : Lane(Lane), LaneKind(VPLane::Kind::First) {} VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {} static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); } @@ -225,132 +224,88 @@ public: return Lane; } } - - /// Returns the maxmimum number of lanes that we are able to consider - /// caching for \p VF. - static unsigned getNumCachedLanes(const ElementCount &VF) { - return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1); - } -}; - -/// VPIteration represents a single point in the iteration space of the output -/// (vectorized and/or unrolled) IR loop. -struct VPIteration { - /// in [0..UF) - unsigned Part; - - VPLane Lane; - - VPIteration(unsigned Part, unsigned Lane, - VPLane::Kind Kind = VPLane::Kind::First) - : Part(Part), Lane(Lane, Kind) {} - - VPIteration(unsigned Part, const VPLane &Lane) : Part(Part), Lane(Lane) {} - - bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); } }; /// VPTransformState holds information passed down when "executing" a VPlan, /// needed for generating the output IR. struct VPTransformState { - VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, - DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan, LLVMContext &Ctx); - - /// The chosen Vectorization and Unroll Factors of the loop being vectorized. + VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF, + LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, + InnerLoopVectorizer *ILV, VPlan *Plan, + Loop *CurrentParentLoop, Type *CanonicalIVTy); + /// Target Transform Info. + const TargetTransformInfo *TTI; + + /// The chosen Vectorization Factor of the loop being vectorized. ElementCount VF; - unsigned UF; - /// Hold the indices to generate specific scalar instructions. Null indicates + /// Hold the index to generate specific scalar instructions. Null indicates /// that all instances are to be generated, using either scalar or vector /// instructions. 
- std::optional<VPIteration> Instance; + std::optional<VPLane> Lane; struct DataState { - /// A type for vectorized values in the new loop. Each value from the - /// original loop, when vectorized, is represented by UF vector values in - /// the new unrolled loop, where UF is the unroll factor. - typedef SmallVector<Value *, 2> PerPartValuesTy; + // Each value from the original loop, when vectorized, is represented by a + // vector value in the map. + DenseMap<VPValue *, Value *> VPV2Vector; - DenseMap<VPValue *, PerPartValuesTy> PerPartOutput; - - using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>; - DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars; + DenseMap<VPValue *, SmallVector<Value *, 4>> VPV2Scalars; } Data; - /// Get the generated vector Value for a given VPValue \p Def and a given \p - /// Part if \p IsScalar is false, otherwise return the generated scalar - /// for \p Part. \See set. - Value *get(VPValue *Def, unsigned Part, bool IsScalar = false); + /// Get the generated vector Value for a given VPValue \p Def if \p IsScalar + /// is false, otherwise return the generated scalar. \See set. + Value *get(VPValue *Def, bool IsScalar = false); /// Get the generated Value for a given VPValue and given Part and Lane. - Value *get(VPValue *Def, const VPIteration &Instance); + Value *get(VPValue *Def, const VPLane &Lane); - bool hasVectorValue(VPValue *Def, unsigned Part) { - auto I = Data.PerPartOutput.find(Def); - return I != Data.PerPartOutput.end() && Part < I->second.size() && - I->second[Part]; - } + bool hasVectorValue(VPValue *Def) { return Data.VPV2Vector.contains(Def); } - bool hasScalarValue(VPValue *Def, VPIteration Instance) { - auto I = Data.PerPartScalars.find(Def); - if (I == Data.PerPartScalars.end()) + bool hasScalarValue(VPValue *Def, VPLane Lane) { + auto I = Data.VPV2Scalars.find(Def); + if (I == Data.VPV2Scalars.end()) return false; - unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); - return Instance.Part < I->second.size() && - CacheIdx < I->second[Instance.Part].size() && - I->second[Instance.Part][CacheIdx]; + unsigned CacheIdx = Lane.mapToCacheIndex(VF); + return CacheIdx < I->second.size() && I->second[CacheIdx]; } - /// Set the generated vector Value for a given VPValue and a given Part, if \p - /// IsScalar is false. If \p IsScalar is true, set the scalar in (Part, 0). - void set(VPValue *Def, Value *V, unsigned Part, bool IsScalar = false) { + /// Set the generated vector Value for a given VPValue, if \p + /// IsScalar is false. If \p IsScalar is true, set the scalar in lane 0. + void set(VPValue *Def, Value *V, bool IsScalar = false) { if (IsScalar) { - set(Def, V, VPIteration(Part, 0)); + set(Def, V, VPLane(0)); return; } assert((VF.isScalar() || V->getType()->isVectorTy()) && - "scalar values must be stored as (Part, 0)"); - if (!Data.PerPartOutput.count(Def)) { - DataState::PerPartValuesTy Entry(UF); - Data.PerPartOutput[Def] = Entry; - } - Data.PerPartOutput[Def][Part] = V; + "scalar values must be stored as (0, 0)"); + Data.VPV2Vector[Def] = V; } /// Reset an existing vector value for \p Def and a given \p Part. 
- void reset(VPValue *Def, Value *V, unsigned Part) { - auto Iter = Data.PerPartOutput.find(Def); - assert(Iter != Data.PerPartOutput.end() && - "need to overwrite existing value"); - Iter->second[Part] = V; + void reset(VPValue *Def, Value *V) { + assert(Data.VPV2Vector.contains(Def) && "need to overwrite existing value"); + Data.VPV2Vector[Def] = V; } - /// Set the generated scalar \p V for \p Def and the given \p Instance. - void set(VPValue *Def, Value *V, const VPIteration &Instance) { - auto Iter = Data.PerPartScalars.insert({Def, {}}); - auto &PerPartVec = Iter.first->second; - if (PerPartVec.size() <= Instance.Part) - PerPartVec.resize(Instance.Part + 1); - auto &Scalars = PerPartVec[Instance.Part]; - unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); + /// Set the generated scalar \p V for \p Def and the given \p Lane. + void set(VPValue *Def, Value *V, const VPLane &Lane) { + auto &Scalars = Data.VPV2Scalars[Def]; + unsigned CacheIdx = Lane.mapToCacheIndex(VF); if (Scalars.size() <= CacheIdx) Scalars.resize(CacheIdx + 1); assert(!Scalars[CacheIdx] && "should overwrite existing value"); Scalars[CacheIdx] = V; } - /// Reset an existing scalar value for \p Def and a given \p Instance. - void reset(VPValue *Def, Value *V, const VPIteration &Instance) { - auto Iter = Data.PerPartScalars.find(Def); - assert(Iter != Data.PerPartScalars.end() && - "need to overwrite existing value"); - assert(Instance.Part < Iter->second.size() && + /// Reset an existing scalar value for \p Def and a given \p Lane. + void reset(VPValue *Def, Value *V, const VPLane &Lane) { + auto Iter = Data.VPV2Scalars.find(Def); + assert(Iter != Data.VPV2Scalars.end() && "need to overwrite existing value"); - unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); - assert(CacheIdx < Iter->second[Instance.Part].size() && + unsigned CacheIdx = Lane.mapToCacheIndex(VF); + assert(CacheIdx < Iter->second.size() && "need to overwrite existing value"); - Iter->second[Instance.Part][CacheIdx] = V; + Iter->second[CacheIdx] = V; } /// Add additional metadata to \p To that was not present on \p Orig. @@ -371,7 +326,7 @@ struct VPTransformState { void setDebugLocFrom(DebugLoc DL); /// Construct the vector value of a scalarized value \p V one lane at a time. - void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance); + void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane); /// Hold state information used when constructing the CFG of the output IR, /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. @@ -414,8 +369,8 @@ struct VPTransformState { /// Pointer to the VPlan code is generated for. VPlan *Plan; - /// The loop object for the current parent region, or nullptr. - Loop *CurrentVectorLoop = nullptr; + /// The parent loop object for the current scope, or nullptr. + Loop *CurrentParentLoop = nullptr; /// LoopVersioning. It's only set up (non-null) if memchecks were /// used. @@ -482,6 +437,26 @@ class VPBlockBase { Successors.erase(Pos); } + /// This function replaces one predecessor with another, useful when + /// trying to replace an old block in the CFG with a new one. + void replacePredecessor(VPBlockBase *Old, VPBlockBase *New) { + auto I = find(Predecessors, Old); + assert(I != Predecessors.end()); + assert(Old->getParent() == New->getParent() && + "replaced predecessor must have the same parent"); + *I = New; + } + + /// This function replaces one successor with another, useful when + /// trying to replace an old block in the CFG with a new one. 
+ void replaceSuccessor(VPBlockBase *Old, VPBlockBase *New) { + auto I = find(Successors, Old); + assert(I != Successors.end()); + assert(Old->getParent() == New->getParent() && + "replaced successor must have the same parent"); + *I = New; + } + protected: VPBlockBase(const unsigned char SC, const std::string &N) : SubclassID(SC), Name(N) {} @@ -535,6 +510,7 @@ public: VPBlocksTy &getSuccessors() { return Successors; } iterator_range<VPBlockBase **> successors() { return Successors; } + iterator_range<VPBlockBase **> predecessors() { return Predecessors; } const VPBlocksTy &getPredecessors() const { return Predecessors; } VPBlocksTy &getPredecessors() { return Predecessors; } @@ -641,6 +617,14 @@ public: /// Remove all the successors of this block. void clearSuccessors() { Successors.clear(); } + /// Swap successors of the block. The block must have exactly 2 successors. + // TODO: This should be part of introducing conditional branch recipes rather + // than being independent. + void swapSuccessors() { + assert(Successors.size() == 2 && "must have 2 successors to swap"); + std::swap(Successors[0], Successors[1]); + } + /// The method which generates the output IR that correspond to this /// VPBlockBase, thereby "executing" the VPlan. virtual void execute(VPTransformState *State) = 0; @@ -648,9 +632,6 @@ public: /// Return the cost of the block. virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx) = 0; - /// Delete all blocks reachable from a given VPBlockBase, inclusive. - static void deleteCFG(VPBlockBase *Entry); - /// Return true if it is legal to hoist instructions into this block. bool isLegalToHoistInto() { // There are currently no constraints that prevent an instruction to be @@ -658,12 +639,8 @@ public: return true; } - /// Replace all operands of VPUsers in the block with \p NewValue and also - /// replaces all uses of VPValues defined in the block with NewValue. - virtual void dropAllReferences(VPValue *NewValue) = 0; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - void printAsOperand(raw_ostream &OS, bool PrintType) const { + void printAsOperand(raw_ostream &OS, bool PrintType = false) const { OS << getName(); } @@ -696,54 +673,21 @@ public: virtual VPBlockBase *clone() = 0; }; -/// A value that is used outside the VPlan. The operand of the user needs to be -/// added to the associated phi node. The incoming block from VPlan is -/// determined by where the VPValue is defined: if it is defined by a recipe -/// outside a region, its parent block is used, otherwise the middle block is -/// used. -class VPLiveOut : public VPUser { - PHINode *Phi; - -public: - VPLiveOut(PHINode *Phi, VPValue *Op) - : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {} - - static inline bool classof(const VPUser *U) { - return U->getVPUserID() == VPUser::VPUserID::LiveOut; - } - - /// Fix the wrapped phi node. This means adding an incoming value to exit - /// block phi's from the vector loop via middle block (values from scalar loop - /// already reach these phi's), and updating the value to scalar header phi's - /// from the scalar preheader. - void fixPhi(VPlan &Plan, VPTransformState &State); - - /// Returns true if the VPLiveOut uses scalars of operand \p Op. - bool usesScalars(const VPValue *Op) const override { - assert(is_contained(operands(), Op) && - "Op must be an operand of the recipe"); - return true; - } - - PHINode *getPhi() const { return Phi; } - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print the VPLiveOut to \p O. 
- void print(raw_ostream &O, VPSlotTracker &SlotTracker) const; -#endif -}; - /// Struct to hold various analysis needed for cost computations. struct VPCostContext { const TargetTransformInfo &TTI; + const TargetLibraryInfo &TLI; VPTypeAnalysis Types; LLVMContext &LLVMCtx; LoopVectorizationCostModel &CM; SmallPtrSet<Instruction *, 8> SkipCostComputation; + TargetTransformInfo::TargetCostKind CostKind; - VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy, - LLVMContext &LLVMCtx, LoopVectorizationCostModel &CM) - : TTI(TTI), Types(CanIVTy, LLVMCtx), LLVMCtx(LLVMCtx), CM(CM) {} + VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, + Type *CanIVTy, LoopVectorizationCostModel &CM, + TargetTransformInfo::TargetCostKind CostKind) + : TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()), + CM(CM), CostKind(CostKind) {} /// Return the cost for \p UI with \p VF using the legacy cost model as /// fallback until computing the cost of all recipes migrates to VPlan. @@ -752,6 +696,9 @@ struct VPCostContext { /// Return true if the cost for \p UI shouldn't be computed, e.g. because it /// has already been pre-computed. bool skipCostComputation(Instruction *UI, bool IsVector) const; + + /// Returns the OperandInfo for \p V, if it is a live-in. + TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const; }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR @@ -774,12 +721,12 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>, public: VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands, DebugLoc DL = {}) - : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {} + : VPDef(SC), VPUser(Operands), DL(DL) {} template <typename IterT> VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands, DebugLoc DL = {}) - : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe), DL(DL) {} + : VPDef(SC), VPUser(Operands), DL(DL) {} virtual ~VPRecipeBase() = default; /// Clone the current recipe. @@ -796,7 +743,7 @@ public: /// Return the cost of this recipe, taking into account if the cost /// computation should be skipped and the ForceTargetInstructionCost flag. /// Also takes care of printing the cost for debugging. - virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx); + InstructionCost cost(ElementCount VF, VPCostContext &Ctx); /// Insert an unlinked recipe into a basic block immediately before /// the specified recipe. @@ -833,9 +780,7 @@ public: return true; } - static inline bool classof(const VPUser *U) { - return U->getVPUserID() == VPUser::VPUserID::Recipe; - } + static inline bool classof(const VPUser *U) { return true; } /// Returns true if the recipe may have side-effects. bool mayHaveSideEffects() const; @@ -860,9 +805,11 @@ public: DebugLoc getDebugLoc() const { return DL; } protected: - /// Compute the cost of this recipe using the legacy cost model and the - /// underlying instructions. - InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const; + /// Compute the cost of this recipe either using a recipe's specialized + /// implementation or using the legacy cost model and the underlying + /// instructions. + virtual InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const; }; // Helper macro to define common classof implementations for recipes. 
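The cost-modeling plumbing changes in the same direction: VPRecipeBase::cost() becomes the non-virtual entry point that handles skipped and forced costs, dispatching to a virtual computeCost() hook, while VPCostContext now carries the TargetLibraryInfo and a TargetCostKind next to TTI. A sketch of what a recipe-level override could look like; the recipe name and the particular TTI query are illustrative assumptions, not code from this patch:

// Hypothetical override of the virtual hook; VPRecipeBase::cost() invokes
// it after consulting Ctx.skipCostComputation() and the
// ForceTargetInstructionCost flag.
InstructionCost MyWidenRecipe::computeCost(ElementCount VF,
                                           VPCostContext &Ctx) const {
  Type *ScalarTy = Ctx.Types.inferScalarType(this); // VPTypeAnalysis query
  auto *VecTy = VectorType::get(ScalarTy, VF);
  return Ctx.TTI.getArithmeticInstrCost(getOpcode(), VecTy, Ctx.CostKind);
}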
@@ -914,11 +861,14 @@ public: case VPRecipeBase::VPReplicateSC: case VPRecipeBase::VPScalarIVStepsSC: case VPRecipeBase::VPVectorPointerSC: + case VPRecipeBase::VPReverseVectorPointerSC: case VPRecipeBase::VPWidenCallSC: case VPRecipeBase::VPWidenCanonicalIVSC: case VPRecipeBase::VPWidenCastSC: case VPRecipeBase::VPWidenGEPSC: + case VPRecipeBase::VPWidenIntrinsicSC: case VPRecipeBase::VPWidenSC: + case VPRecipeBase::VPWidenEVLSC: case VPRecipeBase::VPWidenSelectSC: case VPRecipeBase::VPBlendSC: case VPRecipeBase::VPPredInstPHISC: @@ -930,13 +880,16 @@ public: case VPRecipeBase::VPWidenPointerInductionSC: case VPRecipeBase::VPReductionPHISC: case VPRecipeBase::VPScalarCastSC: + case VPRecipeBase::VPPartialReductionSC: return true; - case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPBranchOnMaskSC: + case VPRecipeBase::VPInterleaveSC: + case VPRecipeBase::VPIRInstructionSC: case VPRecipeBase::VPWidenLoadEVLSC: case VPRecipeBase::VPWidenLoadSC: case VPRecipeBase::VPWidenStoreEVLSC: case VPRecipeBase::VPWidenStoreSC: + case VPRecipeBase::VPHistogramSC: // TODO: Widened stores don't define a value, but widened loads do. Split // the recipes to be able to make widened loads VPSingleDefRecipes. return false; @@ -958,6 +911,11 @@ public: const Instruction *getUnderlyingInstr() const { return cast<Instruction>(getUnderlyingValue()); } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print this VPSingleDefRecipe to dbgs() (for debugging). + LLVM_DUMP_METHOD void dump() const; +#endif }; /// Class to record LLVM IR flag for a recipe along with it. @@ -986,12 +944,6 @@ public: DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {} }; -protected: - struct GEPFlagsTy { - char IsInBounds : 1; - GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {} - }; - private: struct ExactFlagsTy { char IsExact : 1; @@ -1018,7 +970,7 @@ private: WrapFlagsTy WrapFlags; DisjointFlagsTy DisjointFlags; ExactFlagsTy ExactFlags; - GEPFlagsTy GEPFlags; + GEPNoWrapFlags GEPFlags; NonNegFlagsTy NonNegFlags; FastMathFlagsTy FMFs; unsigned AllFlags; @@ -1055,7 +1007,7 @@ public: ExactFlags.IsExact = Op->isExact(); } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) { OpType = OperationType::GEPOp; - GEPFlags.IsInBounds = GEP->isInBounds(); + GEPFlags = GEP->getNoWrapFlags(); } else if (auto *PNNI = dyn_cast<PossiblyNonNegInst>(&I)) { OpType = OperationType::NonNegOp; NonNegFlags.NonNeg = PNNI->hasNonNeg(); @@ -1095,7 +1047,7 @@ public: protected: template <typename IterT> VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, - GEPFlagsTy GEPFlags, DebugLoc DL = {}) + GEPNoWrapFlags GEPFlags, DebugLoc DL = {}) : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::GEPOp), GEPFlags(GEPFlags) {} @@ -1103,9 +1055,11 @@ public: static inline bool classof(const VPRecipeBase *R) { return R->getVPDefID() == VPRecipeBase::VPInstructionSC || R->getVPDefID() == VPRecipeBase::VPWidenSC || + R->getVPDefID() == VPRecipeBase::VPWidenEVLSC || R->getVPDefID() == VPRecipeBase::VPWidenGEPSC || R->getVPDefID() == VPRecipeBase::VPWidenCastSC || R->getVPDefID() == VPRecipeBase::VPReplicateSC || + R->getVPDefID() == VPRecipeBase::VPReverseVectorPointerSC || R->getVPDefID() == VPRecipeBase::VPVectorPointerSC; } @@ -1130,7 +1084,7 @@ public: ExactFlags.IsExact = false; break; case OperationType::GEPOp: - GEPFlags.IsInBounds = false; + GEPFlags = GEPNoWrapFlags::none(); break; case OperationType::FPMathOp: FMFs.NoNaNs = false; @@ -1159,10 +1113,7 @@ public: I->setIsExact(ExactFlags.IsExact); break; case 
OperationType::GEPOp: - // TODO(gep_nowrap): Track the full GEPNoWrapFlags in VPlan. - cast<GetElementPtrInst>(I)->setNoWrapFlags( - GEPFlags.IsInBounds ? GEPNoWrapFlags::inBounds() - : GEPNoWrapFlags::none()); + cast<GetElementPtrInst>(I)->setNoWrapFlags(GEPFlags); break; case OperationType::FPMathOp: I->setHasAllowReassoc(FMFs.AllowReassoc); @@ -1188,11 +1139,7 @@ public: return CmpPredicate; } - bool isInBounds() const { - assert(OpType == OperationType::GEPOp && - "recipe doesn't have inbounds flag"); - return GEPFlags.IsInBounds; - } + GEPNoWrapFlags getGEPNoWrapFlags() const { return GEPFlags; } /// Returns true if the recipe has fast-math flags. bool hasFastMathFlags() const { return OpType == OperationType::FPMathOp; } @@ -1222,11 +1169,24 @@ public: #endif }; +/// Helper to access the operand that contains the unroll part for this recipe +/// after unrolling. +template <unsigned PartOpIdx> class VPUnrollPartAccessor { +protected: + /// Return the VPValue operand containing the unroll part or null if there is + /// no such operand. + VPValue *getUnrollPartOperand(VPUser &U) const; + + /// Return the unroll part. + unsigned getUnrollPart(VPUser &U) const; +}; + /// This is a concrete Recipe that models a single VPlan-level instruction. /// While as any Recipe it may generate a sequence of IR instructions when /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. -class VPInstruction : public VPRecipeWithIRFlags { +class VPInstruction : public VPRecipeWithIRFlags, + public VPUnrollPartAccessor<1> { friend class VPlanSlp; public: @@ -1253,15 +1213,16 @@ public: ComputeReductionResult, // Takes the VPValue to extract from as first operand and the lane or part // to extract as second operand, counting from the end starting with 1 for - // last. The second operand must be a positive constant and <= VF when - // extracting from a vector or <= UF when extracting from an unrolled - // scalar. + // last. The second operand must be a positive constant and <= VF. ExtractFromEnd, LogicalAnd, // Non-poison propagating logical And. // Add an offset in bytes (second operand) to a base pointer (first // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). PtrAdd, + // Returns a scalar boolean value, which is true if any lane of its (only + // boolean) vector operand is true. + AnyOf, }; private: @@ -1283,16 +1244,15 @@ private: /// needed. bool canGenerateScalarForFirstLane() const; - /// Utility methods serving execute(): generates a single instance of the - /// modeled instruction for a given part. \returns the generated value for \p - /// Part. In some cases an existing value is returned rather than a generated - /// one. - Value *generatePerPart(VPTransformState &State, unsigned Part); + /// Utility methods serving execute(): generates a single vector instance of + /// the modeled instruction. \returns the generated value. In some cases an + /// existing value is returned rather than a generated one. + Value *generate(VPTransformState &State); /// Utility methods serving execute(): generates a scalar single instance of /// the modeled instruction for a given lane. \returns the scalar generated /// value for lane \p Lane.
- Value *generatePerLane(VPTransformState &State, const VPIteration &Lane); + Value *generatePerLane(VPTransformState &State, const VPLane &Lane); #if !defined(NDEBUG) /// Return true if the VPInstruction is a floating point math operation, i.e. @@ -1326,6 +1286,12 @@ public: assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint"); } + VPInstruction(VPValue *Ptr, VPValue *Offset, GEPNoWrapFlags Flags, + DebugLoc DL = {}, const Twine &Name = "") + : VPRecipeWithIRFlags(VPDef::VPInstructionSC, + ArrayRef<VPValue *>({Ptr, Offset}), Flags, DL), + Opcode(VPInstruction::PtrAdd), Name(Name.str()) {} + VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands, FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = ""); @@ -1345,6 +1311,13 @@ public: /// provided. void execute(VPTransformState &State) override; + /// Return the cost of this VPInstruction. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override { + // TODO: Compute accurate cost after retiring the legacy cost model. + return 0; + } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the VPInstruction to \p O. void print(raw_ostream &O, const Twine &Indent, @@ -1354,14 +1327,6 @@ public: LLVM_DUMP_METHOD void dump() const; #endif - /// Return true if this instruction may modify memory. - bool mayWriteToMemory() const { - // TODO: we can use attributes of the called function to rule out memory - // modifications. - return Opcode == Instruction::Store || Opcode == Instruction::Call || - Opcode == Instruction::Invoke || Opcode == SLPStore; - } - bool hasResult() const { // CallInst may or may not have a result, depending on the called function. // Conservatively return calls have results for now. @@ -1384,6 +1349,9 @@ public: } } + /// Returns true if the underlying opcode may read from or write to memory. + bool opcodeMayReadOrWriteFromMemory() const; + /// Returns true if the recipe only uses the first lane of operand \p Op. bool onlyFirstLaneUsed(const VPValue *Op) const override; @@ -1397,6 +1365,69 @@ public: /// Returns true if this VPInstruction's operands are single scalars and the /// result is also a single scalar. bool isSingleScalar() const; + + /// Returns the symbolic name assigned to the VPInstruction. + StringRef getName() const { return Name; } +}; + +/// A recipe to wrap an original IR instruction not to be modified during +/// execution, except for PHIs. For PHIs, a single VPValue operand is allowed, +/// and it is used to add a new incoming value for the single predecessor VPBB. +/// Except for PHIs, VPIRInstructions cannot have any operands. +class VPIRInstruction : public VPRecipeBase { + Instruction &I; + +public: + VPIRInstruction(Instruction &I) + : VPRecipeBase(VPDef::VPIRInstructionSC, ArrayRef<VPValue *>()), I(I) {} + + ~VPIRInstruction() override = default; + + VP_CLASSOF_IMPL(VPDef::VPIRInstructionSC) + + VPIRInstruction *clone() override { + auto *R = new VPIRInstruction(I); + for (auto *Op : operands()) + R->addOperand(Op); + return R; + } + + void execute(VPTransformState &State) override; + + /// Return the cost of this VPIRInstruction. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + + Instruction &getInstruction() const { return I; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } + + bool onlyFirstPartUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } + + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } + + /// Update the recipe's single operand to the last lane of the operand using \p + /// Builder. Must only be used for single operand VPIRInstructions wrapping a + /// PHINode. + void extractLastLaneOfOperand(VPBuilder &Builder); }; /// VPWidenRecipe is a recipe for producing a widened instruction using the @@ -1406,11 +1437,16 @@ class VPWidenRecipe : public VPRecipeWithIRFlags { unsigned Opcode; +protected: + template <typename IterT> + VPWidenRecipe(unsigned VPDefOpcode, Instruction &I, + iterator_range<IterT> Operands) + : VPRecipeWithIRFlags(VPDefOpcode, Operands, I), Opcode(I.getOpcode()) {} + public: template <typename IterT> VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) - : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), - Opcode(I.getOpcode()) {} + : VPWidenRecipe(VPDef::VPWidenSC, I, Operands) {} ~VPWidenRecipe() override = default; @@ -1420,12 +1456,24 @@ public: return R; } - VP_CLASSOF_IMPL(VPDef::VPWidenSC) + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPWidenSC || + R->getVPDefID() == VPRecipeBase::VPWidenEVLSC; + } + + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast<VPRecipeBase>(U); + return R && classof(R); + } /// Produce a widened instruction using the opcode and operands of the recipe, /// processing State.VF elements. void execute(VPTransformState &State) override; + /// Return the cost of this VPWidenRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + unsigned getOpcode() const { return Opcode; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -1435,6 +1483,54 @@ public: #endif }; +/// A recipe for widening operations with vector-predication intrinsics with +/// explicit vector length (EVL). +class VPWidenEVLRecipe : public VPWidenRecipe { + using VPRecipeWithIRFlags::transferFlags; + +public: + template <typename IterT> + VPWidenEVLRecipe(Instruction &I, iterator_range<IterT> Operands, VPValue &EVL) + : VPWidenRecipe(VPDef::VPWidenEVLSC, I, Operands) { + addOperand(&EVL); + } + VPWidenEVLRecipe(VPWidenRecipe &W, VPValue &EVL) + : VPWidenEVLRecipe(*W.getUnderlyingInstr(), W.operands(), EVL) { + transferFlags(W); + } + + ~VPWidenEVLRecipe() override = default; + + VPWidenRecipe *clone() override final { + llvm_unreachable("VPWidenEVLRecipe cannot be cloned"); + return nullptr; + } + + VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC); + + VPValue *getEVL() { return getOperand(getNumOperands() - 1); } + const VPValue *getEVL() const { return getOperand(getNumOperands() - 1); } + + /// Produce a vp-intrinsic using the opcode and operands of the recipe, + /// processing EVL elements. + void execute(VPTransformState &State) override final; + + /// Returns true if the recipe only uses the first lane of operand \p Op.
+ bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + // EVL in that recipe is always the last operand, thus any use before means + // the VPValue should be vectorized. + return getEVL() == Op; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override final; +#endif +}; + /// VPWidenCastRecipe is a recipe to create vector cast instructions. class VPWidenCastRecipe : public VPRecipeWithIRFlags { /// Cast instruction opcode. @@ -1471,6 +1567,10 @@ public: /// Produce widened copies of the cast. void execute(VPTransformState &State) override; + /// Return the cost of this VPWidenCastRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -1489,23 +1589,32 @@ class VPScalarCastRecipe : public VPSingleDefRecipe { Type *ResultTy; - Value *generate(VPTransformState &State, unsigned Part); + Value *generate(VPTransformState &State); public: - VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy) - : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}), Opcode(Opcode), + VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, + DebugLoc DL) + : VPSingleDefRecipe(VPDef::VPScalarCastSC, {Op}, DL), Opcode(Opcode), ResultTy(ResultTy) {} ~VPScalarCastRecipe() override = default; VPScalarCastRecipe *clone() override { - return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy); + return new VPScalarCastRecipe(Opcode, getOperand(0), ResultTy, + getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPScalarCastSC) void execute(VPTransformState &State) override; + /// Return the cost of this VPScalarCastRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override { + // TODO: Compute accurate cost after retiring the legacy cost model. + return 0; + } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; @@ -1522,24 +1631,111 @@ public: } }; -/// A recipe for widening Call instructions. -class VPWidenCallRecipe : public VPSingleDefRecipe { - /// ID of the vector intrinsic to call when widening the call. If set the - /// Intrinsic::not_intrinsic, a library call will be used instead. +/// A recipe for widening vector intrinsics. +class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags { + /// ID of the vector intrinsic to widen. Intrinsic::ID VectorIntrinsicID; - /// If this recipe represents a library call, Variant stores a pointer to - /// the chosen function. There is a 1:1 mapping between a given VF and the - /// chosen vectorized variant, so there will be a different vplan for each - /// VF with a valid variant. + + /// Scalar return type of the intrinsic. + Type *ResultTy; + + /// True if the intrinsic may read from memory. + bool MayReadFromMemory; + + /// True if the intrinsic may write to memory. + bool MayWriteToMemory; + + /// True if the intrinsic may have side-effects.
+ bool MayHaveSideEffects; + +public: + VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID, + ArrayRef<VPValue *> CallArguments, Type *Ty, + DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI), + VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), + MayReadFromMemory(CI.mayReadFromMemory()), + MayWriteToMemory(CI.mayWriteToMemory()), + MayHaveSideEffects(CI.mayHaveSideEffects()) {} + + VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, + ArrayRef<VPValue *> CallArguments, Type *Ty, + DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL), + VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) { + LLVMContext &Ctx = Ty->getContext(); + AttributeList Attrs = Intrinsic::getAttributes(Ctx, VectorIntrinsicID); + MemoryEffects ME = Attrs.getMemoryEffects(); + MayReadFromMemory = !ME.onlyWritesMemory(); + MayWriteToMemory = !ME.onlyReadsMemory(); + MayHaveSideEffects = MayWriteToMemory || + !Attrs.hasFnAttr(Attribute::NoUnwind) || + !Attrs.hasFnAttr(Attribute::WillReturn); + } + + VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, + std::initializer_list<VPValue *> CallArguments, + Type *Ty, DebugLoc DL = {}) + : VPWidenIntrinsicRecipe(VectorIntrinsicID, + ArrayRef<VPValue *>(CallArguments), Ty, DL) {} + + ~VPWidenIntrinsicRecipe() override = default; + + VPWidenIntrinsicRecipe *clone() override { + return new VPWidenIntrinsicRecipe(*cast<CallInst>(getUnderlyingValue()), + VectorIntrinsicID, {op_begin(), op_end()}, + ResultTy, getDebugLoc()); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenIntrinsicSC) + + /// Produce a widened version of the vector intrinsic. + void execute(VPTransformState &State) override; + + /// Return the cost of this vector intrinsic. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + + /// Return the ID of the intrinsic. + Intrinsic::ID getVectorIntrinsicID() const { return VectorIntrinsicID; } + + /// Return the scalar return type of the intrinsic. + Type *getResultType() const { return ResultTy; } + + /// Return the name of the intrinsic as a string. + StringRef getIntrinsicName() const; + + /// Returns true if the intrinsic may read from memory. + bool mayReadFromMemory() const { return MayReadFromMemory; } + + /// Returns true if the intrinsic may write to memory. + bool mayWriteToMemory() const { return MayWriteToMemory; } + + /// Returns true if the intrinsic may have side-effects. + bool mayHaveSideEffects() const { return MayHaveSideEffects; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + bool onlyFirstLaneUsed(const VPValue *Op) const override; +}; + +/// A recipe for widening Call instructions using library calls. +class VPWidenCallRecipe : public VPRecipeWithIRFlags { + /// Variant stores a pointer to the chosen function. There is a 1:1 mapping + /// between a given VF and the chosen vectorized variant, so there will be a + /// different VPlan for each VF with a valid variant.
Function *Variant; public: - template <typename IterT> - VPWidenCallRecipe(Value *UV, iterator_range<IterT> CallArguments, - Intrinsic::ID VectorIntrinsicID, DebugLoc DL = {}, - Function *Variant = nullptr) - : VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, UV, DL), - VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) { + VPWidenCallRecipe(Value *UV, Function *Variant, + ArrayRef<VPValue *> CallArguments, DebugLoc DL = {}) + : VPRecipeWithIRFlags(VPDef::VPWidenCallSC, CallArguments, + *cast<Instruction>(UV)), + Variant(Variant) { assert( isa<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()) && "last operand must be the called function"); @@ -1548,8 +1744,8 @@ public: ~VPWidenCallRecipe() override = default; VPWidenCallRecipe *clone() override { - return new VPWidenCallRecipe(getUnderlyingValue(), operands(), - VectorIntrinsicID, getDebugLoc(), Variant); + return new VPWidenCallRecipe(getUnderlyingValue(), Variant, + {op_begin(), op_end()}, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenCallSC) @@ -1557,6 +1753,10 @@ public: /// Produce a widened version of the call instruction. void execute(VPTransformState &State) override; + /// Return the cost of this VPWidenCallRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + Function *getCalledScalarFunction() const { return cast<Function>(getOperand(getNumOperands() - 1)->getLiveInIRValue()); } @@ -1575,12 +1775,56 @@ public: #endif }; +/// A recipe representing a sequence of load -> update -> store as part of +/// a histogram operation. This means there may be aliasing between vector +/// lanes, which is handled by the llvm.experimental.vector.histogram family +/// of intrinsics. The only update operations currently supported are +/// 'add' and 'sub' where the other term is loop-invariant. +class VPHistogramRecipe : public VPRecipeBase { + /// Opcode of the update operation, currently either add or sub. + unsigned Opcode; + +public: + template <typename IterT> + VPHistogramRecipe(unsigned Opcode, iterator_range<IterT> Operands, + DebugLoc DL = {}) + : VPRecipeBase(VPDef::VPHistogramSC, Operands, DL), Opcode(Opcode) {} + + ~VPHistogramRecipe() override = default; + + VPHistogramRecipe *clone() override { + return new VPHistogramRecipe(Opcode, operands(), getDebugLoc()); + } + + VP_CLASSOF_IMPL(VPDef::VPHistogramSC); + + /// Produce a vectorized histogram operation. + void execute(VPTransformState &State) override; + + /// Return the cost of this VPHistogramRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + + unsigned getOpcode() const { return Opcode; } + + /// Return the mask operand if one was provided, or a null pointer if all + /// lanes should be executed unconditionally. + VPValue *getMask() const { + return getNumOperands() == 3 ? getOperand(2) : nullptr; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for widening select instructions. 
-struct VPWidenSelectRecipe : public VPSingleDefRecipe { +struct VPWidenSelectRecipe : public VPRecipeWithIRFlags { template <typename IterT> VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands) - : VPSingleDefRecipe(VPDef::VPWidenSelectSC, Operands, &I, - I.getDebugLoc()) {} + : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, I) {} ~VPWidenSelectRecipe() override = default; @@ -1594,6 +1838,10 @@ struct VPWidenSelectRecipe : public VPSingleDefRecipe { /// Produce a widened version of the select instruction. void execute(VPTransformState &State) override; + /// Return the cost of this VPWidenSelectRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -1605,23 +1853,23 @@ struct VPWidenSelectRecipe : public VPSingleDefRecipe { } bool isInvariantCond() const { - return getCond()->isDefinedOutsideVectorRegions(); + return getCond()->isDefinedOutsideLoopRegions(); } }; /// A recipe for handling GEP instructions. class VPWidenGEPRecipe : public VPRecipeWithIRFlags { bool isPointerLoopInvariant() const { - return getOperand(0)->isDefinedOutsideVectorRegions(); + return getOperand(0)->isDefinedOutsideLoopRegions(); } bool isIndexLoopInvariant(unsigned I) const { - return getOperand(I + 1)->isDefinedOutsideVectorRegions(); + return getOperand(I + 1)->isDefinedOutsideLoopRegions(); } bool areAllOperandsInvariant() const { return all_of(operands(), [](VPValue *Op) { - return Op->isDefinedOutsideVectorRegions(); + return Op->isDefinedOutsideLoopRegions(); }); } @@ -1642,6 +1890,67 @@ public: /// Generate the gep nodes. void execute(VPTransformState &State) override; + /// Return the cost of this VPWidenGEPRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override { + // TODO: Compute accurate cost after retiring the legacy cost model. + return 0; + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + +/// A recipe to compute the pointers for widened memory accesses of IndexTy +/// in reverse order. +class VPReverseVectorPointerRecipe : public VPRecipeWithIRFlags, + public VPUnrollPartAccessor<2> { + Type *IndexedTy; + +public: + VPReverseVectorPointerRecipe(VPValue *Ptr, VPValue *VF, Type *IndexedTy, + GEPNoWrapFlags GEPFlags, DebugLoc DL) + : VPRecipeWithIRFlags(VPDef::VPReverseVectorPointerSC, + ArrayRef<VPValue *>({Ptr, VF}), GEPFlags, DL), + IndexedTy(IndexedTy) {} + + VP_CLASSOF_IMPL(VPDef::VPReverseVectorPointerSC) + + VPValue *getVFValue() { return getOperand(1); } + const VPValue *getVFValue() const { return getOperand(1); } + + void execute(VPTransformState &State) override; + + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } + + /// Return the cost of this VPReverseVectorPointerRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override { + // TODO: Compute accurate cost after retiring the legacy cost model. + return 0; + } + + /// Returns true if the recipe only uses the first part of operand \p Op.
+ bool onlyFirstPartUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + assert(getNumOperands() <= 2 && "must have at most two operands"); + return true; + } + + VPReverseVectorPointerRecipe *clone() override { + return new VPReverseVectorPointerRecipe(getOperand(0), getVFValue(), + IndexedTy, getGEPNoWrapFlags(), + getDebugLoc()); + } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -1649,19 +1958,17 @@ public: #endif }; -/// A recipe to compute the pointers for widened memory accesses of IndexTy for -/// all parts. If IsReverse is true, compute pointers for accessing the input in -/// reverse order per part. -class VPVectorPointerRecipe : public VPRecipeWithIRFlags { +/// A recipe to compute the pointers for widened memory accesses of IndexTy. +class VPVectorPointerRecipe : public VPRecipeWithIRFlags, + public VPUnrollPartAccessor<1> { Type *IndexedTy; - bool IsReverse; public: - VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, bool IsReverse, - bool IsInBounds, DebugLoc DL) + VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags, + DebugLoc DL) : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr), - GEPFlagsTy(IsInBounds), DL), - IndexedTy(IndexedTy), IsReverse(IsReverse) {} + GEPFlags, DL), + IndexedTy(IndexedTy) {} VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) @@ -1673,9 +1980,24 @@ public: return true; } + /// Returns true if the recipe only uses the first part of operand \p Op. + bool onlyFirstPartUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + assert(getNumOperands() <= 2 && "must have at most two operands"); + return true; + } + VPVectorPointerRecipe *clone() override { - return new VPVectorPointerRecipe(getOperand(0), IndexedTy, IsReverse, - isInBounds(), getDebugLoc()); + return new VPVectorPointerRecipe(getOperand(0), IndexedTy, + getGEPNoWrapFlags(), getDebugLoc()); + } + + /// Return the cost of this VPVectorPointerRecipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override { + // TODO: Compute accurate cost after retiring the legacy cost model. + return 0; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -1734,6 +2056,10 @@ public: /// Generate the phi nodes. void execute(VPTransformState &State) override = 0; + /// Return the cost of this header phi recipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -1763,34 +2089,90 @@ public: } }; +/// Base class for widened induction (VPWidenIntOrFpInductionRecipe and +/// VPWidenPointerInductionRecipe), providing shared functionality, including +/// retrieving the step value, induction descriptor and original phi node.
+class VPWidenInductionRecipe : public VPHeaderPHIRecipe { + const InductionDescriptor &IndDesc; + +public: + VPWidenInductionRecipe(unsigned char Kind, PHINode *IV, VPValue *Start, + VPValue *Step, const InductionDescriptor &IndDesc, + DebugLoc DL) + : VPHeaderPHIRecipe(Kind, IV, Start, DL), IndDesc(IndDesc) { + addOperand(Step); + } + + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPDef::VPWidenIntOrFpInductionSC || + R->getVPDefID() == VPDef::VPWidenPointerInductionSC; + } + + static inline bool classof(const VPValue *V) { + auto *R = V->getDefiningRecipe(); + return R && classof(R); + } + + static inline bool classof(const VPHeaderPHIRecipe *R) { + return classof(static_cast<const VPRecipeBase *>(R)); + } + + virtual void execute(VPTransformState &State) override = 0; + + /// Returns the step value of the induction. + VPValue *getStepValue() { return getOperand(1); } + const VPValue *getStepValue() const { return getOperand(1); } + + PHINode *getPHINode() const { return cast<PHINode>(getUnderlyingValue()); } + + /// Returns the induction descriptor for the recipe. + const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } + + VPValue *getBackedgeValue() override { + // TODO: All operands of base recipe must exist and be at same index in + // derived recipe. + llvm_unreachable( + "VPWidenIntOrFpInductionRecipe generates its own backedge value"); + } + + VPRecipeBase &getBackedgeRecipe() override { + // TODO: All operands of base recipe must exist and be at same index in + // derived recipe. + llvm_unreachable( + "VPWidenIntOrFpInductionRecipe generates its own backedge value"); + } +}; + /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector values. 
-class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe { - PHINode *IV; +class VPWidenIntOrFpInductionRecipe : public VPWidenInductionRecipe { TruncInst *Trunc; - const InductionDescriptor &IndDesc; public: VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, - const InductionDescriptor &IndDesc) - : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start), IV(IV), - Trunc(nullptr), IndDesc(IndDesc) { - addOperand(Step); + VPValue *VF, const InductionDescriptor &IndDesc, + DebugLoc DL) + : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start, + Step, IndDesc, DL), + Trunc(nullptr) { + addOperand(VF); } VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, - const InductionDescriptor &IndDesc, - TruncInst *Trunc) - : VPHeaderPHIRecipe(VPDef::VPWidenIntOrFpInductionSC, Trunc, Start), - IV(IV), Trunc(Trunc), IndDesc(IndDesc) { - addOperand(Step); + VPValue *VF, const InductionDescriptor &IndDesc, + TruncInst *Trunc, DebugLoc DL) + : VPWidenInductionRecipe(VPDef::VPWidenIntOrFpInductionSC, IV, Start, + Step, IndDesc, DL), + Trunc(Trunc) { + addOperand(VF); } ~VPWidenIntOrFpInductionRecipe() override = default; VPWidenIntOrFpInductionRecipe *clone() override { - return new VPWidenIntOrFpInductionRecipe(IV, getStartValue(), - getStepValue(), IndDesc, Trunc); + return new VPWidenIntOrFpInductionRecipe( + getPHINode(), getStartValue(), getStepValue(), getVFValue(), + getInductionDescriptor(), Trunc, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC) @@ -1805,34 +2187,20 @@ public: VPSlotTracker &SlotTracker) const override; #endif - VPValue *getBackedgeValue() override { - // TODO: All operands of base recipe must exist and be at same index in - // derived recipe. - llvm_unreachable( - "VPWidenIntOrFpInductionRecipe generates its own backedge value"); - } + VPValue *getVFValue() { return getOperand(2); } + const VPValue *getVFValue() const { return getOperand(2); } - VPRecipeBase &getBackedgeRecipe() override { - // TODO: All operands of base recipe must exist and be at same index in - // derived recipe. - llvm_unreachable( - "VPWidenIntOrFpInductionRecipe generates its own backedge value"); + VPValue *getSplatVFValue() { + // If the recipe has been unrolled (5 operands), return the VPValue for the + // induction increment. + return getNumOperands() == 5 ? getOperand(3) : nullptr; } - /// Returns the step value of the induction. - VPValue *getStepValue() { return getOperand(1); } - const VPValue *getStepValue() const { return getOperand(1); } - /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { return Trunc; } const TruncInst *getTruncInst() const { return Trunc; } - PHINode *getPHINode() { return IV; } - - /// Returns the induction descriptor for the recipe. - const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } - /// Returns true if the induction is canonical, i.e. starting at 0 and /// incremented by UF * VF (= the original IV is incremented by 1) and has the /// same type as the canonical induction. @@ -1840,13 +2208,19 @@ public: /// Returns the scalar type of the induction. Type *getScalarType() const { return Trunc ?
Trunc->getType() : getPHINode()->getType(); } -}; -class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe { - const InductionDescriptor &IndDesc; + /// Returns the VPValue representing the value of this induction at + /// the last unrolled part, if it exists. Returns itself if unrolling did not + /// take place. + VPValue *getLastUnrolledPartOperand() { + return getNumOperands() == 5 ? getOperand(4) : this; + } +}; +class VPWidenPointerInductionRecipe : public VPWidenInductionRecipe, + public VPUnrollPartAccessor<3> { bool IsScalarAfterVectorization; public: @@ -1854,20 +2228,17 @@ public: /// Start. VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - bool IsScalarAfterVectorization) - : VPHeaderPHIRecipe(VPDef::VPWidenPointerInductionSC, Phi), - IndDesc(IndDesc), - IsScalarAfterVectorization(IsScalarAfterVectorization) { - addOperand(Start); - addOperand(Step); - } + bool IsScalarAfterVectorization, DebugLoc DL) + : VPWidenInductionRecipe(VPDef::VPWidenPointerInductionSC, Phi, Start, + Step, IndDesc, DL), + IsScalarAfterVectorization(IsScalarAfterVectorization) {} ~VPWidenPointerInductionRecipe() override = default; VPWidenPointerInductionRecipe *clone() override { return new VPWidenPointerInductionRecipe( cast<PHINode>(getUnderlyingInstr()), getOperand(0), getOperand(1), - IndDesc, IsScalarAfterVectorization); + getInductionDescriptor(), IsScalarAfterVectorization, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC) @@ -1878,8 +2249,51 @@ public: /// Returns true if only scalar values will be generated. bool onlyScalarsGenerated(bool IsScalable); - /// Returns the induction descriptor for the recipe. - const InductionDescriptor &getInductionDescriptor() const { return IndDesc; } + /// Returns the VPValue representing the value of this induction at + /// the first unrolled part, if it exists. Returns itself if unrolling did not + /// take place. + VPValue *getFirstUnrolledPartOperand() { + return getUnrollPart(*this) == 0 ? this : getOperand(2); + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + +/// Recipe to generate a scalar PHI. Used to generate code for recipes that +/// produce scalar header phis, including VPCanonicalIVPHIRecipe and +/// VPEVLBasedIVPHIRecipe. +class VPScalarPHIRecipe : public VPHeaderPHIRecipe { + std::string Name; + +public: + VPScalarPHIRecipe(VPValue *Start, VPValue *BackedgeValue, DebugLoc DL, + StringRef Name) + : VPHeaderPHIRecipe(VPDef::VPScalarPHISC, nullptr, Start, DL), + Name(Name.str()) { + addOperand(BackedgeValue); + } + + ~VPScalarPHIRecipe() override = default; + + VPScalarPHIRecipe *clone() override { + llvm_unreachable("cloning not implemented yet"); + } + + VP_CLASSOF_IMPL(VPDef::VPScalarPHISC) + + /// Generate the phi/select nodes. + void execute(VPTransformState &State) override; + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. @@ -1896,9 +2310,10 @@ class VPWidenPHIRecipe : public VPSingleDefRecipe { SmallVector<VPBasicBlock *, 2> IncomingBlocks; public: - /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start. 
- VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr) - : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi) { + /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start and + /// debug location \p DL. + VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr, DebugLoc DL = {}) + : VPSingleDefRecipe(VPDef::VPWidenPHISC, ArrayRef<VPValue *>(), Phi, DL) { if (Start) addOperand(Start); } @@ -1953,6 +2368,10 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { void execute(VPTransformState &State) override; + /// Return the cost of this first-order recurrence phi recipe. + InstructionCost computeCost(ElementCount VF, + VPCostContext &Ctx) const override; + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, @@ -1963,7 +2382,8 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe { /// A recipe for handling reduction phis. The start value is the first operand /// of the recipe and the incoming value from the backedge is the second /// operand. -class VPReductionPHIRecipe : public VPHeaderPHIRecipe { +class VPReductionPHIRecipe : public VPHeaderPHIRecipe, + public VPUnrollPartAccessor<2> { /// Descriptor for the reduction. const RecurrenceDescriptor &RdxDesc; @@ -1973,23 +2393,28 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe { /// The phi is part of an ordered reduction. Requires IsInLoop to be true. bool IsOrdered; + /// When expanding the reduction PHI, the plan's VF element count is divided + /// by this factor to form the reduction phi's VF. + unsigned VFScaleFactor = 1; + public: /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p /// RdxDesc. VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc, VPValue &Start, bool IsInLoop = false, - bool IsOrdered = false) + bool IsOrdered = false, unsigned VFScaleFactor = 1) : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start), - RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) { + RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered), + VFScaleFactor(VFScaleFactor) { assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); } ~VPReductionPHIRecipe() override = default; VPReductionPHIRecipe *clone() override { - auto *R = - new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc, - *getOperand(0), IsInLoop, IsOrdered); + auto *R = new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), + RdxDesc, *getOperand(0), IsInLoop, + IsOrdered, VFScaleFactor); R->addOperand(getBackedgeValue()); return R; } @@ -2020,17 +2445,66 @@ public: bool isInLoop() const { return IsInLoop; } }; +/// A recipe for forming partial reductions. In the loop, an accumulator and +/// vector operand are added together and passed to the next iteration as the +/// next accumulator. After the loop body, the accumulator is reduced to a +/// scalar value. 
+class VPPartialReductionRecipe : public VPSingleDefRecipe {
+  unsigned Opcode;
+
+public:
+  VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
+                           VPValue *Op1)
+      : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1,
+                                 ReductionInst) {}
+  VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
+                           Instruction *ReductionInst = nullptr)
+      : VPSingleDefRecipe(VPDef::VPPartialReductionSC,
+                          ArrayRef<VPValue *>({Op0, Op1}), ReductionInst),
+        Opcode(Opcode) {
+    [[maybe_unused]] auto *AccumulatorRecipe =
+        getOperand(1)->getDefiningRecipe();
+    assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
+            isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
+           "Unexpected operand order for partial reduction recipe");
+  }
+  ~VPPartialReductionRecipe() override = default;
+
+  VPPartialReductionRecipe *clone() override {
+    return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
+                                        getUnderlyingInstr());
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
+
+  /// Generate the reduction in the loop.
+  void execute(VPTransformState &State) override;
+
+  /// Return the cost of this VPPartialReductionRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
+  /// Get the binary op's opcode.
+  unsigned getOpcode() const { return Opcode; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
 /// instructions.
 class VPBlendRecipe : public VPSingleDefRecipe {
 public:
   /// The blend operation is a User of the incoming values and of their
-  /// respective masks, ordered [I0, I1, M1, I2, M2, ...]. Note that the first
-  /// incoming value does not have a mask associated.
+  /// respective masks, ordered [I0, M0, I1, M1, I2, M2, ...]. Note that M0 can
+  /// be omitted (implied by passing an odd number of operands) in which case
+  /// all other incoming values are merged into it.
   VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
       : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) {
-    assert((Operands.size() + 1) % 2 == 0 &&
-           "Expected an odd number of operands");
+    assert(Operands.size() > 0 && "Expected at least one operand!");
   }
 
   VPBlendRecipe *clone() override {
@@ -2040,24 +2514,34 @@ public:
 
   VP_CLASSOF_IMPL(VPDef::VPBlendSC)
 
-  /// Return the number of incoming values, taking into account that the first
-  /// incoming value has no mask.
-  unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; }
+  /// A normalized blend is one that has an odd number of operands, whereby the
+  /// first operand does not have an associated mask.
+  bool isNormalized() const { return getNumOperands() % 2; }
+
+  /// Return the number of incoming values, taking into account that, when
+  /// normalized, the first incoming value has no mask.
+  unsigned getNumIncomingValues() const {
+    return (getNumOperands() + isNormalized()) / 2;
+  }
 
   /// Return incoming value number \p Idx.
   VPValue *getIncomingValue(unsigned Idx) const {
-    return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - 1);
+    return Idx == 0 ? getOperand(0) : getOperand(Idx * 2 - isNormalized());
  }
 
   /// Return mask number \p Idx.
   VPValue *getMask(unsigned Idx) const {
-    assert(Idx > 0 && "First index has no mask associated.");
-    return getOperand(Idx * 2);
+    assert((Idx > 0 || !isNormalized()) && "First index has no mask!");
+    return Idx == 0 ? getOperand(1) : getOperand(Idx * 2 + !isNormalized());
   }
 
   /// Generate the phi/select nodes.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPBlendRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2143,6 +2627,10 @@ public:
   /// Generate the wide load or store, and shuffles.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPInterleaveRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2180,8 +2668,9 @@ class VPReductionRecipe : public VPSingleDefRecipe {
 protected:
   VPReductionRecipe(const unsigned char SC, const RecurrenceDescriptor &R,
                     Instruction *I, ArrayRef<VPValue *> Operands,
-                    VPValue *CondOp, bool IsOrdered)
-      : VPSingleDefRecipe(SC, Operands, I), RdxDesc(R), IsOrdered(IsOrdered) {
+                    VPValue *CondOp, bool IsOrdered, DebugLoc DL)
+      : VPSingleDefRecipe(SC, Operands, I, DL), RdxDesc(R),
+        IsOrdered(IsOrdered) {
     if (CondOp) {
       IsConditional = true;
       addOperand(CondOp);
@@ -2191,16 +2680,17 @@ protected:
 public:
   VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I,
                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
-                    bool IsOrdered)
+                    bool IsOrdered, DebugLoc DL = {})
       : VPReductionRecipe(VPDef::VPReductionSC, R, I,
                           ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
-                          IsOrdered) {}
+                          IsOrdered, DL) {}
 
   ~VPReductionRecipe() override = default;
 
   VPReductionRecipe *clone() override {
     return new VPReductionRecipe(RdxDesc, getUnderlyingInstr(), getChainOp(),
-                                 getVecOp(), getCondOp(), IsOrdered);
+                                 getVecOp(), getCondOp(), IsOrdered,
+                                 getDebugLoc());
   }
 
   static inline bool classof(const VPRecipeBase *R) {
@@ -2213,9 +2703,13 @@ public:
     return R && classof(R);
   }
 
-  /// Generate the reduction in the loop
+  /// Generate the reduction in the loop.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of VPReductionRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2246,12 +2740,12 @@ public:
 /// The Operands are {ChainOp, VecOp, EVL, [Condition]}.
 class VPReductionEVLRecipe : public VPReductionRecipe {
 public:
-  VPReductionEVLRecipe(VPReductionRecipe *R, VPValue *EVL, VPValue *CondOp)
+  VPReductionEVLRecipe(VPReductionRecipe &R, VPValue &EVL, VPValue *CondOp)
       : VPReductionRecipe(
-            VPDef::VPReductionEVLSC, R->getRecurrenceDescriptor(),
-            cast_or_null<Instruction>(R->getUnderlyingValue()),
-            ArrayRef<VPValue *>({R->getChainOp(), R->getVecOp(), EVL}), CondOp,
-            R->isOrdered()) {}
+            VPDef::VPReductionEVLSC, R.getRecurrenceDescriptor(),
+            cast_or_null<Instruction>(R.getUnderlyingValue()),
+            ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
+            R.isOrdered(), R.getDebugLoc()) {}
 
   ~VPReductionEVLRecipe() override = default;
 
@@ -2319,6 +2813,10 @@ public:
   /// the \p State.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPReplicateRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2376,6 +2874,10 @@ public:
   /// conditional branch.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPBranchOnMaskRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2413,19 +2915,27 @@ class VPPredInstPHIRecipe : public VPSingleDefRecipe {
 public:
   /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs phi
   /// nodes after merging back from a Branch-on-Mask.
-  VPPredInstPHIRecipe(VPValue *PredV)
-      : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV) {}
+  VPPredInstPHIRecipe(VPValue *PredV, DebugLoc DL)
+      : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV, DL) {}
 
   ~VPPredInstPHIRecipe() override = default;
 
   VPPredInstPHIRecipe *clone() override {
-    return new VPPredInstPHIRecipe(getOperand(0));
+    return new VPPredInstPHIRecipe(getOperand(0), getDebugLoc());
   }
 
   VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC)
 
-  /// Generates phi nodes for live-outs as needed to retain SSA form.
+  /// Generates phi nodes for live-outs (from a replicate region) as needed to
+  /// retain SSA form.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPPredInstPHIRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override {
+    // TODO: Compute accurate cost after retiring the legacy cost model.
+    return 0;
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2513,6 +3023,10 @@ public:
     llvm_unreachable("VPWidenMemoryRecipe should not be instantiated.");
   }
 
+  /// Return the cost of this VPWidenMemoryRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
   Instruction &getIngredient() const { return Ingredient; }
 };
 
@@ -2558,10 +3072,10 @@ struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue {
 /// using the address to load from, the explicit vector length and an optional
 /// mask.
 struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
-  VPWidenLoadEVLRecipe(VPWidenLoadRecipe *L, VPValue *EVL, VPValue *Mask)
-      : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L->getIngredient(),
-                            {L->getAddr(), EVL}, L->isConsecutive(),
-                            L->isReverse(), L->getDebugLoc()),
+  VPWidenLoadEVLRecipe(VPWidenLoadRecipe &L, VPValue &EVL, VPValue *Mask)
+      : VPWidenMemoryRecipe(VPDef::VPWidenLoadEVLSC, L.getIngredient(),
+                            {L.getAddr(), &EVL}, L.isConsecutive(),
+                            L.isReverse(), L.getDebugLoc()),
         VPValue(this, &getIngredient()) {
     setMask(Mask);
   }
@@ -2574,6 +3088,10 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   /// Generate the wide load or gather.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPWidenLoadEVLRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2634,11 +3152,10 @@ struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
 /// using the value to store, the address to store to, the explicit vector
 /// length and an optional mask.
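[Editor's illustration, not part of the patch: a scalar C++ sketch of what an explicit-vector-length (EVL) store models; all names are hypothetical.]

#include <cstddef>
#include <vector>

// Only lanes below EVL are written, the optional mask filters them
// further, and lanes at or beyond EVL are left untouched in memory.
inline void evlStore(std::vector<int> &Mem, const std::vector<int> &Val,
                     const std::vector<bool> &Mask, std::size_t EVL) {
  for (std::size_t Lane = 0; Lane != EVL; ++Lane)
    if (Mask.empty() || Mask[Lane]) // an absent mask behaves as all-true
      Mem[Lane] = Val[Lane];
}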
struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
-  VPWidenStoreEVLRecipe(VPWidenStoreRecipe *S, VPValue *EVL, VPValue *Mask)
-      : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S->getIngredient(),
-                            {S->getAddr(), S->getStoredValue(), EVL},
-                            S->isConsecutive(), S->isReverse(),
-                            S->getDebugLoc()) {
+  VPWidenStoreEVLRecipe(VPWidenStoreRecipe &S, VPValue &EVL, VPValue *Mask)
+      : VPWidenMemoryRecipe(VPDef::VPWidenStoreEVLSC, S.getIngredient(),
+                            {S.getAddr(), S.getStoredValue(), &EVL},
+                            S.isConsecutive(), S.isReverse(), S.getDebugLoc()) {
     setMask(Mask);
   }
 
@@ -2653,6 +3170,10 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
   /// Generate the wide store or scatter.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPWidenStoreEVLRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override;
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2694,6 +3215,13 @@ public:
   /// Generate the SCEV expansion.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPExpandSCEVRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override {
+    // TODO: Compute accurate cost after retiring the legacy cost model.
+    return 0;
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2726,8 +3254,10 @@ public:
     return D->getVPDefID() == VPDef::VPCanonicalIVPHISC;
   }
 
-  /// Generate the canonical scalar induction phi of the vector loop.
-  void execute(VPTransformState &State) override;
+  void execute(VPTransformState &State) override {
+    llvm_unreachable(
+        "cannot execute this recipe, should be replaced by VPScalarPHIRecipe");
+  }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
@@ -2754,10 +3284,12 @@ public:
     return true;
   }
 
-  /// Check if the induction described by \p Kind, \p Start and \p Step is
-  /// canonical, i.e. has the same start and step (of 1) as the canonical IV.
-  bool isCanonical(InductionDescriptor::InductionKind Kind, VPValue *Start,
-                   VPValue *Step) const;
+  /// Return the cost of this VPCanonicalIVPHIRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override {
+    // For now, match the behavior of the legacy cost model.
+    return 0;
+  }
 };
 
 /// A recipe for generating the active lane mask for the vector loop that is
@@ -2773,7 +3305,10 @@ public:
   ~VPActiveLaneMaskPHIRecipe() override = default;
 
   VPActiveLaneMaskPHIRecipe *clone() override {
-    return new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
+    auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
+    if (getNumOperands() == 2)
+      R->addOperand(getOperand(1));
+    return R;
   }
 
   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
@@ -2813,9 +3348,17 @@ public:
     return D->getVPDefID() == VPDef::VPEVLBasedIVPHISC;
   }
 
-  /// Generate phi for handling IV based on EVL over iterations correctly.
-  /// TODO: investigate if it can share the code with VPCanonicalIVPHIRecipe.
-  void execute(VPTransformState &State) override;
+  void execute(VPTransformState &State) override {
+    llvm_unreachable(
+        "cannot execute this recipe, should be replaced by VPScalarPHIRecipe");
+  }
+
+  /// Return the cost of this VPEVLBasedIVPHIRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override {
+    // For now, match the behavior of the legacy cost model.
+    return 0;
+  }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
   bool onlyFirstLaneUsed(const VPValue *Op) const override {
@@ -2832,7 +3375,8 @@ public:
 };
 
 /// A recipe for widening the canonical induction variable of the vector loop.
-class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
+class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe,
+                                 public VPUnrollPartAccessor<1> {
 public:
   VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
       : VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {}
@@ -2851,6 +3395,13 @@ public:
   /// step = <VF*UF, VF*UF, ..., VF*UF>.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPWidenCanonicalIVRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override {
+    // TODO: Compute accurate cost after retiring the legacy cost model.
+    return 0;
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2868,19 +3419,23 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {
   /// for floating point inductions.
   const FPMathOperator *FPBinOp;
 
+  /// Name to use for the generated IR instruction for the derived IV.
+  std::string Name;
+
 public:
   VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
-                    VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step)
+                    VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
+                    const Twine &Name = "")
       : VPDerivedIVRecipe(
             IndDesc.getKind(),
            dyn_cast_or_null<FPMathOperator>(IndDesc.getInductionBinOp()),
-            Start, CanonicalIV, Step) {}
+            Start, CanonicalIV, Step, Name) {}
 
   VPDerivedIVRecipe(InductionDescriptor::InductionKind Kind,
                     const FPMathOperator *FPBinOp, VPValue *Start, VPValue *IV,
-                    VPValue *Step)
+                    VPValue *Step, const Twine &Name = "")
       : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, IV, Step}), Kind(Kind),
-        FPBinOp(FPBinOp) {}
+        FPBinOp(FPBinOp), Name(Name.str()) {}
 
   ~VPDerivedIVRecipe() override = default;
 
@@ -2895,6 +3450,13 @@ public:
   /// operand) + IV (2. operand) * StepValue (3. operand).
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPDerivedIVRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override {
+    // TODO: Compute accurate cost after retiring the legacy cost model.
+    return 0;
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2918,7 +3480,8 @@ public:
 
 /// A recipe for handling phi nodes of integer and floating-point inductions,
 /// producing their scalar values.
-class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
+class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
+                              public VPUnrollPartAccessor<2> {
   Instruction::BinaryOps InductionOpcode;
 
 public:
@@ -2949,6 +3512,13 @@ public:
   /// Generate the scalarized versions of the phi node as needed by their users.
   void execute(VPTransformState &State) override;
 
+  /// Return the cost of this VPScalarIVStepsRecipe.
+  InstructionCost computeCost(ElementCount VF,
+                              VPCostContext &Ctx) const override {
+    // TODO: Compute accurate cost after retiring the legacy cost model.
+    return 0;
+  }
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
@@ -2969,6 +3539,15 @@ public:
 /// holds a sequence of zero or more VPRecipes each representing a sequence of
 /// output IR instructions. All PHI-like recipes must come before any non-PHI recipes.
 class VPBasicBlock : public VPBlockBase {
+  friend class VPlan;
+
+  /// Use VPlan::createVPBasicBlock to create VPBasicBlocks.
+  VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
+      : VPBlockBase(VPBasicBlockSC, Name.str()) {
+    if (Recipe)
+      appendRecipe(Recipe);
+  }
+
 public:
   using RecipeListTy = iplist<VPRecipeBase>;
 
@@ -2980,12 +3559,6 @@ protected:
       : VPBlockBase(BlockSC, Name.str()) {}
 
 public:
-  VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
-      : VPBlockBase(VPBasicBlockSC, Name.str()) {
-    if (Recipe)
-      appendRecipe(Recipe);
-  }
-
   ~VPBasicBlock() override {
     while (!Recipes.empty())
       Recipes.pop_back();
@@ -3057,14 +3630,13 @@ public:
     return make_range(begin(), getFirstNonPhi());
   }
 
-  void dropAllReferences(VPValue *NewValue) override;
-
   /// Split current block at \p SplitAt by inserting a new block between the
   /// current block and its successors and moving all recipes starting at
   /// SplitAt to the new block. Returns the new block.
   VPBasicBlock *splitAt(iterator SplitAt);
 
   VPRegionBlock *getEnclosingLoopRegion();
+  const VPRegionBlock *getEnclosingLoopRegion() const;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print this VPBasicBlock to \p O, prefixing all lines with \p Indent. \p
@@ -3087,17 +3659,16 @@ public:
 
   /// Clone the current block and its recipes, without updating the operands of
   /// the cloned recipes.
-  VPBasicBlock *clone() override {
-    auto *NewBlock = new VPBasicBlock(getName());
-    for (VPRecipeBase &R : *this)
-      NewBlock->appendRecipe(R.clone());
-    return NewBlock;
-  }
+  VPBasicBlock *clone() override;
 
 protected:
   /// Execute the recipes in the IR basic block \p BB.
   void executeRecipes(VPTransformState *State, BasicBlock *BB);
 
+  /// Connect the VPBB's predecessors in the VPlan CFG to the IR basic block
+  /// generated for this VPBB.
+  void connectToPredecessors(VPTransformState::CFGState &CFG);
+
 private:
   /// Create an IR BasicBlock to hold the output instructions generated by this
   /// VPBasicBlock, and return it. Update the CFGState accordingly.
@@ -3110,14 +3681,17 @@ private:
 /// Note: At the moment, VPIRBasicBlock can only be used to wrap VPlan's
 /// preheader block.
 class VPIRBasicBlock : public VPBasicBlock {
+  friend class VPlan;
+
   BasicBlock *IRBB;
 
-public:
+  /// Use VPlan::createVPIRBasicBlock to create VPIRBasicBlocks.
   VPIRBasicBlock(BasicBlock *IRBB)
       : VPBasicBlock(VPIRBasicBlockSC,
                      (Twine("ir-bb<") + IRBB->getName() + Twine(">")).str()),
        IRBB(IRBB) {}
 
+public:
   ~VPIRBasicBlock() override {}
 
   static inline bool classof(const VPBlockBase *V) {
@@ -3128,12 +3702,7 @@ public:
   /// this VPBasicBlock, thereby "executing" the VPlan.
   void execute(VPTransformState *State) override;
 
-  VPIRBasicBlock *clone() override {
-    auto *NewBlock = new VPIRBasicBlock(IRBB);
-    for (VPRecipeBase &R : Recipes)
-      NewBlock->appendRecipe(R.clone());
-    return NewBlock;
-  }
+  VPIRBasicBlock *clone() override;
 
   BasicBlock *getIRBasicBlock() const { return IRBB; }
 };
@@ -3147,6 +3716,8 @@ public:
 /// candidate VF's. The actual replication takes place only once the desired VF
 /// and UF have been determined.
 class VPRegionBlock : public VPBlockBase {
+  friend class VPlan;
+
   /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
VPBlockBase *Entry; @@ -3158,7 +3729,7 @@ class VPRegionBlock : public VPBlockBase { /// instances of output IR corresponding to its VPBlockBases. bool IsReplicator; -public: + /// Use VPlan::createVPRegionBlock to create VPRegionBlocks. VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name = "", bool IsReplicator = false) : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting), @@ -3172,13 +3743,8 @@ public: : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr), IsReplicator(IsReplicator) {} - ~VPRegionBlock() override { - if (Entry) { - VPValue DummyValue; - Entry->dropAllReferences(&DummyValue); - deleteCFG(Entry); - } - } +public: + ~VPRegionBlock() override {} /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPBlockBase *V) { @@ -3226,8 +3792,6 @@ public: // Return the cost of this region. InstructionCost cost(ElementCount VF, VPCostContext &Ctx) override; - void dropAllReferences(VPValue *NewValue) override; - #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using @@ -3254,14 +3818,15 @@ class VPlan { friend class VPlanPrinter; friend class VPSlotTracker; - /// Hold the single entry to the Hierarchical CFG of the VPlan, i.e. the - /// preheader of the vector loop. - VPBasicBlock *Entry; - /// VPBasicBlock corresponding to the original preheader. Used to place /// VPExpandSCEV recipes for expressions used during skeleton creation and the /// rest of VPlan execution. - VPBasicBlock *Preheader; + /// When this VPlan is used for the epilogue vector loop, the entry will be + /// replaced by a new entry block created during skeleton creation. + VPBasicBlock *Entry; + + /// VPIRBasicBlock wrapping the header of the original scalar loop. + VPIRBasicBlock *ScalarHeader; /// Holds the VFs applicable to this VPlan. SmallSetVector<ElementCount, 2> VFs; @@ -3284,6 +3849,9 @@ class VPlan { /// Represents the vector trip count. VPValue VectorTripCount; + /// Represents the vectorization factor of the loop. + VPValue VF; + /// Represents the loop-invariant VF * UF of the vector loop region. VPValue VFxUF; @@ -3295,56 +3863,63 @@ class VPlan { /// definitions are VPValues that hold a pointer to their underlying IR. SmallVector<VPValue *, 16> VPLiveInsToFree; - /// Values used outside the plan. It contains live-outs that need fixing. Any - /// live-out that is fixed outside VPlan needs to be removed. The remaining - /// live-outs are fixed via VPLiveOut::fixPhi. - MapVector<PHINode *, VPLiveOut *> LiveOuts; - /// Mapping from SCEVs to the VPValues representing their expansions. /// NOTE: This mapping is temporary and will be removed once all users have /// been modeled in VPlan directly. DenseMap<const SCEV *, VPValue *> SCEVToExpansion; -public: - /// Construct a VPlan with original preheader \p Preheader, trip count \p TC - /// and \p Entry to the plan. At the moment, \p Preheader and \p Entry need to - /// be disconnected, as the bypass blocks between them are not yet modeled in - /// VPlan. - VPlan(VPBasicBlock *Preheader, VPValue *TC, VPBasicBlock *Entry) - : VPlan(Preheader, Entry) { - TripCount = TC; - } + /// Blocks allocated and owned by the VPlan. They will be deleted once the + /// VPlan is destroyed. + SmallVector<VPBlockBase *> CreatedBlocks; - /// Construct a VPlan with original preheader \p Preheader and \p Entry to - /// the plan. 
At the moment, \p Preheader and \p Entry need to be - /// disconnected, as the bypass blocks between them are not yet modeled in - /// VPlan. - VPlan(VPBasicBlock *Preheader, VPBasicBlock *Entry) - : Entry(Entry), Preheader(Preheader) { + /// Construct a VPlan with \p Entry to the plan and with \p ScalarHeader + /// wrapping the original header of the scalar loop. + VPlan(VPBasicBlock *Entry, VPIRBasicBlock *ScalarHeader) + : Entry(Entry), ScalarHeader(ScalarHeader) { Entry->setPlan(this); - Preheader->setPlan(this); - assert(Preheader->getNumSuccessors() == 0 && - Preheader->getNumPredecessors() == 0 && - "preheader must be disconnected"); + assert(ScalarHeader->getNumSuccessors() == 0 && + "scalar header must be a leaf node"); + } + +public: + /// Construct a VPlan for \p L. This will create VPIRBasicBlocks wrapping the + /// original preheader and scalar header of \p L, to be used as entry and + /// scalar header blocks of the new VPlan. + VPlan(Loop *L); + + /// Construct a VPlan with a new VPBasicBlock as entry, a VPIRBasicBlock + /// wrapping \p ScalarHeaderBB and a trip count of \p TC. + VPlan(BasicBlock *ScalarHeaderBB, VPValue *TC) { + setEntry(createVPBasicBlock("preheader")); + ScalarHeader = createVPIRBasicBlock(ScalarHeaderBB); + TripCount = TC; } ~VPlan(); + void setEntry(VPBasicBlock *VPBB) { + Entry = VPBB; + VPBB->setPlan(this); + } + /// Create initial VPlan, having an "entry" VPBasicBlock (wrapping - /// original scalar pre-header ) which contains SCEV expansions that need - /// to happen before the CFG is modified; a VPBasicBlock for the vector - /// pre-header, followed by a region for the vector loop, followed by the - /// middle VPBasicBlock. If a check is needed to guard executing the scalar - /// epilogue loop, it will be added to the middle block, together with - /// VPBasicBlocks for the scalar preheader and exit blocks. - static VPlanPtr createInitialVPlan(const SCEV *TripCount, - ScalarEvolution &PSE, + /// original scalar pre-header) which contains SCEV expansions that need + /// to happen before the CFG is modified (when executing a VPlan for the + /// epilogue vector loop, the original entry needs to be replaced by a new + /// one); a VPBasicBlock for the vector pre-header, followed by a region for + /// the vector loop, followed by the middle VPBasicBlock. If a check is needed + /// to guard executing the scalar epilogue loop, it will be added to the + /// middle block, together with VPBasicBlocks for the scalar preheader and + /// exit blocks. \p InductionTy is the type of the canonical induction and + /// used for related values, like the trip count expression. + static VPlanPtr createInitialVPlan(Type *InductionTy, + PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop); /// Prepare the plan for execution, setting up the required live-in values. void prepareToExecute(Value *TripCount, Value *VectorTripCount, - Value *CanonicalIVStartValue, VPTransformState &State); + VPTransformState &State); /// Generate the IR code for this VPlan. void execute(VPTransformState *State); @@ -3355,6 +3930,43 @@ public: VPBasicBlock *getEntry() { return Entry; } const VPBasicBlock *getEntry() const { return Entry; } + /// Returns the preheader of the vector loop region, if one exists, or null + /// otherwise. + VPBasicBlock *getVectorPreheader() { + VPRegionBlock *VectorRegion = getVectorLoopRegion(); + return VectorRegion + ? 
cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()) + : nullptr; + } + + /// Returns the VPRegionBlock of the vector loop. + VPRegionBlock *getVectorLoopRegion(); + const VPRegionBlock *getVectorLoopRegion() const; + + /// Returns the 'middle' block of the plan, that is the block that selects + /// whether to execute the scalar tail loop or the exit block from the loop + /// latch. + const VPBasicBlock *getMiddleBlock() const { + return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front()); + } + VPBasicBlock *getMiddleBlock() { + return cast<VPBasicBlock>(getScalarPreheader()->getPredecessors().front()); + } + + /// Return the VPBasicBlock for the preheader of the scalar loop. + VPBasicBlock *getScalarPreheader() const { + return cast<VPBasicBlock>(getScalarHeader()->getSinglePredecessor()); + } + + /// Return the VPIRBasicBlock wrapping the header of the scalar loop. + VPIRBasicBlock *getScalarHeader() const { return ScalarHeader; } + + /// Return an iterator range over the VPIRBasicBlock wrapping the exit blocks + /// of the VPlan, that is leaf nodes except the scalar header. Defined in + /// VPlanHCFG, as the definition of the type needs access to the definitions + /// of VPBlockShallowTraversalWrapper. + auto getExitBlocks(); + /// The trip count of the original loop. VPValue *getTripCount() const { assert(TripCount && "trip count needs to be set before accessing it"); @@ -3379,6 +3991,9 @@ public: /// The vector trip count. VPValue &getVectorTripCount() { return VectorTripCount; } + /// Returns the VF of the vector loop region. + VPValue &getVF() { return VF; }; + /// Returns VF * UF of the vector loop region. VPValue &getVFxUF() { return VFxUF; } @@ -3405,6 +4020,11 @@ public: bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); } + unsigned getUF() const { + assert(UFs.size() == 1 && "Expected a single UF"); + return UFs[0]; + } + void setUF(unsigned UF) { assert(hasUF(UF) && "Cannot set the UF not already in plan"); UFs.clear(); @@ -3451,14 +4071,6 @@ public: LLVM_DUMP_METHOD void dump() const; #endif - /// Returns the VPRegionBlock of the vector loop. - VPRegionBlock *getVectorLoopRegion() { - return cast<VPRegionBlock>(getEntry()->getSingleSuccessor()); - } - const VPRegionBlock *getVectorLoopRegion() const { - return cast<VPRegionBlock>(getEntry()->getSingleSuccessor()); - } - /// Returns the canonical induction recipe of the vector loop. VPCanonicalIVPHIRecipe *getCanonicalIV() { VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock(); @@ -3469,17 +4081,6 @@ public: return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin()); } - void addLiveOut(PHINode *PN, VPValue *V); - - void removeLiveOut(PHINode *PN) { - delete LiveOuts[PN]; - LiveOuts.erase(PN); - } - - const MapVector<PHINode *, VPLiveOut *> &getLiveOuts() const { - return LiveOuts; - } - VPValue *getSCEVExpansion(const SCEV *S) const { return SCEVToExpansion.lookup(S); } @@ -3489,13 +4090,52 @@ public: SCEVToExpansion[S] = V; } - /// \return The block corresponding to the original preheader. - VPBasicBlock *getPreheader() { return Preheader; } - const VPBasicBlock *getPreheader() const { return Preheader; } - /// Clone the current VPlan, update all VPValues of the new VPlan and cloned /// recipes to refer to the clones, and return it. VPlan *duplicate(); + + /// Create a new VPBasicBlock with \p Name and containing \p Recipe if + /// present. The returned block is owned by the VPlan and deleted once the + /// VPlan is destroyed. 
+  VPBasicBlock *createVPBasicBlock(const Twine &Name,
+                                   VPRecipeBase *Recipe = nullptr) {
+    auto *VPB = new VPBasicBlock(Name, Recipe);
+    CreatedBlocks.push_back(VPB);
+    return VPB;
+  }
+
+  /// Create a new VPRegionBlock with \p Entry, \p Exiting and \p Name. If \p
+  /// IsReplicator is true, the region is a replicate region. The returned block
+  /// is owned by the VPlan and deleted once the VPlan is destroyed.
+  VPRegionBlock *createVPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
+                                     const std::string &Name = "",
+                                     bool IsReplicator = false) {
+    auto *VPB = new VPRegionBlock(Entry, Exiting, Name, IsReplicator);
+    CreatedBlocks.push_back(VPB);
+    return VPB;
+  }
+
+  /// Create a new VPRegionBlock with \p Name and entry and exiting blocks set
+  /// to nullptr. If \p IsReplicator is true, the region is a replicate region.
+  /// The returned block is owned by the VPlan and deleted once the VPlan is
+  /// destroyed.
+  VPRegionBlock *createVPRegionBlock(const std::string &Name = "",
+                                     bool IsReplicator = false) {
+    auto *VPB = new VPRegionBlock(Name, IsReplicator);
+    CreatedBlocks.push_back(VPB);
+    return VPB;
+  }
+
+  /// Create a VPIRBasicBlock wrapping \p IRBB, but do not create
+  /// VPIRInstructions wrapping the instructions in \p IRBB. The returned
+  /// block is owned by the VPlan and deleted once the VPlan is destroyed.
+  VPIRBasicBlock *createEmptyVPIRBasicBlock(BasicBlock *IRBB);
+
+  /// Create a VPIRBasicBlock from \p IRBB containing VPIRInstructions for all
+  /// instructions in \p IRBB, except its terminator which is managed by the
+  /// successors of the block in VPlan. The returned block is owned by the VPlan
+  /// and deleted once the VPlan is destroyed.
+  VPIRBasicBlock *createVPIRBasicBlock(BasicBlock *IRBB);
 };
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3567,93 +4207,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
 }
 #endif
 
-//===----------------------------------------------------------------------===//
-// VPlan Utilities
-//===----------------------------------------------------------------------===//
-
-/// Class that provides utilities for VPBlockBases in VPlan.
-class VPBlockUtils {
-public:
-  VPBlockUtils() = delete;
-
-  /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
-  /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
-  /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's
-  /// successors are moved from \p BlockPtr to \p NewBlock. \p NewBlock must
-  /// have neither successors nor predecessors.
-  static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
-    assert(NewBlock->getSuccessors().empty() &&
-           NewBlock->getPredecessors().empty() &&
-           "Can't insert new block with predecessors or successors.");
-    NewBlock->setParent(BlockPtr->getParent());
-    SmallVector<VPBlockBase *> Succs(BlockPtr->successors());
-    for (VPBlockBase *Succ : Succs) {
-      disconnectBlocks(BlockPtr, Succ);
-      connectBlocks(NewBlock, Succ);
-    }
-    connectBlocks(BlockPtr, NewBlock);
-  }
-
-  /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
-  /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
-  /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
-  /// parent to \p IfTrue and \p IfFalse. \p BlockPtr must have no successors
-  /// and \p IfTrue and \p IfFalse must have neither successors nor
-  /// predecessors.
-  static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
-                                   VPBlockBase *BlockPtr) {
-    assert(IfTrue->getSuccessors().empty() &&
-           "Can't insert IfTrue with successors.");
-    assert(IfFalse->getSuccessors().empty() &&
-           "Can't insert IfFalse with successors.");
-    BlockPtr->setTwoSuccessors(IfTrue, IfFalse);
-    IfTrue->setPredecessors({BlockPtr});
-    IfFalse->setPredecessors({BlockPtr});
-    IfTrue->setParent(BlockPtr->getParent());
-    IfFalse->setParent(BlockPtr->getParent());
-  }
-
-  /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
-  /// the successors of \p From and \p From to the predecessors of \p To. Both
-  /// VPBlockBases must have the same parent, which can be null. Both
-  /// VPBlockBases can be already connected to other VPBlockBases.
-  static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
-    assert((From->getParent() == To->getParent()) &&
-           "Can't connect two blocks with different parents");
-    assert(From->getNumSuccessors() < 2 &&
-           "Blocks can't have more than two successors.");
-    From->appendSuccessor(To);
-    To->appendPredecessor(From);
-  }
-
-  /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
-  /// from the successors of \p From and \p From from the predecessors of \p To.
-  static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
-    assert(To && "Successor to disconnect is null.");
-    From->removeSuccessor(To);
-    To->removePredecessor(From);
-  }
-
-  /// Return an iterator range over \p Range which only includes \p BlockTy
-  /// blocks. The accesses are cast to \p BlockTy.
-  template <typename BlockTy, typename T>
-  static auto blocksOnly(const T &Range) {
-    // Create BaseTy with correct const-ness based on BlockTy.
-    using BaseTy = std::conditional_t<std::is_const<BlockTy>::value,
-                                      const VPBlockBase, VPBlockBase>;
-
-    // We need to first create an iterator range over (const) BlockTy & instead
-    // of (const) BlockTy * for filter_range to work properly.
-    auto Mapped =
-        map_range(Range, [](BaseTy *Block) -> BaseTy & { return *Block; });
-    auto Filter = make_filter_range(
-        Mapped, [](BaseTy &Block) { return isa<BlockTy>(&Block); });
-    return map_range(Filter, [](BaseTy &Block) -> BlockTy * {
-      return cast<BlockTy>(&Block);
-    });
-  }
-};
-
 class VPInterleavedAccessInfo {
   DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
       InterleaveGroupMap;
@@ -3787,44 +4340,6 @@ public:
   /// Return true if all visited instructions can be combined.
   bool isCompletelySLP() const { return CompletelySLP; }
 };
-
-namespace vputils {
-
-/// Returns true if only the first lane of \p Def is used.
-bool onlyFirstLaneUsed(const VPValue *Def);
-
-/// Returns true if only the first part of \p Def is used.
-bool onlyFirstPartUsed(const VPValue *Def);
-
-/// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p
-/// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in
-/// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's
-/// pre-header already contains a recipe expanding \p Expr, return it. If not,
-/// create a new one.
-VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
-                                       ScalarEvolution &SE);
-
-/// Returns true if \p VPV is uniform after vectorization.
-inline bool isUniformAfterVectorization(VPValue *VPV) {
-  // A value defined outside the vector region must be uniform after
-  // vectorization inside a vector region.
-  if (VPV->isDefinedOutsideVectorRegions())
-    return true;
-  VPRecipeBase *Def = VPV->getDefiningRecipe();
-  assert(Def && "Must have definition for value defined inside vector region");
-  if (auto Rep = dyn_cast<VPReplicateRecipe>(Def))
-    return Rep->isUniform();
-  if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(Def))
-    return all_of(GEP->operands(), isUniformAfterVectorization);
-  if (auto *VPI = dyn_cast<VPInstruction>(Def))
-    return VPI->isSingleScalar() || VPI->isVectorToScalar();
-  return false;
-}
-
-/// Return true if \p V is a header mask in \p Plan.
-bool isHeaderMask(VPValue *V, VPlan &Plan);
-} // end namespace vputils
-
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
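[Editor's illustration, not part of the patch: a hedged usage sketch of the VPlan-owned block factories added above. It assumes only the declarations in this header; the block names and helper function are hypothetical.]

// Blocks are allocated through the plan and deleted when the plan is
// destroyed, so callers never free them manually.
inline void buildSkeletonSketch(llvm::VPlan &Plan) {
  llvm::VPBasicBlock *Body = Plan.createVPBasicBlock("vector.body");
  llvm::VPRegionBlock *LoopRegion =
      Plan.createVPRegionBlock(Body, Body, "vector loop");
  (void)LoopRegion; // ownership stays with Plan
}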
