diff options
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VPlan.h')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlan.h | 992 |
1 files changed, 690 insertions, 302 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 2cce127cd4ce..bdf09d15c27f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -40,6 +40,7 @@ #include "llvm/ADT/ilist_node.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/Support/InstructionCost.h" #include <algorithm> #include <cassert> #include <cstddef> @@ -60,6 +61,11 @@ class VPRegionBlock; class VPlan; class VPlanSlp; +/// Returns a calculation for the total number of elements for a given \p VF. +/// For fixed width vectors this value is a constant, whereas for scalable +/// vectors it is an expression determined at runtime. +Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF); + /// A range of powers-of-2 vectorization factors with fixed start and /// adjustable end. The range includes start and excludes end, e.g.,: /// [1, 9) = {1, 2, 4, 8} @@ -89,169 +95,108 @@ using VPlanPtr = std::unique_ptr<VPlan>; /// vectorizer whereas the term "output IR" refers to code that is generated by /// the vectorizer. -/// VPIteration represents a single point in the iteration space of the output -/// (vectorized and/or unrolled) IR loop. -struct VPIteration { - /// in [0..UF) - unsigned Part; +/// VPLane provides a way to access lanes in both fixed width and scalable +/// vectors, where for the latter the lane index sometimes needs calculating +/// as a runtime expression. +class VPLane { +public: + /// Kind describes how to interpret Lane. + enum class Kind : uint8_t { + /// For First, Lane is the index into the first N elements of a + /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>. + First, + /// For ScalableLast, Lane is the offset from the start of the last + /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For + /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of + /// 1 corresponds to `((vscale - 1) * N) + 1`, etc. + ScalableLast + }; +private: /// in [0..VF) unsigned Lane; -}; - -/// This is a helper struct for maintaining vectorization state. It's used for -/// mapping values from the original loop to their corresponding values in -/// the new loop. Two mappings are maintained: one for vectorized values and -/// one for scalarized values. Vectorized values are represented with UF -/// vector values in the new loop, and scalarized values are represented with -/// UF x VF scalar values in the new loop. UF and VF are the unroll and -/// vectorization factors, respectively. -/// -/// Entries can be added to either map with setVectorValue and setScalarValue, -/// which assert that an entry was not already added before. If an entry is to -/// replace an existing one, call resetVectorValue and resetScalarValue. This is -/// currently needed to modify the mapped values during "fix-up" operations that -/// occur once the first phase of widening is complete. These operations include -/// type truncation and the second phase of recurrence widening. -/// -/// Entries from either map can be retrieved using the getVectorValue and -/// getScalarValue functions, which assert that the desired value exists. -struct VectorizerValueMap { - friend struct VPTransformState; - -private: - /// The unroll factor. Each entry in the vector map contains UF vector values. - unsigned UF; - - /// The vectorization factor. Each entry in the scalar map contains UF x VF - /// scalar values. - ElementCount VF; - /// The vector and scalar map storage. We use std::map and not DenseMap - /// because insertions to DenseMap invalidate its iterators. - using VectorParts = SmallVector<Value *, 2>; - using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>; - std::map<Value *, VectorParts> VectorMapStorage; - std::map<Value *, ScalarParts> ScalarMapStorage; + /// Indicates how the Lane should be interpreted, as described above. + Kind LaneKind; public: - /// Construct an empty map with the given unroll and vectorization factors. - VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {} + VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {} - /// \return True if the map has any vector entry for \p Key. - bool hasAnyVectorValue(Value *Key) const { - return VectorMapStorage.count(Key); - } + static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); } - /// \return True if the map has a vector entry for \p Key and \p Part. - bool hasVectorValue(Value *Key, unsigned Part) const { - assert(Part < UF && "Queried Vector Part is too large."); - if (!hasAnyVectorValue(Key)) - return false; - const VectorParts &Entry = VectorMapStorage.find(Key)->second; - assert(Entry.size() == UF && "VectorParts has wrong dimensions."); - return Entry[Part] != nullptr; + static VPLane getLastLaneForVF(const ElementCount &VF) { + unsigned LaneOffset = VF.getKnownMinValue() - 1; + Kind LaneKind; + if (VF.isScalable()) + // In this case 'LaneOffset' refers to the offset from the start of the + // last subvector with VF.getKnownMinValue() elements. + LaneKind = VPLane::Kind::ScalableLast; + else + LaneKind = VPLane::Kind::First; + return VPLane(LaneOffset, LaneKind); } - /// \return True if the map has any scalar entry for \p Key. - bool hasAnyScalarValue(Value *Key) const { - return ScalarMapStorage.count(Key); + /// Returns a compile-time known value for the lane index and asserts if the + /// lane can only be calculated at runtime. + unsigned getKnownLane() const { + assert(LaneKind == Kind::First); + return Lane; } - /// \return True if the map has a scalar entry for \p Key and \p Instance. - bool hasScalarValue(Value *Key, const VPIteration &Instance) const { - assert(Instance.Part < UF && "Queried Scalar Part is too large."); - assert(Instance.Lane < VF.getKnownMinValue() && - "Queried Scalar Lane is too large."); + /// Returns an expression describing the lane index that can be used at + /// runtime. + Value *getAsRuntimeExpr(IRBuilder<> &Builder, const ElementCount &VF) const; - if (!hasAnyScalarValue(Key)) - return false; - const ScalarParts &Entry = ScalarMapStorage.find(Key)->second; - assert(Entry.size() == UF && "ScalarParts has wrong dimensions."); - assert(Entry[Instance.Part].size() == VF.getKnownMinValue() && - "ScalarParts has wrong dimensions."); - return Entry[Instance.Part][Instance.Lane] != nullptr; - } - - /// Retrieve the existing vector value that corresponds to \p Key and - /// \p Part. - Value *getVectorValue(Value *Key, unsigned Part) { - assert(hasVectorValue(Key, Part) && "Getting non-existent value."); - return VectorMapStorage[Key][Part]; - } - - /// Retrieve the existing scalar value that corresponds to \p Key and - /// \p Instance. - Value *getScalarValue(Value *Key, const VPIteration &Instance) { - assert(hasScalarValue(Key, Instance) && "Getting non-existent value."); - return ScalarMapStorage[Key][Instance.Part][Instance.Lane]; - } - - /// Set a vector value associated with \p Key and \p Part. Assumes such a - /// value is not already set. If it is, use resetVectorValue() instead. - void setVectorValue(Value *Key, unsigned Part, Value *Vector) { - assert(!hasVectorValue(Key, Part) && "Vector value already set for part"); - if (!VectorMapStorage.count(Key)) { - VectorParts Entry(UF); - VectorMapStorage[Key] = Entry; - } - VectorMapStorage[Key][Part] = Vector; - } - - /// Set a scalar value associated with \p Key and \p Instance. Assumes such a - /// value is not already set. - void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) { - assert(!hasScalarValue(Key, Instance) && "Scalar value already set"); - if (!ScalarMapStorage.count(Key)) { - ScalarParts Entry(UF); - // TODO: Consider storing uniform values only per-part, as they occupy - // lane 0 only, keeping the other VF-1 redundant entries null. - for (unsigned Part = 0; Part < UF; ++Part) - Entry[Part].resize(VF.getKnownMinValue(), nullptr); - ScalarMapStorage[Key] = Entry; - } - ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; - } + /// Returns the Kind of lane offset. + Kind getKind() const { return LaneKind; } - /// Reset the vector value associated with \p Key for the given \p Part. - /// This function can be used to update values that have already been - /// vectorized. This is the case for "fix-up" operations including type - /// truncation and the second phase of recurrence vectorization. - void resetVectorValue(Value *Key, unsigned Part, Value *Vector) { - assert(hasVectorValue(Key, Part) && "Vector value not set for part"); - VectorMapStorage[Key][Part] = Vector; + /// Returns true if this is the first lane of the whole vector. + bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; } + + /// Maps the lane to a cache index based on \p VF. + unsigned mapToCacheIndex(const ElementCount &VF) const { + switch (LaneKind) { + case VPLane::Kind::ScalableLast: + assert(VF.isScalable() && Lane < VF.getKnownMinValue()); + return VF.getKnownMinValue() + Lane; + default: + assert(Lane < VF.getKnownMinValue()); + return Lane; + } } - /// Reset the scalar value associated with \p Key for \p Part and \p Lane. - /// This function can be used to update values that have already been - /// scalarized. This is the case for "fix-up" operations including scalar phi - /// nodes for scalarized and predicated instructions. - void resetScalarValue(Value *Key, const VPIteration &Instance, - Value *Scalar) { - assert(hasScalarValue(Key, Instance) && - "Scalar value not set for part and lane"); - ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar; + /// Returns the maxmimum number of lanes that we are able to consider + /// caching for \p VF. + static unsigned getNumCachedLanes(const ElementCount &VF) { + return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1); } }; -/// This class is used to enable the VPlan to invoke a method of ILV. This is -/// needed until the method is refactored out of ILV and becomes reusable. -struct VPCallback { - virtual ~VPCallback() {} - virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0; - virtual Value *getOrCreateScalarValue(Value *V, - const VPIteration &Instance) = 0; +/// VPIteration represents a single point in the iteration space of the output +/// (vectorized and/or unrolled) IR loop. +struct VPIteration { + /// in [0..UF) + unsigned Part; + + VPLane Lane; + + VPIteration(unsigned Part, unsigned Lane, + VPLane::Kind Kind = VPLane::Kind::First) + : Part(Part), Lane(Lane, Kind) {} + + VPIteration(unsigned Part, const VPLane &Lane) : Part(Part), Lane(Lane) {} + + bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); } }; /// VPTransformState holds information passed down when "executing" a VPlan, /// needed for generating the output IR. struct VPTransformState { - VPTransformState(ElementCount VF, unsigned UF, Loop *OrigLoop, LoopInfo *LI, + VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI, DominatorTree *DT, IRBuilder<> &Builder, - VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV, - VPCallback &Callback) - : VF(VF), UF(UF), Instance(), OrigLoop(OrigLoop), LI(LI), DT(DT), - Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {} + InnerLoopVectorizer *ILV, VPlan *Plan) + : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), ILV(ILV), + Plan(Plan) {} /// The chosen Vectorization and Unroll Factors of the loop being vectorized. ElementCount VF; @@ -279,13 +224,7 @@ struct VPTransformState { /// method will delegate the call to ILV in such cases in order to provide /// callers a consistent API. /// \see set. - Value *get(VPValue *Def, unsigned Part) { - // If Values have been set for this Def return the one relevant for \p Part. - if (Data.PerPartOutput.count(Def)) - return Data.PerPartOutput[Def][Part]; - // Def is managed by ILV: bring the Values from ValueMap. - return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part); - } + Value *get(VPValue *Def, unsigned Part); /// Get the generated Value for a given VPValue and given Part and Lane. Value *get(VPValue *Def, const VPIteration &Instance); @@ -296,13 +235,18 @@ struct VPTransformState { I->second[Part]; } + bool hasAnyVectorValue(VPValue *Def) const { + return Data.PerPartOutput.find(Def) != Data.PerPartOutput.end(); + } + bool hasScalarValue(VPValue *Def, VPIteration Instance) { auto I = Data.PerPartScalars.find(Def); if (I == Data.PerPartScalars.end()) return false; + unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); return Instance.Part < I->second.size() && - Instance.Lane < I->second[Instance.Part].size() && - I->second[Instance.Part][Instance.Lane]; + CacheIdx < I->second[Instance.Part].size() && + I->second[Instance.Part][CacheIdx]; } /// Set the generated Value for a given VPValue and a given Part. @@ -313,17 +257,39 @@ struct VPTransformState { } Data.PerPartOutput[Def][Part] = V; } - void set(VPValue *Def, Value *IRDef, Value *V, unsigned Part); + /// Reset an existing vector value for \p Def and a given \p Part. + void reset(VPValue *Def, Value *V, unsigned Part) { + auto Iter = Data.PerPartOutput.find(Def); + assert(Iter != Data.PerPartOutput.end() && + "need to overwrite existing value"); + Iter->second[Part] = V; + } + /// Set the generated scalar \p V for \p Def and the given \p Instance. void set(VPValue *Def, Value *V, const VPIteration &Instance) { auto Iter = Data.PerPartScalars.insert({Def, {}}); auto &PerPartVec = Iter.first->second; while (PerPartVec.size() <= Instance.Part) PerPartVec.emplace_back(); auto &Scalars = PerPartVec[Instance.Part]; - while (Scalars.size() <= Instance.Lane) + unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); + while (Scalars.size() <= CacheIdx) Scalars.push_back(nullptr); - Scalars[Instance.Lane] = V; + assert(!Scalars[CacheIdx] && "should overwrite existing value"); + Scalars[CacheIdx] = V; + } + + /// Reset an existing scalar value for \p Def and a given \p Instance. + void reset(VPValue *Def, Value *V, const VPIteration &Instance) { + auto Iter = Data.PerPartScalars.find(Def); + assert(Iter != Data.PerPartScalars.end() && + "need to overwrite existing value"); + assert(Instance.Part < Iter->second.size() && + "need to overwrite existing value"); + unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF); + assert(CacheIdx < Iter->second[Instance.Part].size() && + "need to overwrite existing value"); + Iter->second[Instance.Part][CacheIdx] = V; } /// Hold state information used when constructing the CFG of the output IR, @@ -340,6 +306,13 @@ struct VPTransformState { /// BasicBlock, used for placing the newly created BasicBlocks. BasicBlock *LastBB = nullptr; + /// The IR BasicBlock that is the preheader of the vector loop in the output + /// IR. + /// FIXME: The vector preheader should also be modeled in VPlan, so any code + /// that needs to be added to the preheader gets directly generated by + /// VPlan. There should be no need to manage a pointer to the IR BasicBlock. + BasicBlock *VectorPreHeader = nullptr; + /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case /// of replication, maps the BasicBlock of the last replica created. SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB; @@ -351,9 +324,6 @@ struct VPTransformState { CFGState() = default; } CFG; - /// Hold a pointer to the original loop. - Loop *OrigLoop; - /// Hold a pointer to LoopInfo to register new basic blocks in the loop. LoopInfo *LI; @@ -363,12 +333,6 @@ struct VPTransformState { /// Hold a reference to the IRBuilder used to generate output IR code. IRBuilder<> &Builder; - /// Hold a reference to the Value state information used when generating the - /// Values of the output IR. - VectorizerValueMap &ValueMap; - - /// Hold a reference to a mapping between VPValues in VPlan and original - /// Values they correspond to. VPValue2ValueTy VPValue2Value; /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF). @@ -380,7 +344,43 @@ struct VPTransformState { /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods. InnerLoopVectorizer *ILV; - VPCallback &Callback; + /// Pointer to the VPlan code is generated for. + VPlan *Plan; +}; + +/// VPUsers instance used by VPBlockBase to manage CondBit and the block +/// predicate. Currently VPBlockUsers are used in VPBlockBase for historical +/// reasons, but in the future the only VPUsers should either be recipes or +/// live-outs.VPBlockBase uses. +struct VPBlockUser : public VPUser { + VPBlockUser() : VPUser({}, VPUserID::Block) {} + + VPValue *getSingleOperandOrNull() { + if (getNumOperands() == 1) + return getOperand(0); + + return nullptr; + } + const VPValue *getSingleOperandOrNull() const { + if (getNumOperands() == 1) + return getOperand(0); + + return nullptr; + } + + void resetSingleOpUser(VPValue *NewVal) { + assert(getNumOperands() <= 1 && "Didn't expect more than one operand!"); + if (!NewVal) { + if (getNumOperands() == 1) + removeLastOperand(); + return; + } + + if (getNumOperands() == 1) + setOperand(0, NewVal); + else + addOperand(NewVal); + } }; /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. @@ -403,11 +403,15 @@ class VPBlockBase { /// List of successor blocks. SmallVector<VPBlockBase *, 1> Successors; - /// Successor selector, null for zero or single successor blocks. - VPValue *CondBit = nullptr; + /// Successor selector managed by a VPUser. For blocks with zero or one + /// successors, there is no operand. Otherwise there is exactly one operand + /// which is the branch condition. + VPBlockUser CondBitUser; - /// Current block predicate - null if the block does not need a predicate. - VPValue *Predicate = nullptr; + /// If the block is predicated, its predicate is stored as an operand of this + /// VPUser to maintain the def-use relations. Otherwise there is no operand + /// here. + VPBlockUser PredicateUser; /// VPlan containing the block. Can only be set on the entry block of the /// plan. @@ -553,17 +557,18 @@ public: } /// \return the condition bit selecting the successor. - VPValue *getCondBit() { return CondBit; } - - const VPValue *getCondBit() const { return CondBit; } - - void setCondBit(VPValue *CV) { CondBit = CV; } - - VPValue *getPredicate() { return Predicate; } - - const VPValue *getPredicate() const { return Predicate; } + VPValue *getCondBit(); + /// \return the condition bit selecting the successor. + const VPValue *getCondBit() const; + /// Set the condition bit selecting the successor. + void setCondBit(VPValue *CV); - void setPredicate(VPValue *Pred) { Predicate = Pred; } + /// \return the block's predicate. + VPValue *getPredicate(); + /// \return the block's predicate. + const VPValue *getPredicate() const; + /// Set the block's predicate. + void setPredicate(VPValue *Pred); /// Set a given VPBlockBase \p Successor as the single successor of this /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor. @@ -581,7 +586,7 @@ public: VPValue *Condition) { assert(Successors.empty() && "Setting two successors when others exist."); assert(Condition && "Setting two successors without condition!"); - CondBit = Condition; + setCondBit(Condition); appendSuccessor(IfTrue); appendSuccessor(IfFalse); } @@ -601,7 +606,7 @@ public: /// Remove all the successors of this block and set to null its condition bit void clearSuccessors() { Successors.clear(); - CondBit = nullptr; + setCondBit(nullptr); } /// The method which generates the output IR that correspond to this @@ -611,16 +616,6 @@ public: /// Delete all blocks reachable from a given VPBlockBase, inclusive. static void deleteCFG(VPBlockBase *Entry); - void printAsOperand(raw_ostream &OS, bool PrintType) const { - OS << getName(); - } - - void print(raw_ostream &OS) const { - // TODO: Only printing VPBB name for now since we only have dot printing - // support for VPInstructions/Recipes. - printAsOperand(OS, false); - } - /// Return true if it is legal to hoist instructions into this block. bool isLegalToHoistInto() { // There are currently no constraints that prevent an instruction to be @@ -631,6 +626,34 @@ public: /// Replace all operands of VPUsers in the block with \p NewValue and also /// replaces all uses of VPValues defined in the block with NewValue. virtual void dropAllReferences(VPValue *NewValue) = 0; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void printAsOperand(raw_ostream &OS, bool PrintType) const { + OS << getName(); + } + + /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines + /// with \p Indent. \p SlotTracker is used to print unnamed VPValue's using + /// consequtive numbers. + /// + /// Note that the numbering is applied to the whole VPlan, so printing + /// individual blocks is consistent with the whole VPlan printing. + virtual void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const = 0; + + /// Print plain-text dump of this VPlan to \p O. + void print(raw_ostream &O) const { + VPSlotTracker SlotTracker(getPlan()); + print(O, "", SlotTracker); + } + + /// Print the successors of this block to \p O, prefixing all lines with \p + /// Indent. + void printSuccessors(raw_ostream &O, const Twine &Indent) const; + + /// Dump this VPBlockBase to dbgs(). + LLVM_DUMP_METHOD void dump() const { print(dbgs()); } +#endif }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR @@ -639,16 +662,21 @@ public: /// VPRecipeBases that also inherit from VPValue must make sure to inherit from /// VPRecipeBase before VPValue. class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>, - public VPDef { + public VPDef, + public VPUser { friend VPBasicBlock; friend class VPBlockUtils; - /// Each VPRecipe belongs to a single VPBasicBlock. VPBasicBlock *Parent = nullptr; public: - VPRecipeBase(const unsigned char SC) : VPDef(SC) {} + VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands) + : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {} + + template <typename IterT> + VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands) + : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {} virtual ~VPRecipeBase() = default; /// \return the VPBasicBlock which this VPRecipe belongs to. @@ -685,17 +713,13 @@ public: /// \returns an iterator pointing to the element after the erased one iplist<VPRecipeBase>::iterator eraseFromParent(); - /// Returns a pointer to a VPUser, if the recipe inherits from VPUser or - /// nullptr otherwise. - VPUser *toVPUser(); - /// Returns the underlying instruction, if the recipe is a VPValue or nullptr /// otherwise. Instruction *getUnderlyingInstr() { - return cast<Instruction>(getVPValue()->getUnderlyingValue()); + return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()); } const Instruction *getUnderlyingInstr() const { - return cast<Instruction>(getVPValue()->getUnderlyingValue()); + return cast<Instruction>(getVPSingleValue()->getUnderlyingValue()); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -703,6 +727,29 @@ public: // All VPDefs are also VPRecipeBases. return true; } + + static inline bool classof(const VPUser *U) { + return U->getVPUserID() == VPUser::VPUserID::Recipe; + } + + /// Returns true if the recipe may have side-effects. + bool mayHaveSideEffects() const; + + /// Returns true for PHI-like recipes. + bool isPhi() const { + return getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC; + } + + /// Returns true if the recipe may read from memory. + bool mayReadFromMemory() const; + + /// Returns true if the recipe may write to memory. + bool mayWriteToMemory() const; + + /// Returns true if the recipe may read from or write to memory. + bool mayReadOrWriteMemory() const { + return mayReadFromMemory() || mayWriteToMemory(); + } }; inline bool VPUser::classof(const VPDef *Def) { @@ -723,13 +770,16 @@ inline bool VPUser::classof(const VPDef *Def) { /// While as any Recipe it may generate a sequence of IR instructions when /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. -class VPInstruction : public VPRecipeBase, public VPUser, public VPValue { +class VPInstruction : public VPRecipeBase, public VPValue { friend class VPlanSlp; public: /// VPlan opcodes, extending LLVM IR with idiomatics instructions. enum { - Not = Instruction::OtherOpsEnd + 1, + FirstOrderRecurrenceSplice = + Instruction::OtherOpsEnd + 1, // Combines the incoming and previous + // values of a first-order recurrence. + Not, ICmpULE, SLPLoad, SLPStore, @@ -749,14 +799,14 @@ protected: public: VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands) - : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser(Operands), + : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands), VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {} VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands) - : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser({}), + : VPRecipeBase(VPRecipeBase::VPInstructionSC, {}), VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) { for (auto *I : Operands) - addOperand(I->getVPValue()); + addOperand(I->getVPSingleValue()); } VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands) @@ -784,12 +834,14 @@ public: /// provided. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the VPInstruction to \p O. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; /// Print the VPInstruction to dbgs() (for debugging). - void dump() const; + LLVM_DUMP_METHOD void dump() const; +#endif /// Return true if this instruction may modify memory. bool mayWriteToMemory() const { @@ -823,12 +875,12 @@ public: /// VPWidenRecipe is a recipe for producing a copy of vector type its /// ingredient. This recipe covers most of the traditional vectorization cases /// where each ingredient transforms into a vectorized version of itself. -class VPWidenRecipe : public VPRecipeBase, public VPValue, public VPUser { +class VPWidenRecipe : public VPRecipeBase, public VPValue { public: template <typename IterT> VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands) - : VPRecipeBase(VPRecipeBase::VPWidenSC), - VPValue(VPValue::VPVWidenSC, &I, this), VPUser(Operands) {} + : VPRecipeBase(VPRecipeBase::VPWidenSC, Operands), + VPValue(VPValue::VPVWidenSC, &I, this) {} ~VPWidenRecipe() override = default; @@ -843,18 +895,20 @@ public: /// Produce widened copies of all Ingredients. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for widening Call instructions. -class VPWidenCallRecipe : public VPRecipeBase, public VPUser, public VPValue { +class VPWidenCallRecipe : public VPRecipeBase, public VPValue { public: template <typename IterT> VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments) - : VPRecipeBase(VPRecipeBase::VPWidenCallSC), VPUser(CallArguments), + : VPRecipeBase(VPRecipeBase::VPWidenCallSC, CallArguments), VPValue(VPValue::VPVWidenCallSC, &I, this) {} ~VPWidenCallRecipe() override = default; @@ -867,13 +921,15 @@ public: /// Produce a widened version of the call instruction. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for widening select instructions. -class VPWidenSelectRecipe : public VPRecipeBase, public VPUser, public VPValue { +class VPWidenSelectRecipe : public VPRecipeBase, public VPValue { /// Is the condition of the select loop invariant? bool InvariantCond; @@ -882,7 +938,7 @@ public: template <typename IterT> VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands, bool InvariantCond) - : VPRecipeBase(VPRecipeBase::VPWidenSelectSC), VPUser(Operands), + : VPRecipeBase(VPRecipeBase::VPWidenSelectSC, Operands), VPValue(VPValue::VPVWidenSelectSC, &I, this), InvariantCond(InvariantCond) {} @@ -896,29 +952,29 @@ public: /// Produce a widened version of the select instruction. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for handling GEP instructions. -class VPWidenGEPRecipe : public VPRecipeBase, - public VPUser, - public VPValue { +class VPWidenGEPRecipe : public VPRecipeBase, public VPValue { bool IsPtrLoopInvariant; SmallBitVector IsIndexLoopInvariant; public: template <typename IterT> VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands) - : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands), + : VPRecipeBase(VPRecipeBase::VPWidenGEPSC, Operands), VPValue(VPWidenGEPSC, GEP, this), IsIndexLoopInvariant(GEP->getNumIndices(), false) {} template <typename IterT> VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands, Loop *OrigLoop) - : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands), + : VPRecipeBase(VPRecipeBase::VPWidenGEPSC, Operands), VPValue(VPValue::VPVWidenGEPSC, GEP, this), IsIndexLoopInvariant(GEP->getNumIndices(), false) { IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand()); @@ -936,26 +992,29 @@ public: /// Generate the gep nodes. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their vector and scalar values. -class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPUser { +class VPWidenIntOrFpInductionRecipe : public VPRecipeBase { PHINode *IV; - TruncInst *Trunc; public: - VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, + VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, Instruction *Cast, TruncInst *Trunc = nullptr) - : VPRecipeBase(VPWidenIntOrFpInductionSC), VPUser({Start}), IV(IV), - Trunc(Trunc) { + : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV) { if (Trunc) new VPValue(Trunc, this); else new VPValue(IV, this); + + if (Cast) + new VPValue(Cast, this); } ~VPWidenIntOrFpInductionRecipe() override = default; @@ -968,59 +1027,199 @@ public: /// needed by their users. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif /// Returns the start value of the induction. VPValue *getStartValue() { return getOperand(0); } + + /// Returns the cast VPValue, if one is attached, or nullptr otherwise. + VPValue *getCastValue() { + if (getNumDefinedValues() != 2) + return nullptr; + return getVPValue(1); + } + + /// Returns the first defined value as TruncInst, if it is one or nullptr + /// otherwise. + TruncInst *getTruncInst() { + return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue()); + } + const TruncInst *getTruncInst() const { + return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue()); + } }; -/// A recipe for handling all phi nodes except for integer and FP inductions. -/// For reduction PHIs, RdxDesc must point to the corresponding recurrence -/// descriptor and the start value is the first operand of the recipe. -class VPWidenPHIRecipe : public VPRecipeBase, public VPUser { - PHINode *Phi; +/// A recipe for handling first order recurrences and pointer inductions. For +/// first-order recurrences, the start value is the first operand of the recipe +/// and the incoming value from the backedge is the second operand. It also +/// serves as base class for VPReductionPHIRecipe. In the VPlan native path, all +/// incoming VPValues & VPBasicBlock pairs are managed in the recipe directly. +class VPWidenPHIRecipe : public VPRecipeBase, public VPValue { + /// List of incoming blocks. Only used in the VPlan native path. + SmallVector<VPBasicBlock *, 2> IncomingBlocks; - /// Descriptor for a reduction PHI. - RecurrenceDescriptor *RdxDesc = nullptr; +protected: + VPWidenPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi, + VPValue *Start = nullptr) + : VPRecipeBase(VPDefID, {}), VPValue(VPVID, Phi, this) { + if (Start) + addOperand(Start); + } public: - /// Create a new VPWidenPHIRecipe for the reduction \p Phi described by \p - /// RdxDesc. - VPWidenPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, VPValue &Start) - : VPWidenPHIRecipe(Phi) { - this->RdxDesc = &RdxDesc; + /// Create a VPWidenPHIRecipe for \p Phi + VPWidenPHIRecipe(PHINode *Phi) + : VPWidenPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) {} + + /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start. + VPWidenPHIRecipe(PHINode *Phi, VPValue &Start) : VPWidenPHIRecipe(Phi) { addOperand(&Start); } - /// Create a VPWidenPHIRecipe for \p Phi - VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) { - new VPValue(Phi, this); - } ~VPWidenPHIRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPWidenPHISC; + static inline bool classof(const VPRecipeBase *B) { + return B->getVPDefID() == VPRecipeBase::VPWidenPHISC || + B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC || + B->getVPDefID() == VPRecipeBase::VPReductionPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVWidenPHISC || + V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC || + V->getVPValueID() == VPValue::VPVReductionPHISC; } /// Generate the phi/select nodes. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif - /// Returns the start value of the phi, if it is a reduction. + /// Returns the start value of the phi, if it is a reduction or first-order + /// recurrence. VPValue *getStartValue() { return getNumOperands() == 0 ? nullptr : getOperand(0); } + + /// Returns the incoming value from the loop backedge, if it is a reduction or + /// first-order recurrence. + VPValue *getBackedgeValue() { + return getOperand(1); + } + + /// Returns the backedge value as a recipe. The backedge value is guaranteed + /// to be a recipe. + VPRecipeBase *getBackedgeRecipe() { + return cast<VPRecipeBase>(getBackedgeValue()->getDef()); + } + + /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi. + void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) { + addOperand(IncomingV); + IncomingBlocks.push_back(IncomingBlock); + } + + /// Returns the \p I th incoming VPValue. + VPValue *getIncomingValue(unsigned I) { return getOperand(I); } + + /// Returns the \p I th incoming VPBasicBlock. + VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; } +}; + +/// A recipe for handling first-order recurrence phis. The start value is the +/// first operand of the recipe and the incoming value from the backedge is the +/// second operand. +struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe { + VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start) + : VPWidenPHIRecipe(VPVFirstOrderRecurrencePHISC, + VPFirstOrderRecurrencePHISC, Phi, &Start) {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC; + } + static inline bool classof(const VPWidenPHIRecipe *D) { + return D->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC; + } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + +/// A recipe for handling reduction phis. The start value is the first operand +/// of the recipe and the incoming value from the backedge is the second +/// operand. +class VPReductionPHIRecipe : public VPWidenPHIRecipe { + /// Descriptor for the reduction. + RecurrenceDescriptor &RdxDesc; + + /// The phi is part of an in-loop reduction. + bool IsInLoop; + + /// The phi is part of an ordered reduction. Requires IsInLoop to be true. + bool IsOrdered; + +public: + /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p + /// RdxDesc. + VPReductionPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, + VPValue &Start, bool IsInLoop = false, + bool IsOrdered = false) + : VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start), + RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) { + assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop"); + } + + ~VPReductionPHIRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPRecipeBase::VPReductionPHISC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVReductionPHISC; + } + static inline bool classof(const VPWidenPHIRecipe *R) { + return R->getVPDefID() == VPRecipeBase::VPReductionPHISC; + } + + /// Generate the phi/select nodes. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + RecurrenceDescriptor &getRecurrenceDescriptor() { return RdxDesc; } + + /// Returns true, if the phi is part of an ordered reduction. + bool isOrdered() const { return IsOrdered; } + + /// Returns true, if the phi is part of an in-loop reduction. + bool isInLoop() const { return IsInLoop; } }; /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. -class VPBlendRecipe : public VPRecipeBase, public VPUser { +class VPBlendRecipe : public VPRecipeBase, public VPValue { PHINode *Phi; public: @@ -1028,8 +1227,8 @@ public: /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value /// might be incoming with a full mask for which there is no VPValue. VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands) - : VPRecipeBase(VPBlendSC), VPUser(Operands), Phi(Phi) { - new VPValue(Phi, this); + : VPRecipeBase(VPBlendSC, Operands), + VPValue(VPValue::VPVBlendSC, Phi, this), Phi(Phi) { assert(Operands.size() > 0 && ((Operands.size() == 1) || (Operands.size() % 2 == 0)) && "Expected either a single incoming value or a positive even number " @@ -1054,16 +1253,18 @@ public: /// Generate the phi/select nodes. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// VPInterleaveRecipe is a recipe for transforming an interleave group of load /// or stores into one wide load/store and shuffles. The first operand of a /// VPInterleave recipe is the address, followed by the stored values, followed /// by an optional mask. -class VPInterleaveRecipe : public VPRecipeBase, public VPUser { +class VPInterleaveRecipe : public VPRecipeBase { const InterleaveGroup<Instruction> *IG; bool HasMask = false; @@ -1071,7 +1272,7 @@ class VPInterleaveRecipe : public VPRecipeBase, public VPUser { public: VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr, ArrayRef<VPValue *> StoredValues, VPValue *Mask) - : VPRecipeBase(VPInterleaveSC), VPUser(Addr), IG(IG) { + : VPRecipeBase(VPInterleaveSC, {Addr}), IG(IG) { for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) { if (I->getType()->isVoidTy()) @@ -1117,9 +1318,11 @@ public: /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; } }; @@ -1127,21 +1330,18 @@ public: /// A recipe to represent inloop reduction operations, performing a reduction on /// a vector operand into a scalar value, and adding the result to a chain. /// The Operands are {ChainOp, VecOp, [Condition]}. -class VPReductionRecipe : public VPRecipeBase, public VPUser, public VPValue { +class VPReductionRecipe : public VPRecipeBase, public VPValue { /// The recurrence decriptor for the reduction in question. RecurrenceDescriptor *RdxDesc; - /// Fast math flags to use for the resulting reduction operation. - bool NoNaN; /// Pointer to the TTI, needed to create the target reduction const TargetTransformInfo *TTI; public: VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp, - VPValue *VecOp, VPValue *CondOp, bool NoNaN, + VPValue *VecOp, VPValue *CondOp, const TargetTransformInfo *TTI) - : VPRecipeBase(VPRecipeBase::VPReductionSC), VPUser({ChainOp, VecOp}), - VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), NoNaN(NoNaN), - TTI(TTI) { + : VPRecipeBase(VPRecipeBase::VPReductionSC, {ChainOp, VecOp}), + VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), TTI(TTI) { if (CondOp) addOperand(CondOp); } @@ -1153,16 +1353,14 @@ public: return V->getVPValueID() == VPValue::VPVReductionSC; } - static inline bool classof(const VPDef *D) { - return D->getVPDefID() == VPRecipeBase::VPReductionSC; - } - /// Generate the reduction in the loop void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif /// The VPValue of the scalar Chain being accumulated. VPValue *getChainOp() const { return getOperand(0); } @@ -1178,7 +1376,7 @@ public: /// copies of the original scalar type, one per lane, instead of producing a /// single copy of widened type for all lanes. If the instruction is known to be /// uniform only one copy, per lane zero, will be generated. -class VPReplicateRecipe : public VPRecipeBase, public VPUser, public VPValue { +class VPReplicateRecipe : public VPRecipeBase, public VPValue { /// Indicator if only a single replica per lane is needed. bool IsUniform; @@ -1192,9 +1390,8 @@ public: template <typename IterT> VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands, bool IsUniform, bool IsPredicated = false) - : VPRecipeBase(VPReplicateSC), VPUser(Operands), - VPValue(VPVReplicateSC, I, this), IsUniform(IsUniform), - IsPredicated(IsPredicated) { + : VPRecipeBase(VPReplicateSC, Operands), VPValue(VPVReplicateSC, I, this), + IsUniform(IsUniform), IsPredicated(IsPredicated) { // Retain the previous behavior of predicateInstructions(), where an // insert-element of a predicated instruction got hoisted into the // predicated basic block iff it was its only user. This is achieved by @@ -1221,17 +1418,24 @@ public: void setAlsoPack(bool Pack) { AlsoPack = Pack; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif bool isUniform() const { return IsUniform; } + + bool isPacked() const { return AlsoPack; } + + bool isPredicated() const { return IsPredicated; } }; /// A recipe for generating conditional branches on the bits of a mask. -class VPBranchOnMaskRecipe : public VPRecipeBase, public VPUser { +class VPBranchOnMaskRecipe : public VPRecipeBase { public: - VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) { + VPBranchOnMaskRecipe(VPValue *BlockInMask) + : VPRecipeBase(VPBranchOnMaskSC, {}) { if (BlockInMask) // nullptr means all-one mask. addOperand(BlockInMask); } @@ -1245,16 +1449,17 @@ public: /// conditional branch. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override { - O << " +\n" << Indent << "\"BRANCH-ON-MASK "; + O << Indent << "BRANCH-ON-MASK "; if (VPValue *Mask = getMask()) Mask->printAsOperand(O, SlotTracker); else O << " All-One"; - O << "\\l\""; } +#endif /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. @@ -1270,15 +1475,13 @@ public: /// order to merge values that are set under such a branch and feed their uses. /// The phi nodes can be scalar or vector depending on the users of the value. /// This recipe works in concert with VPBranchOnMaskRecipe. -class VPPredInstPHIRecipe : public VPRecipeBase, public VPUser { - +class VPPredInstPHIRecipe : public VPRecipeBase, public VPValue { public: /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi /// nodes after merging back from a Branch-on-Mask. VPPredInstPHIRecipe(VPValue *PredV) - : VPRecipeBase(VPPredInstPHISC), VPUser(PredV) { - new VPValue(PredV->getUnderlyingValue(), this); - } + : VPRecipeBase(VPPredInstPHISC, PredV), + VPValue(VPValue::VPVPredInstPHI, nullptr, this) {} ~VPPredInstPHIRecipe() override = default; /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -1289,9 +1492,11 @@ public: /// Generates phi nodes for live-outs as needed to retain SSA form. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// A Recipe for widening load/store operations. @@ -1300,8 +1505,7 @@ public: /// - For store: Address, stored value, optional mask /// TODO: We currently execute only per-part unless a specific instance is /// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase, - public VPUser { +class VPWidenMemoryInstructionRecipe : public VPRecipeBase { Instruction &Ingredient; void setMask(VPValue *Mask) { @@ -1316,15 +1520,14 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase, public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr}), - Ingredient(Load) { + : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load) { new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this); setMask(Mask); } VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredValue, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr, StoredValue}), + : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}), Ingredient(Store) { setMask(Mask); } @@ -1358,15 +1561,17 @@ public: /// Generate the wide load/store. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase { public: - VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) { + VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC, {}) { new VPValue(nullptr, this); } @@ -1382,14 +1587,16 @@ public: /// step = <VF*UF, VF*UF, ..., VF*UF>. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It /// holds a sequence of zero or more VPRecipe's each representing a sequence of -/// output IR instructions. +/// output IR instructions. All PHI-like recipes must come before any non-PHI recipes. class VPBasicBlock : public VPBlockBase { public: using RecipeListTy = iplist<VPRecipeBase>; @@ -1405,7 +1612,10 @@ public: appendRecipe(Recipe); } - ~VPBasicBlock() override { Recipes.clear(); } + ~VPBasicBlock() override { + while (!Recipes.empty()) + Recipes.pop_back(); + } /// Instruction iterators... using iterator = RecipeListTy::iterator; @@ -1464,8 +1674,29 @@ public: /// Return the position of the first non-phi node recipe in the block. iterator getFirstNonPhi(); + /// Returns an iterator range over the PHI-like recipes in the block. + iterator_range<iterator> phis() { + return make_range(begin(), getFirstNonPhi()); + } + void dropAllReferences(VPValue *NewValue) override; + /// Split current block at \p SplitAt by inserting a new block between the + /// current block and its successors and moving all recipes starting at + /// SplitAt to the new block. Returns the new block. + VPBasicBlock *splitAt(iterator SplitAt); + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print this VPBsicBlock to \p O, prefixing all lines with \p Indent. \p + /// SlotTracker is used to print unnamed VPValue's using consequtive numbers. + /// + /// Note that the numbering is applied to the whole VPlan, so printing + /// individual blocks is consistent with the whole VPlan printing. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; + using VPBlockBase::print; // Get the print(raw_stream &O) version. +#endif + private: /// Create an IR BasicBlock to hold the output instructions generated by this /// VPBasicBlock, and return it. Update the CFGState accordingly. @@ -1557,6 +1788,18 @@ public: void execute(struct VPTransformState *State) override; void dropAllReferences(VPValue *NewValue) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with + /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using + /// consequtive numbers. + /// + /// Note that the numbering is applied to the whole VPlan, so printing + /// individual regions is consistent with the whole VPlan printing. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; + using VPBlockBase::print; // Get the print(raw_stream &O) version. +#endif }; //===----------------------------------------------------------------------===// @@ -1681,6 +1924,136 @@ struct GraphTraits<Inverse<VPRegionBlock *>> } }; +/// Iterator to traverse all successors of a VPBlockBase node. This includes the +/// entry node of VPRegionBlocks. Exit blocks of a region implicitly have their +/// parent region's successors. This ensures all blocks in a region are visited +/// before any blocks in a successor region when doing a reverse post-order +// traversal of the graph. +template <typename BlockPtrTy> +class VPAllSuccessorsIterator + : public iterator_facade_base<VPAllSuccessorsIterator<BlockPtrTy>, + std::forward_iterator_tag, VPBlockBase> { + BlockPtrTy Block; + /// Index of the current successor. For VPBasicBlock nodes, this simply is the + /// index for the successor array. For VPRegionBlock, SuccessorIdx == 0 is + /// used for the region's entry block, and SuccessorIdx - 1 are the indices + /// for the successor array. + size_t SuccessorIdx; + + static BlockPtrTy getBlockWithSuccs(BlockPtrTy Current) { + while (Current && Current->getNumSuccessors() == 0) + Current = Current->getParent(); + return Current; + } + + /// Templated helper to dereference successor \p SuccIdx of \p Block. Used by + /// both the const and non-const operator* implementations. + template <typename T1> static T1 deref(T1 Block, unsigned SuccIdx) { + if (auto *R = dyn_cast<VPRegionBlock>(Block)) { + if (SuccIdx == 0) + return R->getEntry(); + SuccIdx--; + } + + // For exit blocks, use the next parent region with successors. + return getBlockWithSuccs(Block)->getSuccessors()[SuccIdx]; + } + +public: + VPAllSuccessorsIterator(BlockPtrTy Block, size_t Idx = 0) + : Block(Block), SuccessorIdx(Idx) {} + VPAllSuccessorsIterator(const VPAllSuccessorsIterator &Other) + : Block(Other.Block), SuccessorIdx(Other.SuccessorIdx) {} + + VPAllSuccessorsIterator &operator=(const VPAllSuccessorsIterator &R) { + Block = R.Block; + SuccessorIdx = R.SuccessorIdx; + return *this; + } + + static VPAllSuccessorsIterator end(BlockPtrTy Block) { + BlockPtrTy ParentWithSuccs = getBlockWithSuccs(Block); + unsigned NumSuccessors = ParentWithSuccs + ? ParentWithSuccs->getNumSuccessors() + : Block->getNumSuccessors(); + + if (auto *R = dyn_cast<VPRegionBlock>(Block)) + return {R, NumSuccessors + 1}; + return {Block, NumSuccessors}; + } + + bool operator==(const VPAllSuccessorsIterator &R) const { + return Block == R.Block && SuccessorIdx == R.SuccessorIdx; + } + + const VPBlockBase *operator*() const { return deref(Block, SuccessorIdx); } + + BlockPtrTy operator*() { return deref(Block, SuccessorIdx); } + + VPAllSuccessorsIterator &operator++() { + SuccessorIdx++; + return *this; + } + + VPAllSuccessorsIterator operator++(int X) { + VPAllSuccessorsIterator Orig = *this; + SuccessorIdx++; + return Orig; + } +}; + +/// Helper for GraphTraits specialization that traverses through VPRegionBlocks. +template <typename BlockTy> class VPBlockRecursiveTraversalWrapper { + BlockTy Entry; + +public: + VPBlockRecursiveTraversalWrapper(BlockTy Entry) : Entry(Entry) {} + BlockTy getEntry() { return Entry; } +}; + +/// GraphTraits specialization to recursively traverse VPBlockBase nodes, +/// including traversing through VPRegionBlocks. Exit blocks of a region +/// implicitly have their parent region's successors. This ensures all blocks in +/// a region are visited before any blocks in a successor region when doing a +/// reverse post-order traversal of the graph. +template <> +struct GraphTraits<VPBlockRecursiveTraversalWrapper<VPBlockBase *>> { + using NodeRef = VPBlockBase *; + using ChildIteratorType = VPAllSuccessorsIterator<VPBlockBase *>; + + static NodeRef + getEntryNode(VPBlockRecursiveTraversalWrapper<VPBlockBase *> N) { + return N.getEntry(); + } + + static inline ChildIteratorType child_begin(NodeRef N) { + return ChildIteratorType(N); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return ChildIteratorType::end(N); + } +}; + +template <> +struct GraphTraits<VPBlockRecursiveTraversalWrapper<const VPBlockBase *>> { + using NodeRef = const VPBlockBase *; + using ChildIteratorType = VPAllSuccessorsIterator<const VPBlockBase *>; + + static NodeRef + getEntryNode(VPBlockRecursiveTraversalWrapper<const VPBlockBase *> N) { + return N.getEntry(); + } + + static inline ChildIteratorType child_begin(NodeRef N) { + return ChildIteratorType(N); + } + + static inline ChildIteratorType child_end(NodeRef N) { + return ChildIteratorType::end(N); + } +}; + /// VPlan models a candidate for vectorization, encoding various decisions take /// to produce efficient output IR, including which branches, basic-blocks and /// output IR instructions to generate, and their cost. VPlan holds a @@ -1704,7 +2077,7 @@ class VPlan { // VPlan. External definitions must be immutable and hold a pointer to its // underlying IR that will be used to implement its structural comparison // (operators '==' and '<'). - SmallPtrSet<VPValue *, 16> VPExternalDefs; + SetVector<VPValue *> VPExternalDefs; /// Represents the backedge taken count of the original loop, for folding /// the tail. @@ -1721,9 +2094,6 @@ class VPlan { /// Holds the VPLoopInfo analysis for this VPlan. VPLoopInfo VPLInfo; - /// Holds the condition bit values built during VPInstruction to VPRecipe transformation. - SmallVector<VPValue *, 4> VPCBVs; - public: VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { if (Entry) @@ -1744,8 +2114,6 @@ public: delete BackedgeTakenCount; for (VPValue *Def : VPExternalDefs) delete Def; - for (VPValue *CBV : VPCBVs) - delete CBV; } /// Generate the IR code for this VPlan. @@ -1777,14 +2145,7 @@ public: /// Add \p VPVal to the pool of external definitions if it's not already /// in the pool. - void addExternalDef(VPValue *VPVal) { - VPExternalDefs.insert(VPVal); - } - - /// Add \p CBV to the vector of condition bit values. - void addCBV(VPValue *CBV) { - VPCBVs.push_back(CBV); - } + void addExternalDef(VPValue *VPVal) { VPExternalDefs.insert(VPVal); } void addVPValue(Value *V) { assert(V && "Trying to add a null Value to VPlan"); @@ -1819,8 +2180,16 @@ public: VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print this VPlan to \p O. + void print(raw_ostream &O) const; + + /// Print this VPlan in DOT format to \p O. + void printDOT(raw_ostream &O) const; + /// Dump the plan to stderr (for debugging). - void dump() const; + LLVM_DUMP_METHOD void dump() const; +#endif /// Returns a range mapping the values the range \p Operands to their /// corresponding VPValues. @@ -1840,14 +2209,10 @@ private: BasicBlock *LoopExitBB); }; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// VPlanPrinter prints a given VPlan to a given output stream. The printing is /// indented and follows the dot format. class VPlanPrinter { - friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan); - friend inline raw_ostream &operator<<(raw_ostream &OS, - const struct VPlanIngredient &I); - -private: raw_ostream &OS; const VPlan &Plan; unsigned Depth = 0; @@ -1858,9 +2223,6 @@ private: VPSlotTracker SlotTracker; - VPlanPrinter(raw_ostream &O, const VPlan &P) - : OS(O), Plan(P), SlotTracker(&P) {} - /// Handle indentation. void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } @@ -1890,27 +2252,31 @@ private: void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, const Twine &Label); - void dump(); +public: + VPlanPrinter(raw_ostream &O, const VPlan &P) + : OS(O), Plan(P), SlotTracker(&P) {} - static void printAsIngredient(raw_ostream &O, const Value *V); + LLVM_DUMP_METHOD void dump(); }; struct VPlanIngredient { const Value *V; VPlanIngredient(const Value *V) : V(V) {} + + void print(raw_ostream &O) const; }; inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { - VPlanPrinter::printAsIngredient(OS, I.V); + I.print(OS); return OS; } inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { - VPlanPrinter Printer(OS, Plan); - Printer.dump(); + Plan.print(OS); return OS; } +#endif //===----------------------------------------------------------------------===// // VPlan Utilities @@ -2011,6 +2377,26 @@ public: } return Count; } + + /// Return an iterator range over \p Range which only includes \p BlockTy + /// blocks. The accesses are casted to \p BlockTy. + template <typename BlockTy, typename T> + static auto blocksOnly(const T &Range) { + // Create BaseTy with correct const-ness based on BlockTy. + using BaseTy = + typename std::conditional<std::is_const<BlockTy>::value, + const VPBlockBase, VPBlockBase>::type; + + // We need to first create an iterator range over (const) BlocktTy & instead + // of (const) BlockTy * for filter_range to work properly. + auto Mapped = + map_range(Range, [](BaseTy *Block) -> BaseTy & { return *Block; }); + auto Filter = make_filter_range( + Mapped, [](BaseTy &Block) { return isa<BlockTy>(&Block); }); + return map_range(Filter, [](BaseTy &Block) -> BlockTy * { + return cast<BlockTy>(&Block); + }); + } }; class VPInterleavedAccessInfo { @@ -2126,8 +2512,10 @@ class VPlanSlp { SmallPtrSetImpl<VPValue *> &Candidates, VPInterleavedAccessInfo &IAI); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print bundle \p Values to dbgs(). void dumpBundle(ArrayRef<VPValue *> Values); +#endif public: VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {} |