1 files changed, 690 insertions, 302 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 2cce127cd4ce..bdf09d15c27f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -40,6 +40,7 @@
 #include "llvm/ADT/ilist_node.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/InstructionCost.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
@@ -60,6 +61,11 @@ class VPRegionBlock;
 class VPlan;
 class VPlanSlp;
 
+/// Returns a calculation for the total number of elements for a given \p VF.
+/// For fixed width vectors this value is a constant, whereas for scalable
+/// vectors it is an expression determined at runtime.
+Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF);
+
 /// A range of powers-of-2 vectorization factors with fixed start and
 /// adjustable end. The range includes start and excludes end, e.g.,:
 /// [1, 9) = {1, 2, 4, 8}
@@ -89,169 +95,108 @@ using VPlanPtr = std::unique_ptr<VPlan>;
 /// vectorizer whereas the term "output IR" refers to code that is generated by
 /// the vectorizer.
 
-/// VPIteration represents a single point in the iteration space of the output
-/// (vectorized and/or unrolled) IR loop.
-struct VPIteration {
-  /// in [0..UF)
-  unsigned Part;
+/// VPLane provides a way to access lanes in both fixed width and scalable
+/// vectors, where for the latter the lane index sometimes needs calculating
+/// as a runtime expression.
+class VPLane {
+public:
+  /// Kind describes how to interpret Lane.
+  enum class Kind : uint8_t {
+    /// For First, Lane is the index into the first N elements of a
+    /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
+    First,
+    /// For ScalableLast, Lane is the offset from the start of the last
+    /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
+    /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
+    /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
+    ScalableLast
+  };
 
+private:
   /// in [0..VF)
   unsigned Lane;
-};
-
-/// This is a helper struct for maintaining vectorization state. It's used for
-/// mapping values from the original loop to their corresponding values in
-/// the new loop. Two mappings are maintained: one for vectorized values and
-/// one for scalarized values. Vectorized values are represented with UF
-/// vector values in the new loop, and scalarized values are represented with
-/// UF x VF scalar values in the new loop. UF and VF are the unroll and
-/// vectorization factors, respectively.
-///
-/// Entries can be added to either map with setVectorValue and setScalarValue,
-/// which assert that an entry was not already added before. If an entry is to
-/// replace an existing one, call resetVectorValue and resetScalarValue. This is
-/// currently needed to modify the mapped values during "fix-up" operations that
-/// occur once the first phase of widening is complete. These operations include
-/// type truncation and the second phase of recurrence widening.
-///
-/// Entries from either map can be retrieved using the getVectorValue and
-/// getScalarValue functions, which assert that the desired value exists.
-struct VectorizerValueMap {
-  friend struct VPTransformState;
-
-private:
-  /// The unroll factor. Each entry in the vector map contains UF vector values.
-  unsigned UF;
-
-  /// The vectorization factor. Each entry in the scalar map contains UF x VF
-  /// scalar values.
-  ElementCount VF;
 
-  /// The vector and scalar map storage. We use std::map and not DenseMap
-  /// because insertions to DenseMap invalidate its iterators.
-  using VectorParts = SmallVector<Value *, 2>;
-  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
-  std::map<Value *, VectorParts> VectorMapStorage;
-  std::map<Value *, ScalarParts> ScalarMapStorage;
+  /// Indicates how the Lane should be interpreted, as described above.
+  Kind LaneKind;
 
 public:
-  /// Construct an empty map with the given unroll and vectorization factors.
-  VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {}
+  VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
 
-  /// \return True if the map has any vector entry for \p Key.
-  bool hasAnyVectorValue(Value *Key) const {
-    return VectorMapStorage.count(Key);
-  }
+  static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
 
-  /// \return True if the map has a vector entry for \p Key and \p Part.
-  bool hasVectorValue(Value *Key, unsigned Part) const {
-    assert(Part < UF && "Queried Vector Part is too large.");
-    if (!hasAnyVectorValue(Key))
-      return false;
-    const VectorParts &Entry = VectorMapStorage.find(Key)->second;
-    assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
-    return Entry[Part] != nullptr;
+  static VPLane getLastLaneForVF(const ElementCount &VF) {
+    unsigned LaneOffset = VF.getKnownMinValue() - 1;
+    Kind LaneKind;
+    if (VF.isScalable())
+      // In this case 'LaneOffset' refers to the offset from the start of the
+      // last subvector with VF.getKnownMinValue() elements.
+      LaneKind = VPLane::Kind::ScalableLast;
+    else
+      LaneKind = VPLane::Kind::First;
+    return VPLane(LaneOffset, LaneKind);
   }
 
-  /// \return True if the map has any scalar entry for \p Key.
-  bool hasAnyScalarValue(Value *Key) const {
-    return ScalarMapStorage.count(Key);
+  /// Returns a compile-time known value for the lane index and asserts if the
+  /// lane can only be calculated at runtime.
+  unsigned getKnownLane() const {
+    assert(LaneKind == Kind::First);
+    return Lane;
   }
 
-  /// \return True if the map has a scalar entry for \p Key and \p Instance.
-  bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
-    assert(Instance.Part < UF && "Queried Scalar Part is too large.");
-    assert(Instance.Lane < VF.getKnownMinValue() &&
-           "Queried Scalar Lane is too large.");
+  /// Returns an expression describing the lane index that can be used at
+  /// runtime.
+  Value *getAsRuntimeExpr(IRBuilder<> &Builder, const ElementCount &VF) const;
 
-    if (!hasAnyScalarValue(Key))
-      return false;
-    const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
-    assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
-    assert(Entry[Instance.Part].size() == VF.getKnownMinValue() &&
-           "ScalarParts has wrong dimensions.");
-    return Entry[Instance.Part][Instance.Lane] != nullptr;
-  }
-
-  /// Retrieve the existing vector value that corresponds to \p Key and
-  /// \p Part.
-  Value *getVectorValue(Value *Key, unsigned Part) {
-    assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
-    return VectorMapStorage[Key][Part];
-  }
-
-  /// Retrieve the existing scalar value that corresponds to \p Key and
-  /// \p Instance.
-  Value *getScalarValue(Value *Key, const VPIteration &Instance) {
-    assert(hasScalarValue(Key, Instance) && "Getting non-existent value.");
-    return ScalarMapStorage[Key][Instance.Part][Instance.Lane];
-  }
-
-  /// Set a vector value associated with \p Key and \p Part. Assumes such a
-  /// value is not already set. If it is, use resetVectorValue() instead.
-  void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
-    assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
-    if (!VectorMapStorage.count(Key)) {
-      VectorParts Entry(UF);
-      VectorMapStorage[Key] = Entry;
-    }
-    VectorMapStorage[Key][Part] = Vector;
-  }
-
-  /// Set a scalar value associated with \p Key and \p Instance. Assumes such a
-  /// value is not already set.
-  void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) {
-    assert(!hasScalarValue(Key, Instance) && "Scalar value already set");
-    if (!ScalarMapStorage.count(Key)) {
-      ScalarParts Entry(UF);
-      // TODO: Consider storing uniform values only per-part, as they occupy
-      //       lane 0 only, keeping the other VF-1 redundant entries null.
-      for (unsigned Part = 0; Part < UF; ++Part)
-        Entry[Part].resize(VF.getKnownMinValue(), nullptr);
-      ScalarMapStorage[Key] = Entry;
-    }
-    ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
-  }
+  /// Returns the Kind of lane offset.
+  Kind getKind() const { return LaneKind; }
 
-  /// Reset the vector value associated with \p Key for the given \p Part.
-  /// This function can be used to update values that have already been
-  /// vectorized. This is the case for "fix-up" operations including type
-  /// truncation and the second phase of recurrence vectorization.
-  void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
-    assert(hasVectorValue(Key, Part) && "Vector value not set for part");
-    VectorMapStorage[Key][Part] = Vector;
+  /// Returns true if this is the first lane of the whole vector.
+  bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
+
+  /// Maps the lane to a cache index based on \p VF.
+  unsigned mapToCacheIndex(const ElementCount &VF) const {
+    switch (LaneKind) {
+    case VPLane::Kind::ScalableLast:
+      assert(VF.isScalable() && Lane < VF.getKnownMinValue());
+      return VF.getKnownMinValue() + Lane;
+    default:
+      assert(Lane < VF.getKnownMinValue());
+      return Lane;
+    }
   }
 
-  /// Reset the scalar value associated with \p Key for \p Part and \p Lane.
-  /// This function can be used to update values that have already been
-  /// scalarized. This is the case for "fix-up" operations including scalar phi
-  /// nodes for scalarized and predicated instructions.
-  void resetScalarValue(Value *Key, const VPIteration &Instance,
-                        Value *Scalar) {
-    assert(hasScalarValue(Key, Instance) &&
-           "Scalar value not set for part and lane");
-    ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+  /// Returns the maxmimum number of lanes that we are able to consider
+  /// caching for \p VF.
+  static unsigned getNumCachedLanes(const ElementCount &VF) {
+    return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1);
   }
 };
 
-/// This class is used to enable the VPlan to invoke a method of ILV. This is
-/// needed until the method is refactored out of ILV and becomes reusable.
-struct VPCallback {
-  virtual ~VPCallback() {}
-  virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0;
-  virtual Value *getOrCreateScalarValue(Value *V,
-                                        const VPIteration &Instance) = 0;
+/// VPIteration represents a single point in the iteration space of the output
+/// (vectorized and/or unrolled) IR loop.
+struct VPIteration {
+  /// in [0..UF)
+  unsigned Part;
+
+  VPLane Lane;
+
+  VPIteration(unsigned Part, unsigned Lane,
+              VPLane::Kind Kind = VPLane::Kind::First)
+      : Part(Part), Lane(Lane, Kind) {}
+
+  VPIteration(unsigned Part, const VPLane &Lane) : Part(Part), Lane(Lane) {}
+
+  bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); }
 };
 
 /// VPTransformState holds information passed down when "executing" a VPlan,
 /// needed for generating the output IR.
 struct VPTransformState {
-  VPTransformState(ElementCount VF, unsigned UF, Loop *OrigLoop, LoopInfo *LI,
+  VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
                    DominatorTree *DT, IRBuilder<> &Builder,
-                   VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV,
-                   VPCallback &Callback)
-      : VF(VF), UF(UF), Instance(), OrigLoop(OrigLoop), LI(LI), DT(DT),
-        Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {}
+                   InnerLoopVectorizer *ILV, VPlan *Plan)
+      : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), ILV(ILV),
+        Plan(Plan) {}
 
   /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
   ElementCount VF;
@@ -279,13 +224,7 @@ struct VPTransformState {
   /// method will delegate the call to ILV in such cases in order to provide
   /// callers a consistent API.
   /// \see set.
-  Value *get(VPValue *Def, unsigned Part) {
-    // If Values have been set for this Def return the one relevant for \p Part.
-    if (Data.PerPartOutput.count(Def))
-      return Data.PerPartOutput[Def][Part];
-    // Def is managed by ILV: bring the Values from ValueMap.
-    return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part);
-  }
+  Value *get(VPValue *Def, unsigned Part);
 
   /// Get the generated Value for a given VPValue and given Part and Lane.
   Value *get(VPValue *Def, const VPIteration &Instance);
@@ -296,13 +235,18 @@ struct VPTransformState {
            I->second[Part];
   }
 
+  bool hasAnyVectorValue(VPValue *Def) const {
+    return Data.PerPartOutput.find(Def) != Data.PerPartOutput.end();
+  }
+
   bool hasScalarValue(VPValue *Def, VPIteration Instance) {
     auto I = Data.PerPartScalars.find(Def);
     if (I == Data.PerPartScalars.end())
       return false;
+    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
     return Instance.Part < I->second.size() &&
-           Instance.Lane < I->second[Instance.Part].size() &&
-           I->second[Instance.Part][Instance.Lane];
+           CacheIdx < I->second[Instance.Part].size() &&
+           I->second[Instance.Part][CacheIdx];
   }
 
   /// Set the generated Value for a given VPValue and a given Part.
@@ -313,17 +257,39 @@ struct VPTransformState {
     }
     Data.PerPartOutput[Def][Part] = V;
   }
-  void set(VPValue *Def, Value *IRDef, Value *V, unsigned Part);
+  /// Reset an existing vector value for \p Def and a given \p Part.
+  void reset(VPValue *Def, Value *V, unsigned Part) {
+    auto Iter = Data.PerPartOutput.find(Def);
+    assert(Iter != Data.PerPartOutput.end() &&
+           "need to overwrite existing value");
+    Iter->second[Part] = V;
+  }
 
+  /// Set the generated scalar \p V for \p Def and the given \p Instance.
   void set(VPValue *Def, Value *V, const VPIteration &Instance) {
     auto Iter = Data.PerPartScalars.insert({Def, {}});
     auto &PerPartVec = Iter.first->second;
     while (PerPartVec.size() <= Instance.Part)
       PerPartVec.emplace_back();
     auto &Scalars = PerPartVec[Instance.Part];
-    while (Scalars.size() <= Instance.Lane)
+    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
+    while (Scalars.size() <= CacheIdx)
       Scalars.push_back(nullptr);
-    Scalars[Instance.Lane] = V;
+    assert(!Scalars[CacheIdx] && "should overwrite existing value");
+    Scalars[CacheIdx] = V;
+  }
+
+  /// Reset an existing scalar value for \p Def and a given \p Instance.
+  void reset(VPValue *Def, Value *V, const VPIteration &Instance) {
+    auto Iter = Data.PerPartScalars.find(Def);
+    assert(Iter != Data.PerPartScalars.end() &&
+           "need to overwrite existing value");
+    assert(Instance.Part < Iter->second.size() &&
+           "need to overwrite existing value");
+    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
+    assert(CacheIdx < Iter->second[Instance.Part].size() &&
+           "need to overwrite existing value");
+    Iter->second[Instance.Part][CacheIdx] = V;
   }
 
   /// Hold state information used when constructing the CFG of the output IR,
@@ -340,6 +306,13 @@ struct VPTransformState {
     /// BasicBlock, used for placing the newly created BasicBlocks.
     BasicBlock *LastBB = nullptr;
 
+    /// The IR BasicBlock that is the preheader of the vector loop in the output
+    /// IR.
+    /// FIXME: The vector preheader should also be modeled in VPlan, so any code
+    /// that needs to be added to the preheader gets directly generated by
+    /// VPlan. There should be no need to manage a pointer to the IR BasicBlock.
+    BasicBlock *VectorPreHeader = nullptr;
+
     /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
     /// of replication, maps the BasicBlock of the last replica created.
     SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
@@ -351,9 +324,6 @@ struct VPTransformState {
     CFGState() = default;
   } CFG;
 
-  /// Hold a pointer to the original loop.
-  Loop *OrigLoop;
-
   /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
   LoopInfo *LI;
 
@@ -363,12 +333,6 @@ struct VPTransformState {
   /// Hold a reference to the IRBuilder used to generate output IR code.
   IRBuilder<> &Builder;
 
-  /// Hold a reference to the Value state information used when generating the
-  /// Values of the output IR.
-  VectorizerValueMap &ValueMap;
-
-  /// Hold a reference to a mapping between VPValues in VPlan and original
-  /// Values they correspond to.
   VPValue2ValueTy VPValue2Value;
 
   /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
@@ -380,7 +344,43 @@ struct VPTransformState {
   /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
   InnerLoopVectorizer *ILV;
 
-  VPCallback &Callback;
+  /// Pointer to the VPlan code is generated for.
+  VPlan *Plan;
+};
+
+/// VPUsers instance used by VPBlockBase to manage CondBit and the block
+/// predicate. Currently VPBlockUsers are used in VPBlockBase for historical
+/// reasons, but in the future the only VPUsers should either be recipes or
+/// live-outs.VPBlockBase uses.
+struct VPBlockUser : public VPUser {
+  VPBlockUser() : VPUser({}, VPUserID::Block) {}
+
+  VPValue *getSingleOperandOrNull() {
+    if (getNumOperands() == 1)
+      return getOperand(0);
+
+    return nullptr;
+  }
+  const VPValue *getSingleOperandOrNull() const {
+    if (getNumOperands() == 1)
+      return getOperand(0);
+
+    return nullptr;
+  }
+
+  void resetSingleOpUser(VPValue *NewVal) {
+    assert(getNumOperands() <= 1 && "Didn't expect more than one operand!");
+    if (!NewVal) {
+      if (getNumOperands() == 1)
+        removeLastOperand();
+      return;
+    }
+
+    if (getNumOperands() == 1)
+      setOperand(0, NewVal);
+    else
+      addOperand(NewVal);
+  }
 };
 
 /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
@@ -403,11 +403,15 @@ class VPBlockBase {
   /// List of successor blocks.
   SmallVector<VPBlockBase *, 1> Successors;
 
-  /// Successor selector, null for zero or single successor blocks.
-  VPValue *CondBit = nullptr;
+  /// Successor selector managed by a VPUser. For blocks with zero or one
+  /// successors, there is no operand. Otherwise there is exactly one operand
+  /// which is the branch condition.
+  VPBlockUser CondBitUser;
 
-  /// Current block predicate - null if the block does not need a predicate.
-  VPValue *Predicate = nullptr;
+  /// If the block is predicated, its predicate is stored as an operand of this
+  /// VPUser to maintain the def-use relations. Otherwise there is no operand
+  /// here.
+  VPBlockUser PredicateUser;
 
   /// VPlan containing the block. Can only be set on the entry block of the
   /// plan.
@@ -553,17 +557,18 @@ public:
   }
 
   /// \return the condition bit selecting the successor.
-  VPValue *getCondBit() { return CondBit; }
-
-  const VPValue *getCondBit() const { return CondBit; }
-
-  void setCondBit(VPValue *CV) { CondBit = CV; }
-
-  VPValue *getPredicate() { return Predicate; }
-
-  const VPValue *getPredicate() const { return Predicate; }
+  VPValue *getCondBit();
+  /// \return the condition bit selecting the successor.
+  const VPValue *getCondBit() const;
+  /// Set the condition bit selecting the successor.
+  void setCondBit(VPValue *CV);
 
-  void setPredicate(VPValue *Pred) { Predicate = Pred; }
+  /// \return the block's predicate.
+  VPValue *getPredicate();
+  /// \return the block's predicate.
+  const VPValue *getPredicate() const;
+  /// Set the block's predicate.
+  void setPredicate(VPValue *Pred);
 
   /// Set a given VPBlockBase \p Successor as the single successor of this
   /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
@@ -581,7 +586,7 @@ public:
                         VPValue *Condition) {
     assert(Successors.empty() && "Setting two successors when others exist.");
     assert(Condition && "Setting two successors without condition!");
-    CondBit = Condition;
+    setCondBit(Condition);
     appendSuccessor(IfTrue);
     appendSuccessor(IfFalse);
   }
@@ -601,7 +606,7 @@ public:
   /// Remove all the successors of this block and set to null its condition bit
   void clearSuccessors() {
     Successors.clear();
-    CondBit = nullptr;
+    setCondBit(nullptr);
   }
 
   /// The method which generates the output IR that correspond to this
@@ -611,16 +616,6 @@ public:
   /// Delete all blocks reachable from a given VPBlockBase, inclusive.
   static void deleteCFG(VPBlockBase *Entry);
 
-  void printAsOperand(raw_ostream &OS, bool PrintType) const {
-    OS << getName();
-  }
-
-  void print(raw_ostream &OS) const {
-    // TODO: Only printing VPBB name for now since we only have dot printing
-    // support for VPInstructions/Recipes.
-    printAsOperand(OS, false);
-  }
-
   /// Return true if it is legal to hoist instructions into this block.
   bool isLegalToHoistInto() {
     // There are currently no constraints that prevent an instruction to be
@@ -631,6 +626,34 @@ public:
   /// Replace all operands of VPUsers in the block with \p NewValue and also
   /// replaces all uses of VPValues defined in the block with NewValue.
   virtual void dropAllReferences(VPValue *NewValue) = 0;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void printAsOperand(raw_ostream &OS, bool PrintType) const {
+    OS << getName();
+  }
+
+  /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines
+  /// with \p Indent. \p SlotTracker is used to print unnamed VPValue's using
+  /// consequtive numbers.
+  ///
+  /// Note that the numbering is applied to the whole VPlan, so printing
+  /// individual blocks is consistent with the whole VPlan printing.
+  virtual void print(raw_ostream &O, const Twine &Indent,
+                     VPSlotTracker &SlotTracker) const = 0;
+
+  /// Print plain-text dump of this VPlan to \p O.
+  void print(raw_ostream &O) const {
+    VPSlotTracker SlotTracker(getPlan());
+    print(O, "", SlotTracker);
+  }
+
+  /// Print the successors of this block to \p O, prefixing all lines with \p
+  /// Indent.
+  void printSuccessors(raw_ostream &O, const Twine &Indent) const;
+
+  /// Dump this VPBlockBase to dbgs().
+  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
+#endif
 };
 
 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
@@ -639,16 +662,21 @@ public:
 /// VPRecipeBases that also inherit from VPValue must make sure to inherit from
 /// VPRecipeBase before VPValue.
 class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
-                     public VPDef {
+                     public VPDef,
+                     public VPUser {
   friend VPBasicBlock;
   friend class VPBlockUtils;
 
-
   /// Each VPRecipe belongs to a single VPBasicBlock.
   VPBasicBlock *Parent = nullptr;
 
 public:
-  VPRecipeBase(const unsigned char SC) : VPDef(SC) {}
+  VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands)
+      : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {}
+
+  template <typename IterT>
+  VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands)
+      : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {}
   virtual ~VPRecipeBase() = default;
 
   /// \return the VPBasicBlock which this VPRecipe belongs to.
@@ -685,17 +713,13 @@ public:
   /// \returns an iterator pointing to the element after the erased one
   iplist<VPRecipeBase>::iterator eraseFromParent();
 
-  /// Returns a pointer to a VPUser, if the recipe inherits from VPUser or
-  /// nullptr otherwise.
-  VPUser *toVPUser();
-
   /// Returns the underlying instruction, if the recipe is a VPValue or nullptr
   /// otherwise.
   Instruction *getUnderlyingInstr() {
-    return cast<Instruction>(getVPValue()->getUnderlyingValue());
+    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue());
   }
   const Instruction *getUnderlyingInstr() const {
-    return cast<Instruction>(getVPValue()->getUnderlyingValue());
+    return cast<Instruction>(getVPSingleValue()->getUnderlyingValue());
   }
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -703,6 +727,29 @@ public:
     // All VPDefs are also VPRecipeBases.
     return true;
   }
+
+  static inline bool classof(const VPUser *U) {
+    return U->getVPUserID() == VPUser::VPUserID::Recipe;
+  }
+
+  /// Returns true if the recipe may have side-effects.
+  bool mayHaveSideEffects() const;
+
+  /// Returns true for PHI-like recipes.
+  bool isPhi() const {
+    return getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC;
+  }
+
+  /// Returns true if the recipe may read from memory.
+  bool mayReadFromMemory() const;
+
+  /// Returns true if the recipe may write to memory.
+  bool mayWriteToMemory() const;
+
+  /// Returns true if the recipe may read from or write to memory.
+  bool mayReadOrWriteMemory() const {
+    return mayReadFromMemory() || mayWriteToMemory();
+  }
 };
 
 inline bool VPUser::classof(const VPDef *Def) {
@@ -723,13 +770,16 @@ inline bool VPUser::classof(const VPDef *Def) {
 /// While as any Recipe it may generate a sequence of IR instructions when
 /// executed, these instructions would always form a single-def expression as
 /// the VPInstruction is also a single def-use vertex.
-class VPInstruction : public VPRecipeBase, public VPUser, public VPValue {
+class VPInstruction : public VPRecipeBase, public VPValue {
   friend class VPlanSlp;
 
 public:
   /// VPlan opcodes, extending LLVM IR with idiomatics instructions.
   enum {
-    Not = Instruction::OtherOpsEnd + 1,
+    FirstOrderRecurrenceSplice =
+        Instruction::OtherOpsEnd + 1, // Combines the incoming and previous
+                                      // values of a first-order recurrence.
+    Not,
     ICmpULE,
     SLPLoad,
     SLPStore,
@@ -749,14 +799,14 @@ protected:
 
 public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
-      : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser(Operands),
+      : VPRecipeBase(VPRecipeBase::VPInstructionSC, Operands),
         VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {}
 
   VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands)
-      : VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser({}),
+      : VPRecipeBase(VPRecipeBase::VPInstructionSC, {}),
         VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {
     for (auto *I : Operands)
-      addOperand(I->getVPValue());
+      addOperand(I->getVPSingleValue());
   }
 
   VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
@@ -784,12 +834,14 @@ public:
   /// provided.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the VPInstruction to \p O.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 
   /// Print the VPInstruction to dbgs() (for debugging).
-  void dump() const;
+  LLVM_DUMP_METHOD void dump() const;
+#endif
 
   /// Return true if this instruction may modify memory.
   bool mayWriteToMemory() const {
@@ -823,12 +875,12 @@ public:
 /// VPWidenRecipe is a recipe for producing a copy of vector type its
 /// ingredient. This recipe covers most of the traditional vectorization cases
 /// where each ingredient transforms into a vectorized version of itself.
-class VPWidenRecipe : public VPRecipeBase, public VPValue, public VPUser {
+class VPWidenRecipe : public VPRecipeBase, public VPValue {
 public:
   template <typename IterT>
   VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
-      : VPRecipeBase(VPRecipeBase::VPWidenSC),
-        VPValue(VPValue::VPVWidenSC, &I, this), VPUser(Operands) {}
+      : VPRecipeBase(VPRecipeBase::VPWidenSC, Operands),
+        VPValue(VPValue::VPVWidenSC, &I, this) {}
 
   ~VPWidenRecipe() override = default;
 
@@ -843,18 +895,20 @@ public:
   /// Produce widened copies of all Ingredients.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 };
 
 /// A recipe for widening Call instructions.
-class VPWidenCallRecipe : public VPRecipeBase, public VPUser, public VPValue {
+class VPWidenCallRecipe : public VPRecipeBase, public VPValue {
 
 public:
   template <typename IterT>
   VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)
-      : VPRecipeBase(VPRecipeBase::VPWidenCallSC), VPUser(CallArguments),
+      : VPRecipeBase(VPRecipeBase::VPWidenCallSC, CallArguments),
         VPValue(VPValue::VPVWidenCallSC, &I, this) {}
 
   ~VPWidenCallRecipe() override = default;
@@ -867,13 +921,15 @@ public:
   /// Produce a widened version of the call instruction.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 };
 
 /// A recipe for widening select instructions.
-class VPWidenSelectRecipe : public VPRecipeBase, public VPUser, public VPValue {
+class VPWidenSelectRecipe : public VPRecipeBase, public VPValue {
 
   /// Is the condition of the select loop invariant?
   bool InvariantCond;
@@ -882,7 +938,7 @@ public:
   template <typename IterT>
   VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands,
                       bool InvariantCond)
-      : VPRecipeBase(VPRecipeBase::VPWidenSelectSC), VPUser(Operands),
+      : VPRecipeBase(VPRecipeBase::VPWidenSelectSC, Operands),
         VPValue(VPValue::VPVWidenSelectSC, &I, this),
         InvariantCond(InvariantCond) {}
 
@@ -896,29 +952,29 @@ public:
   /// Produce a widened version of the select instruction.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 };
 
 /// A recipe for handling GEP instructions.
-class VPWidenGEPRecipe : public VPRecipeBase,
-                         public VPUser,
-                         public VPValue {
+class VPWidenGEPRecipe : public VPRecipeBase, public VPValue {
   bool IsPtrLoopInvariant;
   SmallBitVector IsIndexLoopInvariant;
 
 public:
   template <typename IterT>
   VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands)
-      : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands),
+      : VPRecipeBase(VPRecipeBase::VPWidenGEPSC, Operands),
         VPValue(VPWidenGEPSC, GEP, this),
         IsIndexLoopInvariant(GEP->getNumIndices(), false) {}
 
   template <typename IterT>
   VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands,
                    Loop *OrigLoop)
-      : VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands),
+      : VPRecipeBase(VPRecipeBase::VPWidenGEPSC, Operands),
         VPValue(VPValue::VPVWidenGEPSC, GEP, this),
         IsIndexLoopInvariant(GEP->getNumIndices(), false) {
     IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
@@ -936,26 +992,29 @@ public:
   /// Generate the gep nodes.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 };
 
 /// A recipe for handling phi nodes of integer and floating-point inductions,
 /// producing their vector and scalar values.
-class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPUser {
+class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
   PHINode *IV;
-  TruncInst *Trunc;
 
 public:
-  VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
+  VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, Instruction *Cast,
                                 TruncInst *Trunc = nullptr)
-      : VPRecipeBase(VPWidenIntOrFpInductionSC), VPUser({Start}), IV(IV),
-        Trunc(Trunc) {
+      : VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), IV(IV) {
     if (Trunc)
       new VPValue(Trunc, this);
     else
       new VPValue(IV, this);
+
+    if (Cast)
+      new VPValue(Cast, this);
   }
   ~VPWidenIntOrFpInductionRecipe() override = default;
 
@@ -968,59 +1027,199 @@ public:
   /// needed by their users.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 
   /// Returns the start value of the induction.
   VPValue *getStartValue() { return getOperand(0); }
+
+  /// Returns the cast VPValue, if one is attached, or nullptr otherwise.
+  VPValue *getCastValue() {
+    if (getNumDefinedValues() != 2)
+      return nullptr;
+    return getVPValue(1);
+  }
+
+  /// Returns the first defined value as TruncInst, if it is one or nullptr
+  /// otherwise.
+  TruncInst *getTruncInst() {
+    return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue());
+  }
+  const TruncInst *getTruncInst() const {
+    return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue());
+  }
 };
 
-/// A recipe for handling all phi nodes except for integer and FP inductions.
-/// For reduction PHIs, RdxDesc must point to the corresponding recurrence
-/// descriptor and the start value is the first operand of the recipe.
-class VPWidenPHIRecipe : public VPRecipeBase, public VPUser {
-  PHINode *Phi;
+/// A recipe for handling first order recurrences and pointer inductions. For
+/// first-order recurrences, the start value is the first operand of the recipe
+/// and the incoming value from the backedge is the second operand. It also
+/// serves as base class for VPReductionPHIRecipe. In the VPlan native path, all
+/// incoming VPValues & VPBasicBlock pairs are managed in the recipe directly.
+class VPWidenPHIRecipe : public VPRecipeBase, public VPValue {
+  /// List of incoming blocks. Only used in the VPlan native path.
+  SmallVector<VPBasicBlock *, 2> IncomingBlocks;
 
-  /// Descriptor for a reduction PHI.
-  RecurrenceDescriptor *RdxDesc = nullptr;
+protected:
+  VPWidenPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi,
+                   VPValue *Start = nullptr)
+      : VPRecipeBase(VPDefID, {}), VPValue(VPVID, Phi, this) {
+    if (Start)
+      addOperand(Start);
+  }
 
 public:
-  /// Create a new VPWidenPHIRecipe for the reduction \p Phi described by \p
-  /// RdxDesc.
-  VPWidenPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, VPValue &Start)
-      : VPWidenPHIRecipe(Phi) {
-    this->RdxDesc = &RdxDesc;
+  /// Create a VPWidenPHIRecipe for \p Phi
+  VPWidenPHIRecipe(PHINode *Phi)
+      : VPWidenPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) {}
+
+  /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start.
+  VPWidenPHIRecipe(PHINode *Phi, VPValue &Start) : VPWidenPHIRecipe(Phi) {
     addOperand(&Start);
   }
 
-  /// Create a VPWidenPHIRecipe for \p Phi
-  VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {
-    new VPValue(Phi, this);
-  }
   ~VPWidenPHIRecipe() override = default;
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
-  static inline bool classof(const VPDef *D) {
-    return D->getVPDefID() == VPRecipeBase::VPWidenPHISC;
+  static inline bool classof(const VPRecipeBase *B) {
+    return B->getVPDefID() == VPRecipeBase::VPWidenPHISC ||
+           B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC ||
+           B->getVPDefID() == VPRecipeBase::VPReductionPHISC;
+  }
+  static inline bool classof(const VPValue *V) {
+    return V->getVPValueID() == VPValue::VPVWidenPHISC ||
+           V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC ||
+           V->getVPValueID() == VPValue::VPVReductionPHISC;
   }
 
   /// Generate the phi/select nodes.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 
-  /// Returns the start value of the phi, if it is a reduction.
+  /// Returns the start value of the phi, if it is a reduction or first-order
+  /// recurrence.
   VPValue *getStartValue() {
     return getNumOperands() == 0 ? nullptr : getOperand(0);
   }
+
+  /// Returns the incoming value from the loop backedge, if it is a reduction or
+  /// first-order recurrence.
+  VPValue *getBackedgeValue() {
+    return getOperand(1);
+  }
+
+  /// Returns the backedge value as a recipe. The backedge value is guaranteed
+  /// to be a recipe.
+  VPRecipeBase *getBackedgeRecipe() {
+    return cast<VPRecipeBase>(getBackedgeValue()->getDef());
+  }
+
+  /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi.
+  void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) {
+    addOperand(IncomingV);
+    IncomingBlocks.push_back(IncomingBlock);
+  }
+
+  /// Returns the \p I th incoming VPValue.
+  VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
+
+  /// Returns the \p I th incoming VPBasicBlock.
+  VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; }
+};
+
+/// A recipe for handling first-order recurrence phis. The start value is the
+/// first operand of the recipe and the incoming value from the backedge is the
+/// second operand.
+struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe {
+  VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start)
+      : VPWidenPHIRecipe(VPVFirstOrderRecurrencePHISC,
+                         VPFirstOrderRecurrencePHISC, Phi, &Start) {}
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *R) {
+    return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC;
+  }
+  static inline bool classof(const VPWidenPHIRecipe *D) {
+    return D->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC;
+  }
+  static inline bool classof(const VPValue *V) {
+    return V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC;
+  }
+
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
+/// A recipe for handling reduction phis. The start value is the first operand
+/// of the recipe and the incoming value from the backedge is the second
+/// operand.
+class VPReductionPHIRecipe : public VPWidenPHIRecipe {
+  /// Descriptor for the reduction.
+  RecurrenceDescriptor &RdxDesc;
+
+  /// The phi is part of an in-loop reduction.
+  bool IsInLoop;
+
+  /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
+  bool IsOrdered;
+
+public:
+  /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
+  /// RdxDesc.
+  VPReductionPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc,
+                       VPValue &Start, bool IsInLoop = false,
+                       bool IsOrdered = false)
+      : VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start),
+        RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
+    assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
+  }
+
+  ~VPReductionPHIRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *R) {
+    return R->getVPDefID() == VPRecipeBase::VPReductionPHISC;
+  }
+  static inline bool classof(const VPValue *V) {
+    return V->getVPValueID() == VPValue::VPVReductionPHISC;
+  }
+  static inline bool classof(const VPWidenPHIRecipe *R) {
+    return R->getVPDefID() == VPRecipeBase::VPReductionPHISC;
+  }
+
+  /// Generate the phi/select nodes.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  RecurrenceDescriptor &getRecurrenceDescriptor() { return RdxDesc; }
+
+  /// Returns true, if the phi is part of an ordered reduction.
+  bool isOrdered() const { return IsOrdered; }
+
+  /// Returns true, if the phi is part of an in-loop reduction.
+  bool isInLoop() const { return IsInLoop; }
 };
 
 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
 /// instructions.
-class VPBlendRecipe : public VPRecipeBase, public VPUser {
+class VPBlendRecipe : public VPRecipeBase, public VPValue {
   PHINode *Phi;
 
 public:
@@ -1028,8 +1227,8 @@ public:
   /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value
   /// might be incoming with a full mask for which there is no VPValue.
   VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
-      : VPRecipeBase(VPBlendSC), VPUser(Operands), Phi(Phi) {
-    new VPValue(Phi, this);
+      : VPRecipeBase(VPBlendSC, Operands),
+        VPValue(VPValue::VPVBlendSC, Phi, this), Phi(Phi) {
     assert(Operands.size() > 0 &&
            ((Operands.size() == 1) || (Operands.size() % 2 == 0)) &&
            "Expected either a single incoming value or a positive even number "
@@ -1054,16 +1253,18 @@ public:
   /// Generate the phi/select nodes.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 };
 
 /// VPInterleaveRecipe is a recipe for transforming an interleave group of load
 /// or stores into one wide load/store and shuffles. The first operand of a
 /// VPInterleave recipe is the address, followed by the stored values, followed
 /// by an optional mask.
-class VPInterleaveRecipe : public VPRecipeBase, public VPUser {
+class VPInterleaveRecipe : public VPRecipeBase {
   const InterleaveGroup<Instruction> *IG;
 
   bool HasMask = false;
@@ -1071,7 +1272,7 @@ class VPInterleaveRecipe : public VPRecipeBase, public VPUser {
 public:
   VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
                      ArrayRef<VPValue *> StoredValues, VPValue *Mask)
-      : VPRecipeBase(VPInterleaveSC), VPUser(Addr), IG(IG) {
+      : VPRecipeBase(VPInterleaveSC, {Addr}), IG(IG) {
     for (unsigned i = 0; i < IG->getFactor(); ++i)
       if (Instruction *I = IG->getMember(i)) {
         if (I->getType()->isVoidTy())
@@ -1117,9 +1318,11 @@ public:
   /// Generate the wide load or store, and shuffles.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 
   const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
 };
@@ -1127,21 +1330,18 @@ public:
 /// A recipe to represent inloop reduction operations, performing a reduction on
 /// a vector operand into a scalar value, and adding the result to a chain.
 /// The Operands are {ChainOp, VecOp, [Condition]}.
-class VPReductionRecipe : public VPRecipeBase, public VPUser, public VPValue {
+class VPReductionRecipe : public VPRecipeBase, public VPValue {
   /// The recurrence decriptor for the reduction in question.
   RecurrenceDescriptor *RdxDesc;
-  /// Fast math flags to use for the resulting reduction operation.
-  bool NoNaN;
   /// Pointer to the TTI, needed to create the target reduction
   const TargetTransformInfo *TTI;
 
 public:
   VPReductionRecipe(RecurrenceDescriptor *R, Instruction *I, VPValue *ChainOp,
-                    VPValue *VecOp, VPValue *CondOp, bool NoNaN,
+                    VPValue *VecOp, VPValue *CondOp,
                     const TargetTransformInfo *TTI)
-      : VPRecipeBase(VPRecipeBase::VPReductionSC), VPUser({ChainOp, VecOp}),
-        VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), NoNaN(NoNaN),
-        TTI(TTI) {
+      : VPRecipeBase(VPRecipeBase::VPReductionSC, {ChainOp, VecOp}),
+        VPValue(VPValue::VPVReductionSC, I, this), RdxDesc(R), TTI(TTI) {
     if (CondOp)
       addOperand(CondOp);
   }
@@ -1153,16 +1353,14 @@ public:
     return V->getVPValueID() == VPValue::VPVReductionSC;
   }
 
-  static inline bool classof(const VPDef *D) {
-    return D->getVPDefID() == VPRecipeBase::VPReductionSC;
-  }
-
   /// Generate the reduction in the loop
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 
   /// The VPValue of the scalar Chain being accumulated.
   VPValue *getChainOp() const { return getOperand(0); }
@@ -1178,7 +1376,7 @@ public:
 /// copies of the original scalar type, one per lane, instead of producing a
 /// single copy of widened type for all lanes. If the instruction is known to be
 /// uniform only one copy, per lane zero, will be generated.
-class VPReplicateRecipe : public VPRecipeBase, public VPUser, public VPValue {
+class VPReplicateRecipe : public VPRecipeBase, public VPValue {
   /// Indicator if only a single replica per lane is needed.
   bool IsUniform;
 
@@ -1192,9 +1390,8 @@ public:
   template <typename IterT>
   VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
                     bool IsUniform, bool IsPredicated = false)
-      : VPRecipeBase(VPReplicateSC), VPUser(Operands),
-        VPValue(VPVReplicateSC, I, this), IsUniform(IsUniform),
-        IsPredicated(IsPredicated) {
+      : VPRecipeBase(VPReplicateSC, Operands), VPValue(VPVReplicateSC, I, this),
+        IsUniform(IsUniform), IsPredicated(IsPredicated) {
     // Retain the previous behavior of predicateInstructions(), where an
     // insert-element of a predicated instruction got hoisted into the
     // predicated basic block iff it was its only user. This is achieved by
@@ -1221,17 +1418,24 @@ public:
 
   void setAlsoPack(bool Pack) { AlsoPack = Pack; }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 
   bool isUniform() const { return IsUniform; }
+
+  bool isPacked() const { return AlsoPack; }
+
+  bool isPredicated() const { return IsPredicated; }
 };
 
 /// A recipe for generating conditional branches on the bits of a mask.
-class VPBranchOnMaskRecipe : public VPRecipeBase, public VPUser {
+class VPBranchOnMaskRecipe : public VPRecipeBase {
 public:
-  VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) {
+  VPBranchOnMaskRecipe(VPValue *BlockInMask)
+      : VPRecipeBase(VPBranchOnMaskSC, {}) {
     if (BlockInMask) // nullptr means all-one mask.
       addOperand(BlockInMask);
   }
@@ -1245,16 +1449,17 @@ public:
   /// conditional branch.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override {
-    O << " +\n" << Indent << "\"BRANCH-ON-MASK ";
+    O << Indent << "BRANCH-ON-MASK ";
     if (VPValue *Mask = getMask())
       Mask->printAsOperand(O, SlotTracker);
     else
       O << " All-One";
-    O << "\\l\"";
   }
+#endif
 
   /// Return the mask used by this recipe. Note that a full mask is represented
   /// by a nullptr.
@@ -1270,15 +1475,13 @@ public:
 /// order to merge values that are set under such a branch and feed their uses.
 /// The phi nodes can be scalar or vector depending on the users of the value.
 /// This recipe works in concert with VPBranchOnMaskRecipe.
-class VPPredInstPHIRecipe : public VPRecipeBase, public VPUser {
-
+class VPPredInstPHIRecipe : public VPRecipeBase, public VPValue {
 public:
   /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi
   /// nodes after merging back from a Branch-on-Mask.
   VPPredInstPHIRecipe(VPValue *PredV)
-      : VPRecipeBase(VPPredInstPHISC), VPUser(PredV) {
-    new VPValue(PredV->getUnderlyingValue(), this);
-  }
+      : VPRecipeBase(VPPredInstPHISC, PredV),
+        VPValue(VPValue::VPVPredInstPHI, nullptr, this) {}
   ~VPPredInstPHIRecipe() override = default;
 
   /// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -1289,9 +1492,11 @@ public:
   /// Generates phi nodes for live-outs as needed to retain SSA form.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 };
 
 /// A Recipe for widening load/store operations.
@@ -1300,8 +1505,7 @@ public:
 /// - For store: Address, stored value, optional mask
 /// TODO: We currently execute only per-part unless a specific instance is
 /// provided.
-class VPWidenMemoryInstructionRecipe : public VPRecipeBase,
-                                       public VPUser {
+class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
   Instruction &Ingredient;
 
   void setMask(VPValue *Mask) {
@@ -1316,15 +1520,14 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase,
 
 public:
   VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask)
-      : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr}),
-        Ingredient(Load) {
+      : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load) {
     new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this);
     setMask(Mask);
   }
 
   VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
                                  VPValue *StoredValue, VPValue *Mask)
-      : VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr, StoredValue}),
+      : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}),
         Ingredient(Store) {
     setMask(Mask);
   }
@@ -1358,15 +1561,17 @@ public:
   /// Generate the wide load/store.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 };
 
 /// A Recipe for widening the canonical induction variable of the vector loop.
 class VPWidenCanonicalIVRecipe : public VPRecipeBase {
 public:
-  VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) {
+  VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC, {}) {
     new VPValue(nullptr, this);
   }
 
@@ -1382,14 +1587,16 @@ public:
   /// step = <VF*UF, VF*UF, ..., VF*UF>.
   void execute(VPTransformState &State) override;
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print the recipe.
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
+#endif
 };
 
 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
 /// holds a sequence of zero or more VPRecipe's each representing a sequence of
-/// output IR instructions.
+/// output IR instructions. All PHI-like recipes must come before any non-PHI recipes.
 class VPBasicBlock : public VPBlockBase {
 public:
   using RecipeListTy = iplist<VPRecipeBase>;
@@ -1405,7 +1612,10 @@ public:
       appendRecipe(Recipe);
   }
 
-  ~VPBasicBlock() override { Recipes.clear(); }
+  ~VPBasicBlock() override {
+    while (!Recipes.empty())
+      Recipes.pop_back();
+  }
 
   /// Instruction iterators...
   using iterator = RecipeListTy::iterator;
@@ -1464,8 +1674,29 @@ public:
   /// Return the position of the first non-phi node recipe in the block.
   iterator getFirstNonPhi();
 
+  /// Returns an iterator range over the PHI-like recipes in the block.
+  iterator_range<iterator> phis() {
+    return make_range(begin(), getFirstNonPhi());
+  }
+
   void dropAllReferences(VPValue *NewValue) override;
 
+  /// Split current block at \p SplitAt by inserting a new block between the
+  /// current block and its successors and moving all recipes starting at
+  /// SplitAt to the new block. Returns the new block.
+  VPBasicBlock *splitAt(iterator SplitAt);
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print this VPBsicBlock to \p O, prefixing all lines with \p Indent. \p
+  /// SlotTracker is used to print unnamed VPValue's using consequtive numbers.
+  ///
+  /// Note that the numbering is applied to the whole VPlan, so printing
+  /// individual blocks is consistent with the whole VPlan printing.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+  using VPBlockBase::print; // Get the print(raw_stream &O) version.
+#endif
+
 private:
   /// Create an IR BasicBlock to hold the output instructions generated by this
   /// VPBasicBlock, and return it. Update the CFGState accordingly.
@@ -1557,6 +1788,18 @@ public:
   void execute(struct VPTransformState *State) override;
 
   void dropAllReferences(VPValue *NewValue) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
+  /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using
+  /// consequtive numbers.
+  ///
+  /// Note that the numbering is applied to the whole VPlan, so printing
+  /// individual regions is consistent with the whole VPlan printing.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+  using VPBlockBase::print; // Get the print(raw_stream &O) version.
+#endif
 };
 
 //===----------------------------------------------------------------------===//
@@ -1681,6 +1924,136 @@ struct GraphTraits<Inverse<VPRegionBlock *>>
   }
 };
 
+/// Iterator to traverse all successors of a VPBlockBase node. This includes the
+/// entry node of VPRegionBlocks. Exit blocks of a region implicitly have their
+/// parent region's successors. This ensures all blocks in a region are visited
+/// before any blocks in a successor region when doing a reverse post-order
+// traversal of the graph.
+template <typename BlockPtrTy>
+class VPAllSuccessorsIterator
+    : public iterator_facade_base<VPAllSuccessorsIterator<BlockPtrTy>,
+                                  std::forward_iterator_tag, VPBlockBase> {
+  BlockPtrTy Block;
+  /// Index of the current successor. For VPBasicBlock nodes, this simply is the
+  /// index for the successor array. For VPRegionBlock, SuccessorIdx == 0 is
+  /// used for the region's entry block, and SuccessorIdx - 1 are the indices
+  /// for the successor array.
+  size_t SuccessorIdx;
+
+  static BlockPtrTy getBlockWithSuccs(BlockPtrTy Current) {
+    while (Current && Current->getNumSuccessors() == 0)
+      Current = Current->getParent();
+    return Current;
+  }
+
+  /// Templated helper to dereference successor \p SuccIdx of \p Block. Used by
+  /// both the const and non-const operator* implementations.
+  template <typename T1> static T1 deref(T1 Block, unsigned SuccIdx) {
+    if (auto *R = dyn_cast<VPRegionBlock>(Block)) {
+      if (SuccIdx == 0)
+        return R->getEntry();
+      SuccIdx--;
+    }
+
+    // For exit blocks, use the next parent region with successors.
+    return getBlockWithSuccs(Block)->getSuccessors()[SuccIdx];
+  }
+
+public:
+  VPAllSuccessorsIterator(BlockPtrTy Block, size_t Idx = 0)
+      : Block(Block), SuccessorIdx(Idx) {}
+  VPAllSuccessorsIterator(const VPAllSuccessorsIterator &Other)
+      : Block(Other.Block), SuccessorIdx(Other.SuccessorIdx) {}
+
+  VPAllSuccessorsIterator &operator=(const VPAllSuccessorsIterator &R) {
+    Block = R.Block;
+    SuccessorIdx = R.SuccessorIdx;
+    return *this;
+  }
+
+  static VPAllSuccessorsIterator end(BlockPtrTy Block) {
+    BlockPtrTy ParentWithSuccs = getBlockWithSuccs(Block);
+    unsigned NumSuccessors = ParentWithSuccs
+                                 ? ParentWithSuccs->getNumSuccessors()
+                                 : Block->getNumSuccessors();
+
+    if (auto *R = dyn_cast<VPRegionBlock>(Block))
+      return {R, NumSuccessors + 1};
+    return {Block, NumSuccessors};
+  }
+
+  bool operator==(const VPAllSuccessorsIterator &R) const {
+    return Block == R.Block && SuccessorIdx == R.SuccessorIdx;
+  }
+
+  const VPBlockBase *operator*() const { return deref(Block, SuccessorIdx); }
+
+  BlockPtrTy operator*() { return deref(Block, SuccessorIdx); }
+
+  VPAllSuccessorsIterator &operator++() {
+    SuccessorIdx++;
+    return *this;
+  }
+
+  VPAllSuccessorsIterator operator++(int X) {
+    VPAllSuccessorsIterator Orig = *this;
+    SuccessorIdx++;
+    return Orig;
+  }
+};
+
+/// Helper for GraphTraits specialization that traverses through VPRegionBlocks.
+template <typename BlockTy> class VPBlockRecursiveTraversalWrapper {
+  BlockTy Entry;
+
+public:
+  VPBlockRecursiveTraversalWrapper(BlockTy Entry) : Entry(Entry) {}
+  BlockTy getEntry() { return Entry; }
+};
+
+/// GraphTraits specialization to recursively traverse VPBlockBase nodes,
+/// including traversing through VPRegionBlocks.  Exit blocks of a region
+/// implicitly have their parent region's successors. This ensures all blocks in
+/// a region are visited before any blocks in a successor region when doing a
+/// reverse post-order traversal of the graph.
+template <>
+struct GraphTraits<VPBlockRecursiveTraversalWrapper<VPBlockBase *>> {
+  using NodeRef = VPBlockBase *;
+  using ChildIteratorType = VPAllSuccessorsIterator<VPBlockBase *>;
+
+  static NodeRef
+  getEntryNode(VPBlockRecursiveTraversalWrapper<VPBlockBase *> N) {
+    return N.getEntry();
+  }
+
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return ChildIteratorType(N);
+  }
+
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return ChildIteratorType::end(N);
+  }
+};
+
+template <>
+struct GraphTraits<VPBlockRecursiveTraversalWrapper<const VPBlockBase *>> {
+  using NodeRef = const VPBlockBase *;
+  using ChildIteratorType = VPAllSuccessorsIterator<const VPBlockBase *>;
+
+  static NodeRef
+  getEntryNode(VPBlockRecursiveTraversalWrapper<const VPBlockBase *> N) {
+    return N.getEntry();
+  }
+
+  static inline ChildIteratorType child_begin(NodeRef N) {
+    return ChildIteratorType(N);
+  }
+
+  static inline ChildIteratorType child_end(NodeRef N) {
+    return ChildIteratorType::end(N);
+  }
+};
+
 /// VPlan models a candidate for vectorization, encoding various decisions take
 /// to produce efficient output IR, including which branches, basic-blocks and
 /// output IR instructions to generate, and their cost. VPlan holds a
@@ -1704,7 +2077,7 @@ class VPlan {
   // VPlan. External definitions must be immutable and hold a pointer to its
   // underlying IR that will be used to implement its structural comparison
   // (operators '==' and '<').
-  SmallPtrSet<VPValue *, 16> VPExternalDefs;
+  SetVector<VPValue *> VPExternalDefs;
 
   /// Represents the backedge taken count of the original loop, for folding
   /// the tail.
@@ -1721,9 +2094,6 @@ class VPlan {
   /// Holds the VPLoopInfo analysis for this VPlan.
   VPLoopInfo VPLInfo;
 
-  /// Holds the condition bit values built during VPInstruction to VPRecipe transformation.
-  SmallVector<VPValue *, 4> VPCBVs;
-
 public:
   VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
     if (Entry)
@@ -1744,8 +2114,6 @@ public:
       delete BackedgeTakenCount;
     for (VPValue *Def : VPExternalDefs)
       delete Def;
-    for (VPValue *CBV : VPCBVs)
-      delete CBV;
   }
 
   /// Generate the IR code for this VPlan.
@@ -1777,14 +2145,7 @@ public:
 
   /// Add \p VPVal to the pool of external definitions if it's not already
   /// in the pool.
-  void addExternalDef(VPValue *VPVal) {
-    VPExternalDefs.insert(VPVal);
-  }
-
-  /// Add \p CBV to the vector of condition bit values.
-  void addCBV(VPValue *CBV) {
-    VPCBVs.push_back(CBV);
-  }
+  void addExternalDef(VPValue *VPVal) { VPExternalDefs.insert(VPVal); }
 
   void addVPValue(Value *V) {
     assert(V && "Trying to add a null Value to VPlan");
@@ -1819,8 +2180,16 @@ public:
   VPLoopInfo &getVPLoopInfo() { return VPLInfo; }
   const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print this VPlan to \p O.
+  void print(raw_ostream &O) const;
+
+  /// Print this VPlan in DOT format to \p O.
+  void printDOT(raw_ostream &O) const;
+
   /// Dump the plan to stderr (for debugging).
-  void dump() const;
+  LLVM_DUMP_METHOD void dump() const;
+#endif
 
   /// Returns a range mapping the values the range \p Operands to their
   /// corresponding VPValues.
@@ -1840,14 +2209,10 @@ private:
                                   BasicBlock *LoopExitBB);
 };
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 /// VPlanPrinter prints a given VPlan to a given output stream. The printing is
 /// indented and follows the dot format.
 class VPlanPrinter {
-  friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan);
-  friend inline raw_ostream &operator<<(raw_ostream &OS,
-                                        const struct VPlanIngredient &I);
-
-private:
   raw_ostream &OS;
   const VPlan &Plan;
   unsigned Depth = 0;
@@ -1858,9 +2223,6 @@ private:
 
   VPSlotTracker SlotTracker;
 
-  VPlanPrinter(raw_ostream &O, const VPlan &P)
-      : OS(O), Plan(P), SlotTracker(&P) {}
-
   /// Handle indentation.
   void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
 
@@ -1890,27 +2252,31 @@ private:
   void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
                 const Twine &Label);
 
-  void dump();
+public:
+  VPlanPrinter(raw_ostream &O, const VPlan &P)
+      : OS(O), Plan(P), SlotTracker(&P) {}
 
-  static void printAsIngredient(raw_ostream &O, const Value *V);
+  LLVM_DUMP_METHOD void dump();
 };
 
 struct VPlanIngredient {
   const Value *V;
 
   VPlanIngredient(const Value *V) : V(V) {}
+
+  void print(raw_ostream &O) const;
 };
 
 inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
-  VPlanPrinter::printAsIngredient(OS, I.V);
+  I.print(OS);
   return OS;
 }
 
 inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
-  VPlanPrinter Printer(OS, Plan);
-  Printer.dump();
+  Plan.print(OS);
   return OS;
 }
+#endif
 
 //===----------------------------------------------------------------------===//
 // VPlan Utilities
@@ -2011,6 +2377,26 @@ public:
     }
     return Count;
   }
+
+  /// Return an iterator range over \p Range which only includes \p BlockTy
+  /// blocks. The accesses are casted to \p BlockTy.
+  template <typename BlockTy, typename T>
+  static auto blocksOnly(const T &Range) {
+    // Create BaseTy with correct const-ness based on BlockTy.
+    using BaseTy =
+        typename std::conditional<std::is_const<BlockTy>::value,
+                                  const VPBlockBase, VPBlockBase>::type;
+
+    // We need to first create an iterator range over (const) BlocktTy & instead
+    // of (const) BlockTy * for filter_range to work properly.
+    auto Mapped =
+        map_range(Range, [](BaseTy *Block) -> BaseTy & { return *Block; });
+    auto Filter = make_filter_range(
+        Mapped, [](BaseTy &Block) { return isa<BlockTy>(&Block); });
+    return map_range(Filter, [](BaseTy &Block) -> BlockTy * {
+      return cast<BlockTy>(&Block);
+    });
+  }
 };
 
 class VPInterleavedAccessInfo {
@@ -2126,8 +2512,10 @@ class VPlanSlp {
                                        SmallPtrSetImpl<VPValue *> &Candidates,
                                        VPInterleavedAccessInfo &IAI);
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// Print bundle \p Values to dbgs().
   void dumpBundle(ArrayRef<VPValue *> Values);
+#endif
 
 public:
   VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}