Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Analysis/AliasAnalysis.cpp27
-rw-r--r--llvm/lib/Analysis/AliasAnalysisEvaluator.cpp6
-rw-r--r--llvm/lib/Analysis/BasicAliasAnalysis.cpp14
-rw-r--r--llvm/lib/Analysis/BranchProbabilityInfo.cpp1
-rw-r--r--llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp2
-rw-r--r--llvm/lib/Analysis/CallGraphSCCPass.cpp2
-rw-r--r--llvm/lib/Analysis/CaptureTracking.cpp12
-rw-r--r--llvm/lib/Analysis/ConstantFolding.cpp164
-rw-r--r--llvm/lib/Analysis/ConstraintSystem.cpp2
-rw-r--r--llvm/lib/Analysis/CostModel.cpp6
-rw-r--r--llvm/lib/Analysis/DDG.cpp2
-rw-r--r--llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp41
-rw-r--r--llvm/lib/Analysis/DivergenceAnalysis.cpp4
-rw-r--r--llvm/lib/Analysis/DomPrinter.cpp13
-rw-r--r--llvm/lib/Analysis/DominanceFrontier.cpp2
-rw-r--r--llvm/lib/Analysis/GlobalsModRef.cpp12
-rw-r--r--llvm/lib/Analysis/IRSimilarityIdentifier.cpp92
-rw-r--r--llvm/lib/Analysis/IVDescriptors.cpp76
-rw-r--r--llvm/lib/Analysis/IVUsers.cpp2
-rw-r--r--llvm/lib/Analysis/InlineAdvisor.cpp29
-rw-r--r--llvm/lib/Analysis/InlineCost.cpp16
-rw-r--r--llvm/lib/Analysis/InstructionSimplify.cpp181
-rw-r--r--llvm/lib/Analysis/LazyCallGraph.cpp19
-rw-r--r--llvm/lib/Analysis/LazyValueInfo.cpp65
-rw-r--r--llvm/lib/Analysis/LoopAccessAnalysis.cpp11
-rw-r--r--llvm/lib/Analysis/LoopCacheAnalysis.cpp5
-rw-r--r--llvm/lib/Analysis/LoopInfo.cpp5
-rw-r--r--llvm/lib/Analysis/LoopPass.cpp3
-rw-r--r--llvm/lib/Analysis/MLInlineAdvisor.cpp103
-rw-r--r--llvm/lib/Analysis/MemoryBuiltins.cpp450
-rw-r--r--llvm/lib/Analysis/MemoryDependenceAnalysis.cpp2
-rw-r--r--llvm/lib/Analysis/MemorySSA.cpp4
-rw-r--r--llvm/lib/Analysis/ModelUnderTrainingRunner.cpp22
-rw-r--r--llvm/lib/Analysis/NoInferenceModelRunner.cpp2
-rw-r--r--llvm/lib/Analysis/ObjCARCInstKind.cpp28
-rw-r--r--llvm/lib/Analysis/PHITransAddr.cpp3
-rw-r--r--llvm/lib/Analysis/RegionPass.cpp3
-rw-r--r--llvm/lib/Analysis/ReplayInlineAdvisor.cpp3
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp477
-rw-r--r--llvm/lib/Analysis/TFUtils.cpp48
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp10
-rw-r--r--llvm/lib/Analysis/ValueTracking.cpp161
-rw-r--r--llvm/lib/AsmParser/LLParser.cpp99
-rw-r--r--llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp8
-rw-r--r--llvm/lib/BinaryFormat/ELF.cpp3
-rw-r--r--llvm/lib/BinaryFormat/Magic.cpp7
-rw-r--r--llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp2
-rw-r--r--llvm/lib/Bitcode/Reader/BitcodeReader.cpp166
-rw-r--r--llvm/lib/Bitcode/Reader/MetadataLoader.cpp16
-rw-r--r--llvm/lib/Bitcode/Writer/BitcodeWriter.cpp4
-rw-r--r--llvm/lib/Bitcode/Writer/ValueEnumerator.cpp4
-rw-r--r--llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp3
-rw-r--r--llvm/lib/CodeGen/Analysis.cpp6
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AIXException.cpp2
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp20
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h28
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp53
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DIE.cpp128
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp16
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DIEHash.h2
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp6
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp48
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h3
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp20
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h3
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfException.h2
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp22
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp3
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp42
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h2
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h2
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp2
-rw-r--r--llvm/lib/CodeGen/BranchFolding.h1
-rw-r--r--llvm/lib/CodeGen/CFIInstrInserter.cpp2
-rw-r--r--llvm/lib/CodeGen/CalcSpillWeights.cpp13
-rw-r--r--llvm/lib/CodeGen/CodeGenPrepare.cpp16
-rw-r--r--llvm/lib/CodeGen/CommandFlags.cpp20
-rw-r--r--llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp3
-rw-r--r--llvm/lib/CodeGen/ExpandPostRAPseudos.cpp13
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp4
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CallLowering.cpp38
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Combiner.cpp3
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp261
-rw-r--r--llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp17
-rw-r--r--llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp24
-rw-r--r--llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp8
-rw-r--r--llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp2
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp5
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Localizer.cpp10
-rw-r--r--llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp12
-rw-r--r--llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp5
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Utils.cpp25
-rw-r--r--llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp10
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp59
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h10
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp9
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h4
-rw-r--r--llvm/lib/CodeGen/LiveDebugVariables.cpp2
-rw-r--r--llvm/lib/CodeGen/LiveIntervals.cpp2
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIParser.cpp6
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIRParser.cpp3
-rw-r--r--llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp862
-rw-r--r--llvm/lib/CodeGen/MachineBlockPlacement.cpp42
-rw-r--r--llvm/lib/CodeGen/MachineDominanceFrontier.cpp4
-rw-r--r--llvm/lib/CodeGen/MachineFunction.cpp4
-rw-r--r--llvm/lib/CodeGen/MachineInstrBundle.cpp4
-rw-r--r--llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp3
-rw-r--r--llvm/lib/CodeGen/MachineSink.cpp11
-rw-r--r--llvm/lib/CodeGen/ModuloSchedule.cpp4
-rw-r--r--llvm/lib/CodeGen/NonRelocatableStringpool.cpp1
-rw-r--r--llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp2
-rw-r--r--llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp33
-rw-r--r--llvm/lib/CodeGen/RegAllocEvictionAdvisor.h119
-rw-r--r--llvm/lib/CodeGen/RegAllocGreedy.cpp443
-rw-r--r--llvm/lib/CodeGen/RegAllocGreedy.h507
-rw-r--r--llvm/lib/CodeGen/RegisterScavenging.cpp22
-rw-r--r--llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp3
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp354
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/FastISel.cpp5
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp13
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp221
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h61
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp17
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp89
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp826
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp9
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp197
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp75
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp131
-rw-r--r--llvm/lib/CodeGen/ShrinkWrap.cpp12
-rw-r--r--llvm/lib/CodeGen/TargetLoweringBase.cpp6
-rw-r--r--llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp21
-rw-r--r--llvm/lib/CodeGen/TargetPassConfig.cpp7
-rw-r--r--llvm/lib/CodeGen/TargetRegisterInfo.cpp4
-rw-r--r--llvm/lib/CodeGen/TypePromotion.cpp10
-rw-r--r--llvm/lib/CodeGen/VLIWMachineScheduler.cpp2
-rw-r--r--llvm/lib/DWARFLinker/DWARFLinker.cpp4
-rw-r--r--llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp2
-rw-r--r--llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp4
-rw-r--r--llvm/lib/DebugInfo/CodeView/EnumTables.cpp2
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp2
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFContext.cpp29
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp2
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDie.cpp48
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp21
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp2
-rw-r--r--llvm/lib/DebugInfo/PDB/PDBExtras.cpp1
-rw-r--r--llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp1
-rw-r--r--llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp6
-rw-r--r--llvm/lib/Debuginfod/Debuginfod.cpp5
-rw-r--r--llvm/lib/Demangle/DLangDemangle.cpp238
-rw-r--r--llvm/lib/Demangle/ItaniumDemangle.cpp2
-rw-r--r--llvm/lib/Demangle/MicrosoftDemangleNodes.cpp1
-rw-r--r--llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/DefineExternalSectionStartAndEndSymbols.h4
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp57
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h11
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h75
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp24
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp230
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp20
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLink.cpp48
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp9
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h5
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp93
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp84
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h14
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp53
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp22
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/PerGraphGOTAndPLTStubsBuilder.h8
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/riscv.cpp28
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/x86_64.cpp10
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Core.cpp35
-rw-r--r--llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp6
-rw-r--r--llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp28
-rw-r--r--llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp25
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp15
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp16
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp18
-rw-r--r--llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp23
-rw-r--r--llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp8
-rw-r--r--llvm/lib/ExecutionEngine/Orc/LLJIT.cpp92
-rw-r--r--llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp55
-rw-r--r--llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp21
-rw-r--r--llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp12
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp44
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp8
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp20
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp9
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h2
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp32
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp14
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp4
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/SectionMemoryManager.cpp6
-rw-r--r--llvm/lib/FileCheck/FileCheck.cpp13
-rw-r--r--llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp306
-rw-r--r--llvm/lib/FuzzMutate/Operations.cpp2
-rw-r--r--llvm/lib/FuzzMutate/RandomIRBuilder.cpp10
-rw-r--r--llvm/lib/IR/AsmWriter.cpp4
-rw-r--r--llvm/lib/IR/Attributes.cpp346
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp55
-rw-r--r--llvm/lib/IR/Comdat.cpp4
-rw-r--r--llvm/lib/IR/ConstantFold.cpp335
-rw-r--r--llvm/lib/IR/ConstantFold.h3
-rw-r--r--llvm/lib/IR/Constants.cpp61
-rw-r--r--llvm/lib/IR/Core.cpp44
-rw-r--r--llvm/lib/IR/DIBuilder.cpp63
-rw-r--r--llvm/lib/IR/DebugInfo.cpp8
-rw-r--r--llvm/lib/IR/DebugInfoMetadata.cpp9
-rw-r--r--llvm/lib/IR/Function.cpp24
-rw-r--r--llvm/lib/IR/Globals.cpp19
-rw-r--r--llvm/lib/IR/IRBuilder.cpp19
-rw-r--r--llvm/lib/IR/Instruction.cpp5
-rw-r--r--llvm/lib/IR/IntrinsicInst.cpp13
-rw-r--r--llvm/lib/IR/LLVMContextImpl.h10
-rw-r--r--llvm/lib/IR/LegacyPassManager.cpp14
-rw-r--r--llvm/lib/IR/TypeFinder.cpp33
-rw-r--r--llvm/lib/IR/Verifier.cpp109
-rw-r--r--llvm/lib/InterfaceStub/ELFObjHandler.cpp1
-rw-r--r--llvm/lib/InterfaceStub/IFSHandler.cpp2
-rw-r--r--llvm/lib/InterfaceStub/IFSStub.cpp2
-rw-r--r--llvm/lib/LTO/LTOBackend.cpp5
-rw-r--r--llvm/lib/MC/MCAsmStreamer.cpp2
-rw-r--r--llvm/lib/MC/MCContext.cpp10
-rw-r--r--llvm/lib/MC/MCDwarf.cpp4
-rw-r--r--llvm/lib/MC/MCMachOStreamer.cpp42
-rw-r--r--llvm/lib/MC/MCObjectStreamer.cpp32
-rw-r--r--llvm/lib/MC/MCParser/AsmParser.cpp29
-rw-r--r--llvm/lib/MC/MCParser/DarwinAsmParser.cpp10
-rw-r--r--llvm/lib/MC/MCParser/ELFAsmParser.cpp32
-rw-r--r--llvm/lib/MC/MCParser/MasmParser.cpp30
-rw-r--r--llvm/lib/MC/MCSectionXCOFF.cpp6
-rw-r--r--llvm/lib/MC/MCStreamer.cpp4
-rw-r--r--llvm/lib/MC/MachObjectWriter.cpp17
-rw-r--r--llvm/lib/MCA/HardwareUnits/LSUnit.cpp13
-rw-r--r--llvm/lib/MCA/Stages/DispatchStage.cpp2
-rw-r--r--llvm/lib/MCA/Stages/InOrderIssueStage.cpp2
-rw-r--r--llvm/lib/Object/Archive.cpp467
-rw-r--r--llvm/lib/Object/ArchiveWriter.cpp2
-rw-r--r--llvm/lib/Object/IRSymtab.cpp37
-rw-r--r--llvm/lib/Object/Object.cpp1
-rw-r--r--llvm/lib/Object/TapiFile.cpp3
-rw-r--r--llvm/lib/Object/XCOFFObjectFile.cpp6
-rw-r--r--llvm/lib/ObjectYAML/ELFYAML.cpp1
-rw-r--r--llvm/lib/ObjectYAML/MachOEmitter.cpp6
-rw-r--r--llvm/lib/ObjectYAML/XCOFFEmitter.cpp187
-rw-r--r--llvm/lib/ObjectYAML/XCOFFYAML.cpp157
-rw-r--r--llvm/lib/Passes/PassBuilder.cpp4
-rw-r--r--llvm/lib/Passes/PassBuilderPipelines.cpp14
-rw-r--r--llvm/lib/Passes/PassRegistry.def7
-rw-r--r--llvm/lib/Passes/StandardInstrumentations.cpp2
-rw-r--r--llvm/lib/ProfileData/GCOV.cpp2
-rw-r--r--llvm/lib/ProfileData/InstrProf.cpp3
-rw-r--r--llvm/lib/ProfileData/InstrProfCorrelator.cpp52
-rw-r--r--llvm/lib/ProfileData/InstrProfReader.cpp102
-rw-r--r--llvm/lib/ProfileData/SampleProfReader.cpp9
-rw-r--r--llvm/lib/ProfileData/SampleProfWriter.cpp4
-rw-r--r--llvm/lib/Remarks/BitstreamRemarkSerializer.cpp2
-rw-r--r--llvm/lib/Remarks/Remark.cpp8
-rw-r--r--llvm/lib/Remarks/RemarkStreamer.cpp2
-rw-r--r--llvm/lib/Remarks/RemarkStringTable.cpp2
-rw-r--r--llvm/lib/Remarks/YAMLRemarkParser.cpp2
-rw-r--r--llvm/lib/Support/AArch64TargetParser.cpp10
-rw-r--r--llvm/lib/Support/APInt.cpp2
-rw-r--r--llvm/lib/Support/ARMAttributeParser.cpp4
-rw-r--r--llvm/lib/Support/ARMTargetParser.cpp6
-rw-r--r--llvm/lib/Support/ARMWinEH.cpp1
-rw-r--r--llvm/lib/Support/BinaryStreamError.cpp2
-rw-r--r--llvm/lib/Support/BlockFrequency.cpp1
-rw-r--r--llvm/lib/Support/Caching.cpp8
-rw-r--r--llvm/lib/Support/CodeGenCoverage.cpp2
-rw-r--r--llvm/lib/Support/CommandLine.cpp85
-rw-r--r--llvm/lib/Support/CrashRecoveryContext.cpp5
-rw-r--r--llvm/lib/Support/DAGDeltaAlgorithm.cpp1
-rw-r--r--llvm/lib/Support/DataExtractor.cpp1
-rw-r--r--llvm/lib/Support/DivisionByConstantInfo.cpp6
-rw-r--r--llvm/lib/Support/ELFAttributeParser.cpp2
-rw-r--r--llvm/lib/Support/FileOutputBuffer.cpp2
-rw-r--r--llvm/lib/Support/FileUtilities.cpp4
-rw-r--r--llvm/lib/Support/GraphWriter.cpp7
-rw-r--r--llvm/lib/Support/Host.cpp9
-rw-r--r--llvm/lib/Support/InitLLVM.cpp7
-rw-r--r--llvm/lib/Support/JSON.cpp1
-rw-r--r--llvm/lib/Support/LowLevelType.cpp2
-rw-r--r--llvm/lib/Support/MD5.cpp11
-rw-r--r--llvm/lib/Support/MSP430AttributeParser.cpp3
-rw-r--r--llvm/lib/Support/MemAlloc.cpp1
-rw-r--r--llvm/lib/Support/MemoryBuffer.cpp27
-rw-r--r--llvm/lib/Support/NativeFormatting.cpp1
-rw-r--r--llvm/lib/Support/Parallel.cpp32
-rw-r--r--llvm/lib/Support/Path.cpp23
-rw-r--r--llvm/lib/Support/PrettyStackTrace.cpp5
-rw-r--r--llvm/lib/Support/RISCVISAInfo.cpp208
-rw-r--r--llvm/lib/Support/ScopedPrinter.cpp1
-rw-r--r--llvm/lib/Support/Signals.cpp1
-rw-r--r--llvm/lib/Support/Signposts.cpp1
-rw-r--r--llvm/lib/Support/SmallPtrSet.cpp1
-rw-r--r--llvm/lib/Support/SmallVector.cpp1
-rw-r--r--llvm/lib/Support/SpecialCaseList.cpp1
-rw-r--r--llvm/lib/Support/StringMap.cpp1
-rw-r--r--llvm/lib/Support/StringRef.cpp8
-rw-r--r--llvm/lib/Support/SymbolRemappingReader.cpp1
-rw-r--r--llvm/lib/Support/TargetParser.cpp4
-rw-r--r--llvm/lib/Support/ThreadPool.cpp8
-rw-r--r--llvm/lib/Support/TimeProfiler.cpp4
-rw-r--r--llvm/lib/Support/ToolOutputFile.cpp1
-rw-r--r--llvm/lib/Support/Triple.cpp6
-rw-r--r--llvm/lib/Support/TypeSize.cpp1
-rw-r--r--llvm/lib/Support/Unix/Path.inc25
-rw-r--r--llvm/lib/Support/VirtualFileSystem.cpp112
-rw-r--r--llvm/lib/Support/Windows/Path.inc38
-rw-r--r--llvm/lib/Support/Windows/Process.inc8
-rw-r--r--llvm/lib/Support/Windows/Program.inc8
-rw-r--r--llvm/lib/Support/X86TargetParser.cpp1
-rw-r--r--llvm/lib/Support/YAMLParser.cpp1
-rw-r--r--llvm/lib/Support/YAMLTraits.cpp14
-rw-r--r--llvm/lib/Support/raw_ostream.cpp23
-rw-r--r--llvm/lib/TableGen/TGParser.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64.h1
-rw-r--r--llvm/lib/Target/AArch64/AArch64.td27
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64FastISel.cpp214
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp19
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.h5
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp190
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h14
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrAtomics.td60
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrFormats.td256
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp9
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.h1
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td108
-rw-r--r--llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64MCInstLower.h3
-rw-r--r--llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp325
-rw-r--r--llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td46
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA55.td12
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA57.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA64FX.td10
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM3.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM4.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM5.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp15
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.h19
-rw-r--r--llvm/lib/Target/AArch64/AArch64SystemOperands.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.h2
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetObjectFile.h1
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp147
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h10
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp183
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp55
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp3
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h2
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp17
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h1
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h1
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h4
-rw-r--r--llvm/lib/Target/AArch64/SVEInstrFormats.td30
-rw-r--r--llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp4
-rw-r--r--llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h15
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp62
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp29
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp35
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp205
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp171
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructions.td24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp204
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp61
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibFunc.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPTNote.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp30
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h14
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp71
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td1
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td2
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h1
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.h2
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h7
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/MIMGInstructions.td106
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelLowering.h1
-rw-r--r--llvm/lib/Target/AMDGPU/R600InstrInfo.h1
-rw-r--r--llvm/lib/Target/AMDGPU/R600Subtarget.h6
-rw-r--r--llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h2
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/SIFrameLowering.cpp13
-rw-r--r--llvm/lib/Target/AMDGPU/SIFrameLowering.h5
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp418
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h5
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp149
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h11
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td8
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td130
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp137
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp55
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp53
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/SIModeRegister.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp46
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td8
-rw-r--r--llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td15
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp23
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h17
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td36
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td41
-rw-r--r--llvm/lib/Target/ARM/ARM.h3
-rw-r--r--llvm/lib/Target/ARM/ARM.td51
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp30
-rw-r--r--llvm/lib/Target/ARM/ARMCallLowering.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMCallLowering.h1
-rw-r--r--llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp14
-rw-r--r--llvm/lib/Target/ARM/ARMFastISel.cpp100
-rw-r--r--llvm/lib/Target/ARM/ARMFrameLowering.cpp18
-rw-r--r--llvm/lib/Target/ARM/ARMHazardRecognizer.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMHazardRecognizer.h2
-rw-r--r--llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp9
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp56
-rw-r--r--llvm/lib/Target/ARM/ARMInstrInfo.cpp3
-rw-r--r--llvm/lib/Target/ARM/ARMInstrNEON.td96
-rw-r--r--llvm/lib/Target/ARM/ARMInstructionSelector.cpp4
-rw-r--r--llvm/lib/Target/ARM/ARMRegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMRegisterInfo.h2
-rw-r--r--llvm/lib/Target/ARM/ARMSubtarget.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMSubtarget.h6
-rw-r--r--llvm/lib/Target/ARM/ARMTargetMachine.cpp1
-rw-r--r--llvm/lib/Target/ARM/ARMTargetObjectFile.cpp4
-rw-r--r--llvm/lib/Target/ARM/ARMTargetObjectFile.h5
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp12
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.h12
-rw-r--r--llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp4
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp5
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp7
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp4
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h2
-rw-r--r--llvm/lib/Target/ARM/Thumb1FrameLowering.cpp18
-rw-r--r--llvm/lib/Target/ARM/Thumb1InstrInfo.cpp2
-rw-r--r--llvm/lib/Target/ARM/Thumb2InstrInfo.h1
-rw-r--r--llvm/lib/Target/ARM/Thumb2SizeReduction.cpp5
-rw-r--r--llvm/lib/Target/ARM/ThumbRegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/AVR/AVR.h48
-rw-r--r--llvm/lib/Target/AVR/AVRCallingConv.td2
-rw-r--r--llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp181
-rw-r--r--llvm/lib/Target/AVR/AVRFrameLowering.cpp96
-rw-r--r--llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp88
-rw-r--r--llvm/lib/Target/AVR/AVRISelLowering.cpp43
-rw-r--r--llvm/lib/Target/AVR/AVRISelLowering.h5
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.td55
-rw-r--r--llvm/lib/Target/AVR/AVRRegisterInfo.cpp5
-rw-r--r--llvm/lib/Target/AVR/AVRRegisterInfo.h4
-rw-r--r--llvm/lib/Target/AVR/AVRRegisterInfo.td20
-rw-r--r--llvm/lib/Target/AVR/AVRSubtarget.cpp3
-rw-r--r--llvm/lib/Target/AVR/AVRSubtarget.h3
-rw-r--r--llvm/lib/Target/AVR/AVRTargetMachine.cpp6
-rw-r--r--llvm/lib/Target/AVR/AVRTargetObjectFile.cpp53
-rw-r--r--llvm/lib/Target/AVR/AVRTargetObjectFile.h5
-rw-r--r--llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp10
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp4
-rw-r--r--llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp2
-rw-r--r--llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp2
-rw-r--r--llvm/lib/Target/BPF/BPFISelLowering.cpp2
-rw-r--r--llvm/lib/Target/BPF/BPFMIChecking.cpp4
-rw-r--r--llvm/lib/Target/BPF/BPFMIPeephole.cpp16
-rw-r--r--llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp2
-rw-r--r--llvm/lib/Target/BPF/BPFPreserveDIType.cpp6
-rw-r--r--llvm/lib/Target/BPF/BPFSubtarget.cpp2
-rw-r--r--llvm/lib/Target/BPF/BTFDebug.cpp2
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp2
-rw-r--r--llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp8
-rw-r--r--llvm/lib/Target/CSKY/CSKY.h3
-rw-r--r--llvm/lib/Target/CSKY/CSKY.td34
-rw-r--r--llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp152
-rw-r--r--llvm/lib/Target/CSKY/CSKYAsmPrinter.h16
-rw-r--r--llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp1376
-rw-r--r--llvm/lib/Target/CSKY/CSKYConstantPoolValue.cpp216
-rw-r--r--llvm/lib/Target/CSKY/CSKYConstantPoolValue.h221
-rw-r--r--llvm/lib/Target/CSKY/CSKYFrameLowering.cpp548
-rw-r--r--llvm/lib/Target/CSKY/CSKYFrameLowering.h36
-rw-r--r--llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp18
-rw-r--r--llvm/lib/Target/CSKY/CSKYISelLowering.cpp784
-rw-r--r--llvm/lib/Target/CSKY/CSKYISelLowering.h100
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td274
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td208
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfo.cpp309
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfo.h27
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfo.td216
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfoF1.td420
-rw-r--r--llvm/lib/Target/CSKY/CSKYInstrInfoF2.td462
-rw-r--r--llvm/lib/Target/CSKY/CSKYRegisterInfo.td2
-rw-r--r--llvm/lib/Target/CSKY/CSKYTargetMachine.cpp8
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp15
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h4
-rw-r--r--llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/BitTracker.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp7
-rw-r--r--llvm/lib/Target/Hexagon/HexagonBitTracker.cpp5
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp19
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCallingConv.td12
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp127
-rw-r--r--llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp20
-rw-r--r--llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp11
-rw-r--r--llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp12
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenInsert.cpp93
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp20
-rw-r--r--llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp27
-rw-r--r--llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp5
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp19
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLowering.cpp45
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLowering.h4
-rwxr-xr-x[-rw-r--r--]llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp363
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp122
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrInfo.h7
-rw-r--r--llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp102
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPatterns.td25
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPatternsHVX.td255
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPatternsV65.td45
-rw-r--r--llvm/lib/Target/Hexagon/HexagonRegisterInfo.td12
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp11
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSubtarget.cpp78
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSubtarget.h4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp67
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h5
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp11
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp207
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h10
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp10
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp23
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp7
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp48
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h29
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp25
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h23
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp26
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp96
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h24
-rw-r--r--llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp2
-rw-r--r--llvm/lib/Target/Lanai/LanaiISelLowering.cpp8
-rw-r--r--llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp5
-rw-r--r--llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/Lanai/LanaiSubtarget.cpp2
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp2
-rw-r--r--llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp2
-rw-r--r--llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp2
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp2
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kCallLowering.h2
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kInstructionSelector.cpp2
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp2
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h2
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp2
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h4
-rw-r--r--llvm/lib/Target/M68k/GISel/M68kRegisterBanks.td2
-rw-r--r--llvm/lib/Target/M68k/M68k.h4
-rw-r--r--llvm/lib/Target/M68k/M68k.td2
-rw-r--r--llvm/lib/Target/M68k/M68kAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/M68k/M68kAsmPrinter.h4
-rw-r--r--llvm/lib/Target/M68k/M68kCallingConv.h4
-rw-r--r--llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp2
-rw-r--r--llvm/lib/Target/M68k/M68kExpandPseudo.cpp2
-rw-r--r--llvm/lib/Target/M68k/M68kFrameLowering.cpp18
-rw-r--r--llvm/lib/Target/M68k/M68kFrameLowering.h4
-rw-r--r--llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp2
-rw-r--r--llvm/lib/Target/M68k/M68kISelLowering.cpp78
-rw-r--r--llvm/lib/Target/M68k/M68kISelLowering.h10
-rw-r--r--llvm/lib/Target/M68k/M68kInstrArithmetic.td49
-rw-r--r--llvm/lib/Target/M68k/M68kInstrBits.td12
-rw-r--r--llvm/lib/Target/M68k/M68kInstrBuilder.h4
-rw-r--r--llvm/lib/Target/M68k/M68kInstrCompiler.td2
-rw-r--r--llvm/lib/Target/M68k/M68kInstrControl.td2
-rw-r--r--llvm/lib/Target/M68k/M68kInstrData.td2
-rw-r--r--llvm/lib/Target/M68k/M68kInstrFormats.td2
-rw-r--r--llvm/lib/Target/M68k/M68kInstrInfo.cpp14
-rw-r--r--llvm/lib/Target/M68k/M68kInstrInfo.h4
-rw-r--r--llvm/lib/Target/M68k/M68kInstrInfo.td8
-rw-r--r--llvm/lib/Target/M68k/M68kInstrShiftRotate.td2
-rw-r--r--llvm/lib/Target/M68k/M68kMCInstLower.cpp2
-rw-r--r--llvm/lib/Target/M68k/M68kMCInstLower.h4
-rw-r--r--llvm/lib/Target/M68k/M68kMachineFunction.cpp2
-rw-r--r--llvm/lib/Target/M68k/M68kMachineFunction.h4
-rw-r--r--llvm/lib/Target/M68k/M68kRegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/M68k/M68kRegisterInfo.h4
-rw-r--r--llvm/lib/Target/M68k/M68kRegisterInfo.td2
-rw-r--r--llvm/lib/Target/M68k/M68kSchedule.td2
-rw-r--r--llvm/lib/Target/M68k/M68kSubtarget.cpp2
-rw-r--r--llvm/lib/Target/M68k/M68kSubtarget.h8
-rw-r--r--llvm/lib/Target/M68k/M68kTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/M68k/M68kTargetMachine.h4
-rw-r--r--llvm/lib/Target/M68k/M68kTargetObjectFile.cpp2
-rw-r--r--llvm/lib/Target/M68k/M68kTargetObjectFile.h4
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp2
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h4
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kFixupKinds.h4
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp2
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.h4
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp2
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h4
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp2
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h4
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h4
-rw-r--r--llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp11
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp1
-rw-r--r--llvm/lib/Target/MSP430/MSP430FrameLowering.cpp2
-rw-r--r--llvm/lib/Target/MSP430/MSP430ISelLowering.cpp8
-rw-r--r--llvm/lib/Target/MSP430/MSP430Subtarget.cpp2
-rw-r--r--llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp3
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp2
-rw-r--r--llvm/lib/Target/Mips/Mips.h2
-rw-r--r--llvm/lib/Target/Mips/Mips16FrameLowering.cpp4
-rw-r--r--llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp2
-rw-r--r--llvm/lib/Target/Mips/Mips16ISelLowering.cpp2
-rw-r--r--llvm/lib/Target/Mips/Mips16InstrInfo.cpp2
-rw-r--r--llvm/lib/Target/Mips/Mips16RegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp4
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.cpp20
-rw-r--r--llvm/lib/Target/Mips/MipsCallLowering.h1
-rw-r--r--llvm/lib/Target/Mips/MipsConstantIslandPass.cpp61
-rw-r--r--llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp8
-rw-r--r--llvm/lib/Target/Mips/MipsFastISel.cpp124
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.cpp12
-rw-r--r--llvm/lib/Target/Mips/MipsInstructionSelector.cpp4
-rw-r--r--llvm/lib/Target/Mips/MipsMachineFunction.cpp12
-rw-r--r--llvm/lib/Target/Mips/MipsMulMulBugPass.cpp136
-rw-r--r--llvm/lib/Target/Mips/MipsRegisterInfo.cpp8
-rw-r--r--llvm/lib/Target/Mips/MipsSEFrameLowering.cpp4
-rw-r--r--llvm/lib/Target/Mips/MipsSEISelLowering.cpp23
-rw-r--r--llvm/lib/Target/Mips/MipsSERegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/Mips/MipsTargetMachine.cpp10
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/NVPTX/NVPTX.h4
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp10
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp20
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp32
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp13
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.td56
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td32
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp9
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp4
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXUtilities.cpp3
-rw-r--r--llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp15
-rw-r--r--llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp3
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp5
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp10
-rw-r--r--llvm/lib/Target/PowerPC/P10InstrResources.td10
-rw-r--r--llvm/lib/Target/PowerPC/P9InstrResources.td2
-rw-r--r--llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp43
-rw-r--r--llvm/lib/Target/PowerPC/PPCFastISel.cpp101
-rw-r--r--llvm/lib/Target/PowerPC/PPCFrameLowering.cpp18
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp5
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp47
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.h2
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstr64Bit.td25
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.cpp20
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.td24
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrPrefix.td2
-rw-r--r--llvm/lib/Target/PowerPC/PPCMIPeephole.cpp43
-rw-r--r--llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp14
-rw-r--r--llvm/lib/Target/PowerPC/PPCRegisterInfo.h6
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp21
-rw-r--r--llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp12
-rw-r--r--llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp56
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h3
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h4
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp2
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp30
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h5
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h4
-rw-r--r--llvm/lib/Target/RISCV/RISCV.h3
-rw-r--r--llvm/lib/Target/RISCV/RISCV.td283
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.cpp24
-rw-r--r--llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp42
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp534
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h4
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp891
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h60
-rw-r--r--llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp86
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrFormats.td7
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp132
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td41
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoD.td80
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoF.td142
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoV.td180
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td611
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td110
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td196
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZb.td254
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td82
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoZk.td203
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp3
-rw-r--r--llvm/lib/Target/RISCV/RISCVMCInstLower.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp5
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.h2
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.td35
-rw-r--r--llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp278
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedRocket.td8
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedSiFive7.td8
-rw-r--r--llvm/lib/Target/RISCV/RISCVScheduleB.td50
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.cpp68
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.h99
-rw-r--r--llvm/lib/Target/RISCV/RISCVSystemOperands.td391
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetMachine.cpp9
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp43
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h31
-rw-r--r--llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp2
-rw-r--r--llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp9
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.cpp6
-rw-r--r--llvm/lib/Target/Sparc/SparcTargetObjectFile.h4
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h4
-rw-r--r--llvm/lib/Target/SystemZ/SystemZ.h10
-rw-r--r--llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp44
-rw-r--r--llvm/lib/Target/SystemZ/SystemZAsmPrinter.h2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp10
-rw-r--r--llvm/lib/Target/SystemZ/SystemZElimCompare.cpp12
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp33
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFrameLowering.h1
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp3
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.cpp14
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.h1
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp12
-rw-r--r--llvm/lib/Target/SystemZ/SystemZLongBranch.cpp12
-rw-r--r--llvm/lib/Target/SystemZ/SystemZMCInstLower.h1
-rw-r--r--llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h9
-rw-r--r--llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp18
-rw-r--r--llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZShortenInst.cpp17
-rw-r--r--llvm/lib/Target/SystemZ/SystemZSubtarget.cpp2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZSubtarget.h2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTDC.cpp4
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp8
-rw-r--r--llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp2
-rw-r--r--llvm/lib/Target/VE/LVLGen.cpp4
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h4
-rw-r--r--llvm/lib/Target/VE/VE.h1
-rw-r--r--llvm/lib/Target/VE/VECustomDAG.cpp81
-rw-r--r--llvm/lib/Target/VE/VECustomDAG.h79
-rw-r--r--llvm/lib/Target/VE/VEISelLowering.cpp86
-rw-r--r--llvm/lib/Target/VE/VEISelLowering.h4
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.td16
-rw-r--r--llvm/lib/Target/VE/VEInstrPatternsVec.td16
-rw-r--r--llvm/lib/Target/VE/VEMCInstLower.cpp3
-rw-r--r--llvm/lib/Target/VE/VEMachineFunctionInfo.h5
-rw-r--r--llvm/lib/Target/VE/VESubtarget.h2
-rw-r--r--llvm/lib/Target/VE/VVPInstrInfo.td11
-rw-r--r--llvm/lib/Target/VE/VVPInstrPatternsVec.td85
-rw-r--r--llvm/lib/Target/VE/VVPNodes.def5
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h1
-rw-r--r--llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h1
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp64
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp5
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp222
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp8
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp3
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp6
-rw-r--r--llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp64
-rw-r--r--llvm/lib/Target/X86/MCA/X86CustomBehaviour.h47
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h2
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp13
-rw-r--r--llvm/lib/Target/X86/X86AsmPrinter.h1
-rw-r--r--llvm/lib/Target/X86/X86CallLowering.h2
-rw-r--r--llvm/lib/Target/X86/X86FastTileConfig.cpp2
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.cpp103
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.h5
-rw-r--r--llvm/lib/Target/X86/X86ISelDAGToDAG.cpp6
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp759
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h2
-rw-r--r--llvm/lib/Target/X86/X86IndirectBranchTracking.cpp48
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td7
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp92
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.h3
-rw-r--r--llvm/lib/Target/X86/X86InstructionSelector.cpp4
-rw-r--r--llvm/lib/Target/X86/X86OptimizeLEAs.cpp2
-rw-r--r--llvm/lib/Target/X86/X86PadShortFunction.cpp7
-rw-r--r--llvm/lib/Target/X86/X86PartialReduction.cpp65
-rw-r--r--llvm/lib/Target/X86/X86SchedBroadwell.td38
-rw-r--r--llvm/lib/Target/X86/X86SchedHaswell.td38
-rw-r--r--llvm/lib/Target/X86/X86SchedIceLake.td46
-rw-r--r--llvm/lib/Target/X86/X86SchedSandyBridge.td33
-rw-r--r--llvm/lib/Target/X86/X86SchedSkylakeClient.td38
-rw-r--r--llvm/lib/Target/X86/X86SchedSkylakeServer.td46
-rw-r--r--llvm/lib/Target/X86/X86Schedule.td10
-rw-r--r--llvm/lib/Target/X86/X86ScheduleAtom.td2
-rw-r--r--llvm/lib/Target/X86/X86ScheduleBdVer2.td2
-rw-r--r--llvm/lib/Target/X86/X86ScheduleBtVer2.td2
-rw-r--r--llvm/lib/Target/X86/X86ScheduleSLM.td20
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver1.td81
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver2.td81
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver3.td2
-rw-r--r--llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp8
-rw-r--r--llvm/lib/Target/X86/X86TargetMachine.cpp2
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp114
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.h4
-rw-r--r--llvm/lib/Target/XCore/XCoreFrameLowering.cpp4
-rw-r--r--llvm/lib/Target/XCore/XCoreRegisterInfo.cpp4
-rw-r--r--llvm/lib/Target/XCore/XCoreSubtarget.cpp4
-rw-r--r--llvm/lib/TextAPI/Architecture.cpp2
-rw-r--r--llvm/lib/TextAPI/PackedVersion.cpp1
-rw-r--r--llvm/lib/TextAPI/Platform.cpp116
-rw-r--r--llvm/lib/TextAPI/Target.cpp35
-rw-r--r--llvm/lib/TextAPI/TextStub.cpp27
-rw-r--r--llvm/lib/TextAPI/TextStubCommon.cpp53
-rw-r--r--llvm/lib/TextAPI/TextStubCommon.h9
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroEarly.cpp11
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroFrame.cpp12
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroInstr.h12
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroInternal.h14
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroSplit.cpp167
-rw-r--r--llvm/lib/Transforms/Coroutines/Coroutines.cpp22
-rw-r--r--llvm/lib/Transforms/IPO/AlwaysInliner.cpp18
-rw-r--r--llvm/lib/Transforms/IPO/ArgumentPromotion.cpp7
-rw-r--r--llvm/lib/Transforms/IPO/Attributor.cpp30
-rw-r--r--llvm/lib/Transforms/IPO/AttributorAttributes.cpp305
-rw-r--r--llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp9
-rw-r--r--llvm/lib/Transforms/IPO/FunctionAttrs.cpp18
-rw-r--r--llvm/lib/Transforms/IPO/FunctionSpecialization.cpp29
-rw-r--r--llvm/lib/Transforms/IPO/GlobalOpt.cpp727
-rw-r--r--llvm/lib/Transforms/IPO/IROutliner.cpp758
-rw-r--r--llvm/lib/Transforms/IPO/Inliner.cpp56
-rw-r--r--llvm/lib/Transforms/IPO/ModuleInliner.cpp9
-rw-r--r--llvm/lib/Transforms/IPO/OpenMPOpt.cpp141
-rw-r--r--llvm/lib/Transforms/IPO/PartialInlining.cpp10
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp63
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp82
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp177
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp13
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp100
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineInternal.h14
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp12
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp30
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp11
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp49
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp21
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp3
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp651
-rw-r--r--llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp16
-rw-r--r--llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp68
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp17
-rw-r--r--llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp52
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemProfiler.cpp28
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp102
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp5
-rw-r--r--llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp7
-rw-r--r--llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h10
-rw-r--r--llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp3
-rw-r--r--llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h3
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp2
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp30
-rw-r--r--llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h1
-rw-r--r--llvm/lib/Transforms/Scalar/ADCE.cpp9
-rw-r--r--llvm/lib/Transforms/Scalar/ConstantHoisting.cpp12
-rw-r--r--llvm/lib/Transforms/Scalar/ConstraintElimination.cpp87
-rw-r--r--llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp264
-rw-r--r--llvm/lib/Transforms/Scalar/EarlyCSE.cpp9
-rw-r--r--llvm/lib/Transforms/Scalar/GVN.cpp25
-rw-r--r--llvm/lib/Transforms/Scalar/IndVarSimplify.cpp11
-rw-r--r--llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp6
-rw-r--r--llvm/lib/Transforms/Scalar/JumpThreading.cpp11
-rw-r--r--llvm/lib/Transforms/Scalar/LICM.cpp33
-rw-r--r--llvm/lib/Transforms/Scalar/LoopDeletion.cpp30
-rw-r--r--llvm/lib/Transforms/Scalar/LoopFlatten.cpp510
-rw-r--r--llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/LoopInterchange.cpp263
-rw-r--r--llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp25
-rw-r--r--llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp11
-rw-r--r--llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp149
-rw-r--r--llvm/lib/Transforms/Scalar/NewGVN.cpp89
-rw-r--r--llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp1
-rw-r--r--llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp16
-rw-r--r--llvm/lib/Transforms/Scalar/SCCP.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/SROA.cpp128
-rw-r--r--llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp7
-rw-r--r--llvm/lib/Transforms/Scalar/Scalarizer.cpp5
-rw-r--r--llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp144
-rw-r--r--llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp17
-rw-r--r--llvm/lib/Transforms/Utils/BuildLibCalls.cpp44
-rw-r--r--llvm/lib/Transforms/Utils/CallGraphUpdater.cpp3
-rw-r--r--llvm/lib/Transforms/Utils/CallPromotionUtils.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/CodeExtractor.cpp244
-rw-r--r--llvm/lib/Transforms/Utils/Evaluator.cpp290
-rw-r--r--llvm/lib/Transforms/Utils/GlobalStatus.cpp8
-rw-r--r--llvm/lib/Transforms/Utils/InlineFunction.cpp12
-rw-r--r--llvm/lib/Transforms/Utils/LCSSA.cpp4
-rw-r--r--llvm/lib/Transforms/Utils/Local.cpp13
-rw-r--r--llvm/lib/Transforms/Utils/LoopPeel.cpp8
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnroll.cpp18
-rw-r--r--llvm/lib/Transforms/Utils/LoopUtils.cpp5
-rw-r--r--llvm/lib/Transforms/Utils/LoopVersioning.cpp21
-rw-r--r--llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/ModuleUtils.cpp74
-rw-r--r--llvm/lib/Transforms/Utils/SampleProfileInference.cpp273
-rw-r--r--llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp222
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp198
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp7
-rw-r--r--llvm/lib/Transforms/Utils/ValueMapper.cpp8
-rw-r--r--llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp78
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp831
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp324
-rw-r--r--llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp244
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h247
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanPredicator.h2
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp27
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h4
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanValue.h13
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp20
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp3
-rw-r--r--llvm/lib/WindowsManifest/WindowsManifestMerger.cpp2
964 files changed, 31115 insertions, 13302 deletions
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp
index 49199060786c..a8132e5abf54 100644
--- a/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -242,7 +242,7 @@ ModRefInfo AAResults::getModRefInfo(const CallBase *Call,
if (onlyReadsMemory(MRB))
Result = clearMod(Result);
- else if (doesNotReadMemory(MRB))
+ else if (onlyWritesMemory(MRB))
Result = clearRef(Result);
if (onlyAccessesArgPointees(MRB) || onlyAccessesInaccessibleOrArgMem(MRB)) {
@@ -320,7 +320,7 @@ ModRefInfo AAResults::getModRefInfo(const CallBase *Call1,
// from Call1 reading memory written by Call2.
if (onlyReadsMemory(Call1B))
Result = clearMod(Result);
- else if (doesNotReadMemory(Call1B))
+ else if (onlyWritesMemory(Call1B))
Result = clearRef(Result);
// If Call2 only access memory through arguments, accumulate the mod/ref
@@ -988,6 +988,29 @@ bool llvm::isIdentifiedFunctionLocal(const Value *V) {
return isa<AllocaInst>(V) || isNoAliasCall(V) || isNoAliasOrByValArgument(V);
}
+bool llvm::isNotVisibleOnUnwind(const Value *Object,
+ bool &RequiresNoCaptureBeforeUnwind) {
+ RequiresNoCaptureBeforeUnwind = false;
+
+ // Alloca goes out of scope on unwind.
+ if (isa<AllocaInst>(Object))
+ return true;
+
+ // Byval goes out of scope on unwind.
+ if (auto *A = dyn_cast<Argument>(Object))
+ return A->hasByValAttr();
+
+ // A noalias return is not accessible from any other code. If the pointer
+ // does not escape prior to the unwind, then the caller cannot access the
+ // memory either.
+ if (isNoAliasCall(Object)) {
+ RequiresNoCaptureBeforeUnwind = true;
+ return true;
+ }
+
+ return false;
+}
+
void llvm::getAAResultsAnalysisUsage(AnalysisUsage &AU) {
// This function needs to be in sync with llvm::createLegacyPMAAResults -- if
// more alias analyses are added to llvm::createLegacyPMAAResults, they need
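The new helper added above centralizes a question several passes ask: can this memory become visible to other code if the current frame unwinds? A minimal usage sketch (not taken from this patch) of how a caller might combine it with a capture check, where Obj is assumed to be the underlying object of a memory location and isNotCapturedBefore(...) is a hypothetical stand-in for whatever capture analysis the caller already performs:

    bool RequiresNoCaptureBeforeUnwind;
    if (isNotVisibleOnUnwind(Obj, RequiresNoCaptureBeforeUnwind) &&
        (!RequiresNoCaptureBeforeUnwind ||
         isNotCapturedBefore(Obj, UnwindPoint))) {  // hypothetical helper
      // No unwinder or exception handler can observe Obj here, so stores to
      // it do not have to be treated as visible on the unwind path.
    }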
diff --git a/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp b/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
index 0c097b2fa302..1577f1eb70b1 100644
--- a/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
+++ b/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
@@ -142,13 +142,13 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) {
for (SetVector<Value *>::iterator I1 = Pointers.begin(), E = Pointers.end();
I1 != E; ++I1) {
auto I1Size = LocationSize::afterPointer();
- Type *I1ElTy = cast<PointerType>((*I1)->getType())->getElementType();
+ Type *I1ElTy = (*I1)->getType()->getPointerElementType();
if (I1ElTy->isSized())
I1Size = LocationSize::precise(DL.getTypeStoreSize(I1ElTy));
for (SetVector<Value *>::iterator I2 = Pointers.begin(); I2 != I1; ++I2) {
auto I2Size = LocationSize::afterPointer();
- Type *I2ElTy = cast<PointerType>((*I2)->getType())->getElementType();
+ Type *I2ElTy = (*I2)->getType()->getPointerElementType();
if (I2ElTy->isSized())
I2Size = LocationSize::precise(DL.getTypeStoreSize(I2ElTy));
@@ -233,7 +233,7 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) {
for (CallBase *Call : Calls) {
for (auto Pointer : Pointers) {
auto Size = LocationSize::afterPointer();
- Type *ElTy = cast<PointerType>(Pointer->getType())->getElementType();
+ Type *ElTy = Pointer->getType()->getPointerElementType();
if (ElTy->isSized())
Size = LocationSize::precise(DL.getTypeStoreSize(ElTy));
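The three hunks above apply the same migration: instead of casting a pointer's type to PointerType and reading its element type, the code asks the Type directly. A minimal before/after sketch, assuming Ptr is a Value * of pointer type; this is part of the broader move toward opaque pointers, where PointerType stops carrying a pointee type:

    Type *PointeeOld = cast<PointerType>(Ptr->getType())->getElementType(); // before
    Type *PointeeNew = Ptr->getType()->getPointerElementType();             // after
    // Both return the same pointee type today; the second form avoids
    // depending on PointerType itself knowing its element type.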
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 5f1bf2001d47..b4c985962837 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -779,7 +779,7 @@ FunctionModRefBehavior BasicAAResult::getModRefBehavior(const CallBase *Call) {
// than that.
if (Call->onlyReadsMemory())
Min = FMRB_OnlyReadsMemory;
- else if (Call->doesNotReadMemory())
+ else if (Call->onlyWritesMemory())
Min = FMRB_OnlyWritesMemory;
if (Call->onlyAccessesArgMemory())
@@ -812,7 +812,7 @@ FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) {
// If the function declares it only reads memory, go with that.
if (F->onlyReadsMemory())
Min = FMRB_OnlyReadsMemory;
- else if (F->doesNotReadMemory())
+ else if (F->onlyWritesMemory())
Min = FMRB_OnlyWritesMemory;
if (F->onlyAccessesArgMemory())
@@ -972,7 +972,7 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call,
continue;
}
// Operand aliases 'Object' but call only writes into it.
- if (Call->doesNotReadMemory(OperandNo)) {
+ if (Call->onlyWritesMemory(OperandNo)) {
Result = setMod(Result);
continue;
}
@@ -1020,9 +1020,9 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call,
getBestAAResults().alias(MemoryLocation::getForDest(Inst), Loc, AAQI);
// It's also possible for Loc to alias both src and dest, or neither.
ModRefInfo rv = ModRefInfo::NoModRef;
- if (SrcAA != AliasResult::NoAlias)
+ if (SrcAA != AliasResult::NoAlias || Call->hasReadingOperandBundles())
rv = setRef(rv);
- if (DestAA != AliasResult::NoAlias)
+ if (DestAA != AliasResult::NoAlias || Call->hasClobberingOperandBundles())
rv = setMod(rv);
return rv;
}
@@ -1248,8 +1248,8 @@ AliasResult BasicAAResult::aliasGEP(
else
GCD = APIntOps::GreatestCommonDivisor(GCD, ScaleForGCD.abs());
- ConstantRange CR =
- computeConstantRange(Index.Val.V, true, &AC, Index.CxtI);
+ ConstantRange CR = computeConstantRange(Index.Val.V, /* ForSigned */ false,
+ true, &AC, Index.CxtI);
KnownBits Known =
computeKnownBits(Index.Val.V, DL, 0, &AC, Index.CxtI, DT);
CR = CR.intersectWith(
diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
index 856d7e90acb2..ffb80134749a 100644
--- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
@@ -42,6 +42,7 @@
#include <cassert>
#include <cstdint>
#include <iterator>
+#include <map>
#include <utility>
using namespace llvm;
diff --git a/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp b/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
index 9467bb3c9b2d..090dccc53b6e 100644
--- a/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
@@ -63,7 +63,7 @@ using namespace llvm::cflaa;
CFLSteensAAResult::CFLSteensAAResult(
std::function<const TargetLibraryInfo &(Function &F)> GetTLI)
- : AAResultBase(), GetTLI(std::move(GetTLI)) {}
+ : GetTLI(std::move(GetTLI)) {}
CFLSteensAAResult::CFLSteensAAResult(CFLSteensAAResult &&Arg)
: AAResultBase(std::move(Arg)), GetTLI(std::move(Arg.GetTLI)) {}
CFLSteensAAResult::~CFLSteensAAResult() = default;
diff --git a/llvm/lib/Analysis/CallGraphSCCPass.cpp b/llvm/lib/Analysis/CallGraphSCCPass.cpp
index f2e5eab72bf2..930cb13c0cb3 100644
--- a/llvm/lib/Analysis/CallGraphSCCPass.cpp
+++ b/llvm/lib/Analysis/CallGraphSCCPass.cpp
@@ -61,7 +61,7 @@ class CGPassManager : public ModulePass, public PMDataManager {
public:
static char ID;
- explicit CGPassManager() : ModulePass(ID), PMDataManager() {}
+ explicit CGPassManager() : ModulePass(ID) {}
/// Execute all of the passes scheduled for execution. Keep track of
/// whether any of the passes modifies the module, and if so, return true.
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
index 9b45f455be08..ba8462e659d5 100644
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -75,7 +75,7 @@ bool CaptureTracker::isDereferenceableOrNull(Value *O, const DataLayout &DL) {
namespace {
struct SimpleCaptureTracker : public CaptureTracker {
explicit SimpleCaptureTracker(bool ReturnCaptures)
- : ReturnCaptures(ReturnCaptures), Captured(false) {}
+ : ReturnCaptures(ReturnCaptures) {}
void tooManyUses() override { Captured = true; }
@@ -89,7 +89,7 @@ namespace {
bool ReturnCaptures;
- bool Captured;
+ bool Captured = false;
};
/// Only find pointer captures which happen before the given instruction. Uses
@@ -101,7 +101,7 @@ namespace {
CapturesBefore(bool ReturnCaptures, const Instruction *I,
const DominatorTree *DT, bool IncludeI, const LoopInfo *LI)
: BeforeHere(I), DT(DT), ReturnCaptures(ReturnCaptures),
- IncludeI(IncludeI), Captured(false), LI(LI) {}
+ IncludeI(IncludeI), LI(LI) {}
void tooManyUses() override { Captured = true; }
@@ -139,7 +139,7 @@ namespace {
bool ReturnCaptures;
bool IncludeI;
- bool Captured;
+ bool Captured = false;
const LoopInfo *LI;
};
@@ -155,7 +155,7 @@ namespace {
struct EarliestCaptures : public CaptureTracker {
EarliestCaptures(bool ReturnCaptures, Function &F, const DominatorTree &DT)
- : DT(DT), ReturnCaptures(ReturnCaptures), Captured(false), F(F) {}
+ : DT(DT), ReturnCaptures(ReturnCaptures), F(F) {}
void tooManyUses() override {
Captured = true;
@@ -199,7 +199,7 @@ namespace {
bool ReturnCaptures;
- bool Captured;
+ bool Captured = false;
Function &F;
};
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 922b38e92785..7cf69f613c66 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -106,11 +106,8 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
"Invalid constantexpr bitcast!");
// Catch the obvious splat cases.
- if (C->isNullValue() && !DestTy->isX86_MMXTy() && !DestTy->isX86_AMXTy())
- return Constant::getNullValue(DestTy);
- if (C->isAllOnesValue() && !DestTy->isX86_MMXTy() && !DestTy->isX86_AMXTy() &&
- !DestTy->isPtrOrPtrVectorTy()) // Don't get ones for ptr types!
- return Constant::getAllOnesValue(DestTy);
+ if (Constant *Res = ConstantFoldLoadFromUniformValue(C, DestTy))
+ return Res;
if (auto *VTy = dyn_cast<VectorType>(C->getType())) {
// Handle a vector->scalar integer/fp cast.
@@ -362,16 +359,8 @@ Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy,
// Catch the obvious splat cases (since all-zeros can coerce non-integral
// pointers legally).
- if (C->isNullValue() && !DestTy->isX86_MMXTy() && !DestTy->isX86_AMXTy())
- return Constant::getNullValue(DestTy);
- if (C->isAllOnesValue() &&
- (DestTy->isIntegerTy() || DestTy->isFloatingPointTy() ||
- DestTy->isVectorTy()) &&
- !DestTy->isX86_AMXTy() && !DestTy->isX86_MMXTy() &&
- !DestTy->isPtrOrPtrVectorTy())
- // Get ones when the input is trivial, but
- // only for supported types inside getAllOnesValue.
- return Constant::getAllOnesValue(DestTy);
+ if (Constant *Res = ConstantFoldLoadFromUniformValue(C, DestTy))
+ return Res;
// If the type sizes are the same and a cast is legal, just directly
// cast the constant.
@@ -410,6 +399,12 @@ Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy,
} while (ElemC && DL.getTypeSizeInBits(ElemC->getType()).isZero());
C = ElemC;
} else {
+ // For non-byte-sized vector elements, the first element is not
+ // necessarily located at the vector base address.
+ if (auto *VT = dyn_cast<VectorType>(SrcTy))
+ if (!DL.typeSizeEqualsStoreSize(VT->getElementType()))
+ return nullptr;
+
C = C->getAggregateElement(0u);
}
} while (C);
@@ -558,23 +553,16 @@ Constant *FoldReinterpretLoadFromConst(Constant *C, Type *LoadTy,
// If this isn't an integer load we can't fold it directly.
if (!IntType) {
- // If this is a float/double load, we can try folding it as an int32/64 load
- // and then bitcast the result. This can be useful for union cases. Note
+ // If this is a non-integer load, we can try folding it as an int load and
+ // then bitcast the result. This can be useful for union cases. Note
// that address spaces don't matter here since we're not going to result in
// an actual new load.
- Type *MapTy;
- if (LoadTy->isHalfTy())
- MapTy = Type::getInt16Ty(C->getContext());
- else if (LoadTy->isFloatTy())
- MapTy = Type::getInt32Ty(C->getContext());
- else if (LoadTy->isDoubleTy())
- MapTy = Type::getInt64Ty(C->getContext());
- else if (LoadTy->isVectorTy()) {
- MapTy = PointerType::getIntNTy(
- C->getContext(), DL.getTypeSizeInBits(LoadTy).getFixedSize());
- } else
+ if (!LoadTy->isFloatingPointTy() && !LoadTy->isPointerTy() &&
+ !LoadTy->isVectorTy())
return nullptr;
+ Type *MapTy = Type::getIntNTy(
+ C->getContext(), DL.getTypeSizeInBits(LoadTy).getFixedSize());
if (Constant *Res = FoldReinterpretLoadFromConst(C, MapTy, Offset, DL)) {
if (Res->isNullValue() && !LoadTy->isX86_MMXTy() &&
!LoadTy->isX86_AMXTy())
@@ -680,9 +668,21 @@ Constant *llvm::ConstantFoldLoadFromConst(Constant *C, Type *Ty,
if (Constant *Result = ConstantFoldLoadThroughBitcast(AtOffset, Ty, DL))
return Result;
+ // Explicitly check for out-of-bounds access, so we return undef even if the
+ // constant is a uniform value.
+ TypeSize Size = DL.getTypeAllocSize(C->getType());
+ if (!Size.isScalable() && Offset.sge(Size.getFixedSize()))
+ return UndefValue::get(Ty);
+
+ // Try an offset-independent fold of a uniform value.
+ if (Constant *Result = ConstantFoldLoadFromUniformValue(C, Ty))
+ return Result;
+
// Try hard to fold loads from bitcasted strange and non-type-safe things.
if (Offset.getMinSignedBits() <= 64)
- return FoldReinterpretLoadFromConst(C, Ty, Offset.getSExtValue(), DL);
+ if (Constant *Result =
+ FoldReinterpretLoadFromConst(C, Ty, Offset.getSExtValue(), DL))
+ return Result;
return nullptr;
}
@@ -704,15 +704,13 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
Offset, DL))
return Result;
- // If this load comes from anywhere in a constant global, and if the global
- // is all undef or zero, we know what it loads.
+ // If this load comes from anywhere in a uniform constant global, the value
+ // is always the same, regardless of the loaded offset.
if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C))) {
if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
- if (GV->getInitializer()->isNullValue() && !Ty->isX86_MMXTy() &&
- !Ty->isX86_AMXTy())
- return Constant::getNullValue(Ty);
- if (isa<UndefValue>(GV->getInitializer()))
- return UndefValue::get(Ty);
+ if (Constant *Res =
+ ConstantFoldLoadFromUniformValue(GV->getInitializer(), Ty))
+ return Res;
}
}
@@ -725,6 +723,19 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
return ConstantFoldLoadFromConstPtr(C, Ty, Offset, DL);
}
+Constant *llvm::ConstantFoldLoadFromUniformValue(Constant *C, Type *Ty) {
+ if (isa<PoisonValue>(C))
+ return PoisonValue::get(Ty);
+ if (isa<UndefValue>(C))
+ return UndefValue::get(Ty);
+ if (C->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy())
+ return Constant::getNullValue(Ty);
+ if (C->isAllOnesValue() &&
+ (Ty->isIntOrIntVectorTy() || Ty->isFPOrFPVectorTy()))
+ return Constant::getAllOnesValue(Ty);
+ return nullptr;
+}
+
namespace {
/// One of Op0/Op1 is a constant expression.
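ConstantFoldLoadFromUniformValue factors out the uniform-initializer folds that were previously open-coded at each call site: poison and undef propagate, an all-zero constant folds to the null value of the load type, and an all-ones constant folds for integer and FP (vector) types. A small usage sketch, assuming the declaration lives in ConstantFolding.h; the demo function is illustrative:

#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Any i32 load out of a zeroinitializer aggregate folds to i32 0, regardless
// of the offset within the aggregate.
static Constant *demoUniformFold(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  Constant *Zeros = Constant::getNullValue(ArrayType::get(I32, 8));
  return ConstantFoldLoadFromUniformValue(Zeros, I32); // i32 0
}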
@@ -930,7 +941,7 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
if (auto *GV = dyn_cast<GlobalValue>(Ptr))
SrcElemTy = GV->getValueType();
else if (!PTy->isOpaque())
- SrcElemTy = PTy->getElementType();
+ SrcElemTy = PTy->getNonOpaquePointerElementType();
else
SrcElemTy = Type::getInt8Ty(Ptr->getContext());
@@ -1171,10 +1182,11 @@ Constant *llvm::ConstantFoldInstOperands(Instruction *I,
return ConstantFoldInstOperandsImpl(I, I->getOpcode(), Ops, DL, TLI);
}
-Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
+Constant *llvm::ConstantFoldCompareInstOperands(unsigned IntPredicate,
Constant *Ops0, Constant *Ops1,
const DataLayout &DL,
const TargetLibraryInfo *TLI) {
+ CmpInst::Predicate Predicate = (CmpInst::Predicate)IntPredicate;
// fold: icmp (inttoptr x), null -> icmp x, 0
// fold: icmp null, (inttoptr x) -> icmp 0, x
// fold: icmp (ptrtoint x), 0 -> icmp x, null
@@ -1248,10 +1260,30 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
Predicate == ICmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
return ConstantFoldBinaryOpOperands(OpC, LHS, RHS, DL);
}
+
+ // Convert pointer comparison (base+offset1) pred (base+offset2) into
+ // offset1 pred offset2, for the case where the offset is inbounds. This
+ // only works for equality and unsigned comparison, as inbounds permits
+ // crossing the sign boundary. However, the offset comparison itself is
+ // signed.
+ if (Ops0->getType()->isPointerTy() && !ICmpInst::isSigned(Predicate)) {
+ unsigned IndexWidth = DL.getIndexTypeSizeInBits(Ops0->getType());
+ APInt Offset0(IndexWidth, 0);
+ Value *Stripped0 =
+ Ops0->stripAndAccumulateInBoundsConstantOffsets(DL, Offset0);
+ APInt Offset1(IndexWidth, 0);
+ Value *Stripped1 =
+ Ops1->stripAndAccumulateInBoundsConstantOffsets(DL, Offset1);
+ if (Stripped0 == Stripped1)
+ return ConstantExpr::getCompare(
+ ICmpInst::getSignedPredicate(Predicate),
+ ConstantInt::get(CE0->getContext(), Offset0),
+ ConstantInt::get(CE0->getContext(), Offset1));
+ }
} else if (isa<ConstantExpr>(Ops1)) {
// If RHS is a constant expression, but the left side isn't, swap the
// operands and try again.
- Predicate = ICmpInst::getSwappedPredicate((ICmpInst::Predicate)Predicate);
+ Predicate = ICmpInst::getSwappedPredicate(Predicate);
return ConstantFoldCompareInstOperands(Predicate, Ops1, Ops0, DL, TLI);
}
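The new fold above reduces a comparison of two constant GEPs off the same stripped base to a comparison of their accumulated inbounds offsets, and rewrites equality/unsigned predicates to the signed form because the offsets themselves are signed quantities. The plain-C++ intuition (illustrative, not LLVM API):

#include <cstdint>

// icmp ult (gep inbounds %p, O0), (gep inbounds %p, O1)  ==>  icmp slt O0, O1
static bool foldedPtrCompare(int64_t Offset0, int64_t Offset1) {
  return Offset0 < Offset1; // signed compare of the stripped offsets
}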
@@ -1347,23 +1379,6 @@ Constant *llvm::ConstantFoldCastOperand(unsigned Opcode, Constant *C,
}
}
-Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
- ConstantExpr *CE,
- Type *Ty,
- const DataLayout &DL) {
- if (!CE->getOperand(1)->isNullValue())
- return nullptr; // Do not allow stepping over the value!
-
- // Loop over all of the operands, tracking down which value we are
- // addressing.
- for (unsigned i = 2, e = CE->getNumOperands(); i != e; ++i) {
- C = C->getAggregateElement(CE->getOperand(i));
- if (!C)
- return nullptr;
- }
- return ConstantFoldLoadThroughBitcast(C, Ty, DL);
-}
-
//===----------------------------------------------------------------------===//
// Constant Folding for Calls
//
@@ -2463,36 +2478,21 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
!getConstIntOrUndef(Operands[1], C1))
return nullptr;
- unsigned BitWidth = Ty->getScalarSizeInBits();
switch (IntrinsicID) {
default: break;
case Intrinsic::smax:
- if (!C0 && !C1)
- return UndefValue::get(Ty);
- if (!C0 || !C1)
- return ConstantInt::get(Ty, APInt::getSignedMaxValue(BitWidth));
- return ConstantInt::get(Ty, C0->sgt(*C1) ? *C0 : *C1);
-
case Intrinsic::smin:
- if (!C0 && !C1)
- return UndefValue::get(Ty);
- if (!C0 || !C1)
- return ConstantInt::get(Ty, APInt::getSignedMinValue(BitWidth));
- return ConstantInt::get(Ty, C0->slt(*C1) ? *C0 : *C1);
-
case Intrinsic::umax:
- if (!C0 && !C1)
- return UndefValue::get(Ty);
- if (!C0 || !C1)
- return ConstantInt::get(Ty, APInt::getMaxValue(BitWidth));
- return ConstantInt::get(Ty, C0->ugt(*C1) ? *C0 : *C1);
-
case Intrinsic::umin:
if (!C0 && !C1)
return UndefValue::get(Ty);
if (!C0 || !C1)
- return ConstantInt::get(Ty, APInt::getMinValue(BitWidth));
- return ConstantInt::get(Ty, C0->ult(*C1) ? *C0 : *C1);
+ return MinMaxIntrinsic::getSaturationPoint(IntrinsicID, Ty);
+ return ConstantInt::get(
+ Ty, ICmpInst::compare(*C0, *C1,
+ MinMaxIntrinsic::getPredicate(IntrinsicID))
+ ? *C0
+ : *C1);
case Intrinsic::usub_with_overflow:
case Intrinsic::ssub_with_overflow:
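The four saturating min/max cases now share one path: with both operands undef the result is undef, with one undef operand the result is the intrinsic's saturation point, and with two constants the intrinsic's predicate picks the winner. Restated as a standalone helper mirroring the hunk above (the MinMaxIntrinsic and ICmpInst calls are the ones used there):

static Constant *foldConstantMinMax(Intrinsic::ID IID, Type *Ty,
                                    const APInt *C0, const APInt *C1) {
  if (!C0 && !C1)
    return UndefValue::get(Ty);                          // both undef
  if (!C0 || !C1)
    return MinMaxIntrinsic::getSaturationPoint(IID, Ty); // one undef
  return ConstantInt::get(
      Ty, ICmpInst::compare(*C0, *C1, MinMaxIntrinsic::getPredicate(IID))
              ? *C0
              : *C1);
}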
@@ -2572,9 +2572,9 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
case Intrinsic::ctlz:
assert(C1 && "Must be constant int");
- // cttz(0, 1) and ctlz(0, 1) are undef.
+ // cttz(0, 1) and ctlz(0, 1) are poison.
if (C1->isOne() && (!C0 || C0->isZero()))
- return UndefValue::get(Ty);
+ return PoisonValue::get(Ty);
if (!C0)
return Constant::getNullValue(Ty);
if (IntrinsicID == Intrinsic::cttz)
@@ -2583,13 +2583,15 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
return ConstantInt::get(Ty, C0->countLeadingZeros());
case Intrinsic::abs:
- // Undef or minimum val operand with poison min --> undef
assert(C1 && "Must be constant int");
+ assert((C1->isOne() || C1->isZero()) && "Must be 0 or 1");
+
+ // Undef or minimum val operand with poison min --> undef
if (C1->isOne() && (!C0 || C0->isMinSignedValue()))
return UndefValue::get(Ty);
// Undef operand with no poison min --> 0 (sign bit must be clear)
- if (C1->isZero() && !C0)
+ if (!C0)
return Constant::getNullValue(Ty);
return ConstantInt::get(Ty, C0->abs());
diff --git a/llvm/lib/Analysis/ConstraintSystem.cpp b/llvm/lib/Analysis/ConstraintSystem.cpp
index 9739c6af5769..773f71ada0ee 100644
--- a/llvm/lib/Analysis/ConstraintSystem.cpp
+++ b/llvm/lib/Analysis/ConstraintSystem.cpp
@@ -142,7 +142,7 @@ bool ConstraintSystem::mayHaveSolution() {
return HasSolution;
}
-bool ConstraintSystem::isConditionImplied(SmallVector<int64_t, 8> R) {
+bool ConstraintSystem::isConditionImplied(SmallVector<int64_t, 8> R) const {
// If all variable coefficients are 0, we have 'C >= 0'. If the constant is >=
// 0, R is always true, regardless of the system.
if (all_of(makeArrayRef(R).drop_front(1), [](int64_t C) { return C == 0; }))
diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp
index f407ec0d017a..326bacad01fe 100644
--- a/llvm/lib/Analysis/CostModel.cpp
+++ b/llvm/lib/Analysis/CostModel.cpp
@@ -50,7 +50,7 @@ namespace {
public:
static char ID; // Class identification, replacement for typeinfo
- CostModelAnalysis() : FunctionPass(ID), F(nullptr), TTI(nullptr) {
+ CostModelAnalysis() : FunctionPass(ID) {
initializeCostModelAnalysisPass(
*PassRegistry::getPassRegistry());
}
@@ -69,9 +69,9 @@ namespace {
void print(raw_ostream &OS, const Module*) const override;
/// The function that we analyze.
- Function *F;
+ Function *F = nullptr;
/// Target information.
- const TargetTransformInfo *TTI;
+ const TargetTransformInfo *TTI = nullptr;
};
} // End of anonymous namespace
diff --git a/llvm/lib/Analysis/DDG.cpp b/llvm/lib/Analysis/DDG.cpp
index da5de75a038c..7e1357959a3f 100644
--- a/llvm/lib/Analysis/DDG.cpp
+++ b/llvm/lib/Analysis/DDG.cpp
@@ -106,7 +106,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGNode &N) {
//===--------------------------------------------------------------------===//
SimpleDDGNode::SimpleDDGNode(Instruction &I)
- : DDGNode(NodeKind::SingleInstruction), InstList() {
+ : DDGNode(NodeKind::SingleInstruction) {
assert(InstList.empty() && "Expected empty list.");
InstList.push_back(&I);
}
diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
index 31b2dafa29b4..4a792fce51d1 100644
--- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
@@ -11,8 +11,10 @@
//
//===----------------------------------------------------------------------===//
#include "llvm/Config/config.h"
+#include "llvm/Support/Casting.h"
#if defined(LLVM_HAVE_TF_API)
+#include "llvm/ADT/BitVector.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineSizeEstimatorAnalysis.h"
#include "llvm/Analysis/MLInlineAdvisor.h"
@@ -111,7 +113,7 @@ private:
StringRef LogFileName;
const ModelUnderTrainingRunner *const MUTR;
std::unique_ptr<Logger> L;
- std::vector<bool> Effects;
+ BitVector Effects;
/// There's at least one output. We'll set this to a different value if MUTR
/// is available.
size_t OutputCount = 1;
@@ -150,7 +152,7 @@ public:
DevelopmentModeMLInlineAdvisor(
Module &M, ModuleAnalysisManager &MAM,
std::unique_ptr<MLModelRunner> ModelRunner,
- std::function<bool(CallBase &)> GetDefaultAdvice, bool IsDoingInference,
+ std::function<bool(CallBase &)> GetDefaultAdvice,
std::unique_ptr<TrainingLogger> Logger);
size_t getTotalSizeEstimate();
@@ -341,10 +343,11 @@ void TrainingLogger::print() {
DevelopmentModeMLInlineAdvisor::DevelopmentModeMLInlineAdvisor(
Module &M, ModuleAnalysisManager &MAM,
std::unique_ptr<MLModelRunner> ModelRunner,
- std::function<bool(CallBase &)> GetDefaultAdvice, bool IsDoingInference,
+ std::function<bool(CallBase &)> GetDefaultAdvice,
std::unique_ptr<TrainingLogger> Logger)
: MLInlineAdvisor(M, MAM, std::move(ModelRunner)),
- GetDefaultAdvice(GetDefaultAdvice), IsDoingInference(IsDoingInference),
+ GetDefaultAdvice(GetDefaultAdvice),
+ IsDoingInference(isa<ModelUnderTrainingRunner>(getModelRunner())),
Logger(std::move(Logger)),
InitialNativeSize(isLogging() ? getTotalSizeEstimate() : 0),
CurrentNativeSize(InitialNativeSize) {
@@ -410,8 +413,6 @@ size_t DevelopmentModeMLInlineAdvisor::getTotalSizeEstimate() {
for (auto &F : M) {
if (F.isDeclaration())
continue;
- if (isFunctionDeleted(&F))
- continue;
Ret += *getNativeSizeEstimate(F);
}
return Ret;
@@ -422,30 +423,20 @@ std::unique_ptr<InlineAdvisor> llvm::getDevelopmentModeAdvisor(
std::function<bool(CallBase &)> GetDefaultAdvice) {
auto &Ctx = M.getContext();
std::unique_ptr<MLModelRunner> Runner;
- ModelUnderTrainingRunner *MUTRPtr = nullptr;
- bool IsDoingInference = false;
if (TFModelUnderTrainingPath.empty())
Runner.reset(new NoInferenceModelRunner(Ctx, getInputFeatures()));
- else {
- std::unique_ptr<ModelUnderTrainingRunner> MUTR;
- if (auto MaybeOutputSpecs = loadOutputSpecs(
- Ctx, DecisionName, TFModelUnderTrainingPath, TFOutputSpecOverride))
- MUTR = std::make_unique<ModelUnderTrainingRunner>(
- Ctx, TFModelUnderTrainingPath, getInputFeatures(), *MaybeOutputSpecs);
- if (!MUTR || !MUTR->isValid()) {
- Ctx.emitError("Could not load the policy model from the provided path");
- return nullptr;
- }
- IsDoingInference = true;
- MUTRPtr = MUTR.get();
- Runner = std::move(MUTR);
- }
+ else
+ Runner = ModelUnderTrainingRunner::createAndEnsureValid(
+ Ctx, TFModelUnderTrainingPath, DecisionName, getInputFeatures(),
+ TFOutputSpecOverride);
+ if (!Runner)
+ return nullptr;
std::unique_ptr<TrainingLogger> Logger;
if (!TrainingLog.empty())
- Logger = std::make_unique<TrainingLogger>(TrainingLog, MUTRPtr);
+ Logger = std::make_unique<TrainingLogger>(
+ TrainingLog, dyn_cast<ModelUnderTrainingRunner>(Runner.get()));
return std::make_unique<DevelopmentModeMLInlineAdvisor>(
- M, MAM, std::move(Runner), GetDefaultAdvice, IsDoingInference,
- std::move(Logger));
+ M, MAM, std::move(Runner), GetDefaultAdvice, std::move(Logger));
}
#endif // defined(LLVM_HAVE_TF_API)
diff --git a/llvm/lib/Analysis/DivergenceAnalysis.cpp b/llvm/lib/Analysis/DivergenceAnalysis.cpp
index 7426d0c07592..39e80c2ad51c 100644
--- a/llvm/lib/Analysis/DivergenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DivergenceAnalysis.cpp
@@ -130,7 +130,7 @@ bool DivergenceAnalysisImpl::inRegion(const Instruction &I) const {
}
bool DivergenceAnalysisImpl::inRegion(const BasicBlock &BB) const {
- return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB);
+ return RegionLoop ? RegionLoop->contains(&BB) : (BB.getParent() == &F);
}
void DivergenceAnalysisImpl::pushUsers(const Value &V) {
@@ -348,7 +348,7 @@ DivergenceInfo::DivergenceInfo(Function &F, const DominatorTree &DT,
const PostDominatorTree &PDT, const LoopInfo &LI,
const TargetTransformInfo &TTI,
bool KnownReducible)
- : F(F), ContainsIrreducible(false) {
+ : F(F) {
if (!KnownReducible) {
using RPOTraversal = ReversePostOrderTraversal<const Function *>;
RPOTraversal FuncRPOT(&F);
diff --git a/llvm/lib/Analysis/DomPrinter.cpp b/llvm/lib/Analysis/DomPrinter.cpp
index ebbe0d3e2c5f..6088de53028d 100644
--- a/llvm/lib/Analysis/DomPrinter.cpp
+++ b/llvm/lib/Analysis/DomPrinter.cpp
@@ -80,6 +80,19 @@ struct DOTGraphTraits<PostDominatorTree*>
};
}
+PreservedAnalyses DomTreePrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ WriteDOTGraphToFile(F, &AM.getResult<DominatorTreeAnalysis>(F), "dom", false);
+ return PreservedAnalyses::all();
+}
+
+PreservedAnalyses DomTreeOnlyPrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ WriteDOTGraphToFile(F, &AM.getResult<DominatorTreeAnalysis>(F), "domonly",
+ true);
+ return PreservedAnalyses::all();
+}
+
void DominatorTree::viewGraph(const Twine &Name, const Twine &Title) {
#ifndef NDEBUG
ViewGraph(this, Name, false, Title);
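DomTreePrinterPass and DomTreeOnlyPrinterPass give the existing DOT writers a new-pass-manager entry point. A minimal sketch of scheduling them, assuming the usual PassBuilder and analysis-manager boilerplate is already in place:

// Writes one DOT file per function: full dominator tree ("dom" prefix) and the
// body-less variant ("domonly" prefix), per WriteDOTGraphToFile above.
static void addDomDotPrinters(FunctionPassManager &FPM) {
  FPM.addPass(DomTreePrinterPass());
  FPM.addPass(DomTreeOnlyPrinterPass());
}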
diff --git a/llvm/lib/Analysis/DominanceFrontier.cpp b/llvm/lib/Analysis/DominanceFrontier.cpp
index 14e6965f1259..a8806fe5a480 100644
--- a/llvm/lib/Analysis/DominanceFrontier.cpp
+++ b/llvm/lib/Analysis/DominanceFrontier.cpp
@@ -37,7 +37,7 @@ INITIALIZE_PASS_END(DominanceFrontierWrapperPass, "domfrontier",
"Dominance Frontier Construction", true, true)
DominanceFrontierWrapperPass::DominanceFrontierWrapperPass()
- : FunctionPass(ID), DF() {
+ : FunctionPass(ID) {
initializeDominanceFrontierWrapperPassPass(*PassRegistry::getPassRegistry());
}
diff --git a/llvm/lib/Analysis/GlobalsModRef.cpp b/llvm/lib/Analysis/GlobalsModRef.cpp
index d00a7c944f10..6869530148c5 100644
--- a/llvm/lib/Analysis/GlobalsModRef.cpp
+++ b/llvm/lib/Analysis/GlobalsModRef.cpp
@@ -102,7 +102,7 @@ class GlobalsAAResult::FunctionInfo {
"Insufficient low bits to store our flag and ModRef info.");
public:
- FunctionInfo() : Info() {}
+ FunctionInfo() {}
~FunctionInfo() {
delete Info.getPointer();
}
@@ -401,14 +401,14 @@ bool GlobalsAAResult::AnalyzeUsesOfPointer(Value *V,
/// AnalyzeIndirectGlobalMemory - We found a non-address-taken global variable
/// which holds a pointer type. See if the global always points to non-aliased
-/// heap memory: that is, all initializers of the globals are allocations, and
-/// those allocations have no use other than initialization of the global.
+/// heap memory: that is, all initializers of the globals store a value known
+/// to be obtained via a noalias return function call which has no other use.
/// Further, all loads out of GV must directly use the memory, not store the
/// pointer somewhere. If this is true, we consider the memory pointed to by
/// GV to be owned by GV and can disambiguate other pointers from it.
bool GlobalsAAResult::AnalyzeIndirectGlobalMemory(GlobalVariable *GV) {
// Keep track of values related to the allocation of the memory, f.e. the
- // value produced by the malloc call and any casts.
+ // value produced by the noalias call and any casts.
std::vector<Value *> AllocRelatedValues;
// If the initializer is a valid pointer, bail.
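For context, the kind of global this analysis accepts, sketched at the C level (illustrative; after this change the stored value only has to come from a noalias-returning call, not specifically from a recognized allocation function):

extern int *make_buffer() __attribute__((malloc)); // noalias return value
static int *g;                      // non-address-taken global of pointer type
void init() { g = make_buffer(); }  // the only stores are noalias-call results
int read_indirect() { return *g; }  // loads use the pointed-to memory directly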
@@ -438,7 +438,7 @@ bool GlobalsAAResult::AnalyzeIndirectGlobalMemory(GlobalVariable *GV) {
// Check the value being stored.
Value *Ptr = getUnderlyingObject(SI->getOperand(0));
- if (!isAllocLikeFn(Ptr, &GetTLI(*SI->getFunction())))
+ if (!isNoAliasCall(Ptr))
return false; // Too hard to analyze.
// Analyze all uses of the allocation. If any of them are used in a
@@ -963,7 +963,7 @@ ModRefInfo GlobalsAAResult::getModRefInfo(const CallBase *Call,
GlobalsAAResult::GlobalsAAResult(
const DataLayout &DL,
std::function<const TargetLibraryInfo &(Function &F)> GetTLI)
- : AAResultBase(), DL(DL), GetTLI(std::move(GetTLI)) {}
+ : DL(DL), GetTLI(std::move(GetTLI)) {}
GlobalsAAResult::GlobalsAAResult(GlobalsAAResult &&Arg)
: AAResultBase(std::move(Arg)), DL(Arg.DL), GetTLI(std::move(Arg.GetTLI)),
diff --git a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp
index 2ec6cbeabda2..d2f0c57f6dab 100644
--- a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp
+++ b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp
@@ -23,12 +23,24 @@
using namespace llvm;
using namespace IRSimilarity;
+namespace llvm {
cl::opt<bool>
DisableBranches("no-ir-sim-branch-matching", cl::init(false),
cl::ReallyHidden,
cl::desc("disable similarity matching, and outlining, "
"across branches for debugging purposes."));
+cl::opt<bool>
+ DisableIndirectCalls("no-ir-sim-indirect-calls", cl::init(false),
+ cl::ReallyHidden,
+ cl::desc("disable outlining indirect calls."));
+
+cl::opt<bool>
+ MatchCallsByName("ir-sim-calls-by-name", cl::init(false), cl::ReallyHidden,
+ cl::desc("only allow matching call instructions if the "
+ "name and type signature match."));
+} // namespace llvm
+
IRInstructionData::IRInstructionData(Instruction &I, bool Legality,
IRInstructionDataList &IDList)
: Inst(&I), Legal(Legality), IDL(&IDList) {
@@ -57,10 +69,16 @@ void IRInstructionData::initializeInstruction() {
OperVals.push_back(OI.get());
}
+
+ // We capture the incoming BasicBlocks as values as well as the incoming
+ // Values in order to check for structural similarity.
+ if (PHINode *PN = dyn_cast<PHINode>(Inst))
+ for (BasicBlock *BB : PN->blocks())
+ OperVals.push_back(BB);
}
IRInstructionData::IRInstructionData(IRInstructionDataList &IDList)
- : Inst(nullptr), Legal(false), IDL(&IDList) {}
+ : IDL(&IDList) {}
void IRInstructionData::setBranchSuccessors(
DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
@@ -86,6 +104,43 @@ void IRInstructionData::setBranchSuccessors(
}
}
+void IRInstructionData::setCalleeName(bool MatchByName) {
+ CallInst *CI = dyn_cast<CallInst>(Inst);
+ assert(CI && "Instruction must be call");
+
+ CalleeName = "";
+ if (!CI->isIndirectCall() && MatchByName)
+ CalleeName = CI->getCalledFunction()->getName().str();
+}
+
+void IRInstructionData::setPHIPredecessors(
+ DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger) {
+ assert(isa<PHINode>(Inst) && "Instruction must be phi node");
+
+ PHINode *PN = cast<PHINode>(Inst);
+ DenseMap<BasicBlock *, unsigned>::iterator BBNumIt;
+
+ BBNumIt = BasicBlockToInteger.find(PN->getParent());
+ assert(BBNumIt != BasicBlockToInteger.end() &&
+ "Could not find location for BasicBlock!");
+
+ int CurrentBlockNumber = static_cast<int>(BBNumIt->second);
+
+ // Convert the incoming blocks of the PHINode to an integer value, based on
+ // the relative distances between the current block and the incoming block.
+ for (unsigned Idx = 0; Idx < PN->getNumIncomingValues(); Idx++) {
+ BasicBlock *Incoming = PN->getIncomingBlock(Idx);
+ BBNumIt = BasicBlockToInteger.find(Incoming);
+ assert(BBNumIt != BasicBlockToInteger.end() &&
+ "Could not find number for BasicBlock!");
+ int OtherBlockNumber = static_cast<int>(BBNumIt->second);
+
+ int Relative = OtherBlockNumber - CurrentBlockNumber;
+ RelativeBlockLocations.push_back(Relative);
+ RelativeBlockLocations.push_back(Relative);
+ }
+}
+
CmpInst::Predicate IRInstructionData::predicateForConsistency(CmpInst *CI) {
switch (CI->getPredicate()) {
case CmpInst::FCMP_OGT:
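A self-contained restatement of the relative-location encoding used by setPHIPredecessors: each incoming block is recorded as a signed distance from the PHI's own block number, pushed twice per edge exactly as in the loop above (illustrative helper):

#include <vector>

static std::vector<int> relativeLocations(int CurrentBlock,
                                          const std::vector<int> &Incoming) {
  std::vector<int> Out;
  for (int Block : Incoming) {
    int Relative = Block - CurrentBlock;
    Out.push_back(Relative);
    Out.push_back(Relative); // mirrored, matching the loop above
  }
  return Out; // blocks {3, 6} around block 5 -> {-2, -2, 1, 1}
}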
@@ -112,10 +167,13 @@ CmpInst::Predicate IRInstructionData::getPredicate() const {
return cast<CmpInst>(Inst)->getPredicate();
}
-static StringRef getCalledFunctionName(CallInst &CI) {
- assert(CI.getCalledFunction() != nullptr && "Called Function is nullptr?");
+StringRef IRInstructionData::getCalleeName() const {
+ assert(isa<CallInst>(Inst) &&
+ "Can only get a name from a call instruction");
- return CI.getCalledFunction()->getName();
+ assert(CalleeName.hasValue() && "CalleeName has not been set");
+
+ return *CalleeName;
}
bool IRSimilarity::isClose(const IRInstructionData &A,
@@ -170,13 +228,11 @@ bool IRSimilarity::isClose(const IRInstructionData &A,
});
}
- // If the instructions are functions, we make sure that the function name is
- // the same. We already know that the types are since is isSameOperationAs is
- // true.
+ // If the instructions are function calls, we make sure that the function
+ // name is the same. We already know that the types are the same, since
+ // isSameOperationAs is true.
if (isa<CallInst>(A.Inst) && isa<CallInst>(B.Inst)) {
- CallInst *CIA = cast<CallInst>(A.Inst);
- CallInst *CIB = cast<CallInst>(B.Inst);
- if (getCalledFunctionName(*CIA).compare(getCalledFunctionName(*CIB)) != 0)
+ if (A.getCalleeName().str().compare(B.getCalleeName().str()) != 0)
return false;
}
@@ -244,6 +300,12 @@ unsigned IRInstructionMapper::mapToLegalUnsigned(
if (isa<BranchInst>(*It))
ID->setBranchSuccessors(BasicBlockToInteger);
+ if (isa<CallInst>(*It))
+ ID->setCalleeName(EnableMatchCallsByName);
+
+ if (isa<PHINode>(*It))
+ ID->setPHIPredecessors(BasicBlockToInteger);
+
// Add to the instruction list
bool WasInserted;
DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>::iterator
@@ -1075,6 +1137,8 @@ SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
std::vector<IRInstructionData *> InstrList;
std::vector<unsigned> IntegerMapping;
Mapper.InstClassifier.EnableBranches = this->EnableBranches;
+ Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
+ Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;
populateMapper(Modules, InstrList, IntegerMapping);
findCandidates(InstrList, IntegerMapping);
@@ -1085,6 +1149,8 @@ SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(
SimilarityGroupList &IRSimilarityIdentifier::findSimilarity(Module &M) {
resetSimilarityCandidates();
Mapper.InstClassifier.EnableBranches = this->EnableBranches;
+ Mapper.InstClassifier.EnableIndirectCalls = EnableIndirectCalls;
+ Mapper.EnableMatchCallsByName = EnableMatchingCallsByName;
std::vector<IRInstructionData *> InstrList;
std::vector<unsigned> IntegerMapping;
@@ -1105,7 +1171,8 @@ IRSimilarityIdentifierWrapperPass::IRSimilarityIdentifierWrapperPass()
}
bool IRSimilarityIdentifierWrapperPass::doInitialization(Module &M) {
- IRSI.reset(new IRSimilarityIdentifier(!DisableBranches));
+ IRSI.reset(new IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
+ MatchCallsByName));
return false;
}
@@ -1123,7 +1190,8 @@ AnalysisKey IRSimilarityAnalysis::Key;
IRSimilarityIdentifier IRSimilarityAnalysis::run(Module &M,
ModuleAnalysisManager &) {
- auto IRSI = IRSimilarityIdentifier(!DisableBranches);
+ auto IRSI = IRSimilarityIdentifier(!DisableBranches, !DisableIndirectCalls,
+ MatchCallsByName);
IRSI.findSimilarity(M);
return IRSI;
}
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index f5fa6748d053..44b1d94ebdc8 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -161,19 +161,22 @@ static std::pair<Type *, bool> computeRecurrenceType(Instruction *Exit,
/// Collect cast instructions that can be ignored in the vectorizer's cost
/// model, given a reduction exit value and the minimal type in which the
-/// reduction can be represented.
-static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
- Type *RecurrenceType,
- SmallPtrSetImpl<Instruction *> &Casts) {
+/// reduction can be represented. Also search casts to the recurrence type
+/// to find the minimum width used by the recurrence.
+static void collectCastInstrs(Loop *TheLoop, Instruction *Exit,
+ Type *RecurrenceType,
+ SmallPtrSetImpl<Instruction *> &Casts,
+ unsigned &MinWidthCastToRecurTy) {
SmallVector<Instruction *, 8> Worklist;
SmallPtrSet<Instruction *, 8> Visited;
Worklist.push_back(Exit);
+ MinWidthCastToRecurTy = -1U;
while (!Worklist.empty()) {
Instruction *Val = Worklist.pop_back_val();
Visited.insert(Val);
- if (auto *Cast = dyn_cast<CastInst>(Val))
+ if (auto *Cast = dyn_cast<CastInst>(Val)) {
if (Cast->getSrcTy() == RecurrenceType) {
// If the source type of a cast instruction is equal to the recurrence
// type, it will be eliminated, and should be ignored in the vectorizer
@@ -181,7 +184,16 @@ static void collectCastsToIgnore(Loop *TheLoop, Instruction *Exit,
Casts.insert(Cast);
continue;
}
-
+ if (Cast->getDestTy() == RecurrenceType) {
+ // The minimum width used by the recurrence is found by checking for
+ // casts on its operands. The minimum width is used by the vectorizer
+ // when finding the widest type for in-loop reductions without any
+ // loads/stores.
+ MinWidthCastToRecurTy = std::min<unsigned>(
+ MinWidthCastToRecurTy, Cast->getSrcTy()->getScalarSizeInBits());
+ continue;
+ }
+ }
// Add all operands to the work list if they are loop-varying values that
// we haven't yet visited.
for (Value *O : cast<User>(Val)->operands())
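MinWidthCastToRecurTy records the narrowest source type feeding a cast back to the recurrence type, which the vectorizer can later use to pick a tighter element type for in-loop reductions without loads or stores. A C-level example of the pattern being detected (illustrative):

#include <cstdint>

// The sext from i8 to the 32-bit recurrence type gives the recurrence a
// minimum recorded width of 8 bits (MinWidthCastToRecurTy = 8).
int32_t sumOfBytes(const int8_t *X, int N) {
  int32_t Acc = 0;
  for (int I = 0; I < N; ++I)
    Acc += static_cast<int32_t>(X[I]);
  return Acc;
}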
@@ -265,6 +277,7 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
// Data used for determining if the recurrence has been type-promoted.
Type *RecurrenceType = Phi->getType();
SmallPtrSet<Instruction *, 4> CastInsts;
+ unsigned MinWidthCastToRecurrenceType;
Instruction *Start = Phi;
bool IsSigned = false;
@@ -296,6 +309,10 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
// flags from all the reduction operations.
FastMathFlags FMF = FastMathFlags::getFast();
+ // The first instruction in the use-def chain of the Phi node that requires
+ // exact floating point operations.
+ Instruction *ExactFPMathInst = nullptr;
+
// A value in the reduction can be used:
// - By the reduction:
// - Reduction operation:
@@ -339,6 +356,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
if (Cur != Start) {
ReduxDesc =
isRecurrenceInstr(TheLoop, Phi, Cur, Kind, ReduxDesc, FuncFMF);
+ ExactFPMathInst = ExactFPMathInst == nullptr
+ ? ReduxDesc.getExactFPMathInst()
+ : ExactFPMathInst;
if (!ReduxDesc.isRecurrence())
return false;
// FIXME: FMF is allowed on phi, but propagation is not handled correctly.
@@ -467,8 +487,8 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
return false;
- const bool IsOrdered = checkOrderedReduction(
- Kind, ReduxDesc.getExactFPMathInst(), ExitInstruction, Phi);
+ const bool IsOrdered =
+ checkOrderedReduction(Kind, ExactFPMathInst, ExitInstruction, Phi);
if (Start != Phi) {
// If the starting value is not the same as the phi node, we speculatively
@@ -500,21 +520,24 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
computeRecurrenceType(ExitInstruction, DB, AC, DT);
if (ComputedType != RecurrenceType)
return false;
-
- // The recurrence expression will be represented in a narrower type. If
- // there are any cast instructions that will be unnecessary, collect them
- // in CastInsts. Note that the 'and' instruction was already included in
- // this list.
- //
- // TODO: A better way to represent this may be to tag in some way all the
- // instructions that are a part of the reduction. The vectorizer cost
- // model could then apply the recurrence type to these instructions,
- // without needing a white list of instructions to ignore.
- // This may also be useful for the inloop reductions, if it can be
- // kept simple enough.
- collectCastsToIgnore(TheLoop, ExitInstruction, RecurrenceType, CastInsts);
}
+ // Collect cast instructions and the minimum width used by the recurrence.
+ // If the starting value is not the same as the phi node and the computed
+ // recurrence type is equal to the recurrence type, the recurrence expression
+ // will be represented in a narrower or wider type. If there are any cast
+ // instructions that will be unnecessary, collect them in CastsFromRecurTy.
+ // Note that the 'and' instruction was already included in this list.
+ //
+ // TODO: A better way to represent this may be to tag in some way all the
+ // instructions that are a part of the reduction. The vectorizer cost
+ // model could then apply the recurrence type to these instructions,
+ // without needing a white list of instructions to ignore.
+ // This may also be useful for the inloop reductions, if it can be
+ // kept simple enough.
+ collectCastInstrs(TheLoop, ExitInstruction, RecurrenceType, CastInsts,
+ MinWidthCastToRecurrenceType);
+
// We found a reduction var if we have reached the original phi node and we
// only have a single instruction with out-of-loop users.
@@ -522,9 +545,9 @@ bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
// is saved as part of the RecurrenceDescriptor.
// Save the description of this reduction variable.
- RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF,
- ReduxDesc.getExactFPMathInst(), RecurrenceType,
- IsSigned, IsOrdered, CastInsts);
+ RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF, ExactFPMathInst,
+ RecurrenceType, IsSigned, IsOrdered, CastInsts,
+ MinWidthCastToRecurrenceType);
RedDes = RD;
return true;
@@ -1397,8 +1420,9 @@ bool InductionDescriptor::isInductionPHI(
// Always use i8 element type for opaque pointer inductions.
PointerType *PtrTy = cast<PointerType>(PhiTy);
- Type *ElementType = PtrTy->isOpaque() ? Type::getInt8Ty(PtrTy->getContext())
- : PtrTy->getElementType();
+ Type *ElementType = PtrTy->isOpaque()
+ ? Type::getInt8Ty(PtrTy->getContext())
+ : PtrTy->getNonOpaquePointerElementType();
if (!ElementType->isSized())
return false;
diff --git a/llvm/lib/Analysis/IVUsers.cpp b/llvm/lib/Analysis/IVUsers.cpp
index d7b202f83189..0f3929f45506 100644
--- a/llvm/lib/Analysis/IVUsers.cpp
+++ b/llvm/lib/Analysis/IVUsers.cpp
@@ -254,7 +254,7 @@ IVStrideUse &IVUsers::AddUser(Instruction *User, Value *Operand) {
IVUsers::IVUsers(Loop *L, AssumptionCache *AC, LoopInfo *LI, DominatorTree *DT,
ScalarEvolution *SE)
- : L(L), AC(AC), LI(LI), DT(DT), SE(SE), IVUses() {
+ : L(L), AC(AC), LI(LI), DT(DT), SE(SE) {
// Collect ephemeral values so that AddUsersIfInteresting skips them.
EphValues.clear();
CodeMetrics::collectEphemeralValues(L, AC, EphValues);
diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp
index 140c88eb8b0d..f6e3dd354ff8 100644
--- a/llvm/lib/Analysis/InlineAdvisor.cpp
+++ b/llvm/lib/Analysis/InlineAdvisor.cpp
@@ -21,11 +21,15 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "inline"
+#ifdef LLVM_HAVE_TF_AOT_INLINERSIZEMODEL
+#define LLVM_HAVE_TF_AOT
+#endif
// This weirdly named statistic tracks the number of times that, when attempting
// to inline a function A into B, we analyze the callers of B in order to see
@@ -160,18 +164,6 @@ InlineAdvice::InlineAdvice(InlineAdvisor *Advisor, CallBase &CB,
DLoc(CB.getDebugLoc()), Block(CB.getParent()), ORE(ORE),
IsInliningRecommended(IsInliningRecommended) {}
-void InlineAdvisor::markFunctionAsDeleted(Function *F) {
- assert((!DeletedFunctions.count(F)) &&
- "Cannot put cause a function to become dead twice!");
- DeletedFunctions.insert(F);
-}
-
-void InlineAdvisor::freeDeletedFunctions() {
- for (auto *F : DeletedFunctions)
- delete F;
- DeletedFunctions.clear();
-}
-
void InlineAdvice::recordInlineStatsIfNeeded() {
if (Advisor->ImportedFunctionsStats)
Advisor->ImportedFunctionsStats->recordInline(*Caller, *Callee);
@@ -186,7 +178,6 @@ void InlineAdvice::recordInlining() {
void InlineAdvice::recordInliningWithCalleeDeleted() {
markRecorded();
recordInlineStatsIfNeeded();
- Advisor->markFunctionAsDeleted(Callee);
recordInliningWithCalleeDeletedImpl();
}
@@ -523,8 +514,6 @@ InlineAdvisor::~InlineAdvisor() {
ImportedFunctionsStats->dump(InlinerFunctionImportStats ==
InlinerFunctionImportStatsOpts::Verbose);
}
-
- freeDeletedFunctions();
}
std::unique_ptr<InlineAdvice> InlineAdvisor::getMandatoryAdvice(CallBase &CB,
@@ -569,3 +558,13 @@ std::unique_ptr<InlineAdvice> InlineAdvisor::getAdvice(CallBase &CB,
OptimizationRemarkEmitter &InlineAdvisor::getCallerORE(CallBase &CB) {
return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*CB.getCaller());
}
+
+PreservedAnalyses
+InlineAdvisorAnalysisPrinterPass::run(Module &M, ModuleAnalysisManager &MAM) {
+ const auto *IA = MAM.getCachedResult<InlineAdvisorAnalysis>(M);
+ if (!IA)
+ OS << "No Inline Advisor\n";
+ else
+ IA->getAdvisor()->print(OS);
+ return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index ff31e81aad08..d5411d916c77 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -361,10 +361,10 @@ protected:
/// Model the elimination of repeated loads that is expected to happen
/// whenever we simplify away the stores that would otherwise cause them to be
/// loads.
- bool EnableLoadElimination;
+ bool EnableLoadElimination = true;
/// Whether we allow inlining for recursive call.
- bool AllowRecursiveCall;
+ bool AllowRecursiveCall = false;
SmallPtrSet<Value *, 16> LoadAddrSet;
@@ -455,8 +455,7 @@ public:
OptimizationRemarkEmitter *ORE = nullptr)
: TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
- CandidateCall(Call), EnableLoadElimination(true),
- AllowRecursiveCall(false) {}
+ CandidateCall(Call) {}
InlineResult analyze();
@@ -2898,15 +2897,6 @@ Optional<InlineResult> llvm::getAttributeBasedInliningDecision(
if (Call.isNoInline())
return InlineResult::failure("noinline call site attribute");
- // Don't inline functions if one does not have any stack protector attribute
- // but the other does.
- if (Caller->hasStackProtectorFnAttr() && !Callee->hasStackProtectorFnAttr())
- return InlineResult::failure(
- "stack protected caller but callee requested no stack protector");
- if (Callee->hasStackProtectorFnAttr() && !Caller->hasStackProtectorFnAttr())
- return InlineResult::failure(
- "stack protected callee but caller requested no stack protector");
-
return None;
}
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index 4831b22b1d46..b71b39334ace 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -27,6 +27,7 @@
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OverflowInstAnalysis.h"
@@ -70,7 +71,7 @@ static Value *SimplifyOrInst(Value *, Value *, const SimplifyQuery &, unsigned);
static Value *SimplifyXorInst(Value *, Value *, const SimplifyQuery &, unsigned);
static Value *SimplifyCastInst(unsigned, Value *, Type *,
const SimplifyQuery &, unsigned);
-static Value *SimplifyGEPInst(Type *, ArrayRef<Value *>, bool,
+static Value *SimplifyGEPInst(Type *, Value *, ArrayRef<Value *>, bool,
const SimplifyQuery &, unsigned);
static Value *SimplifySelectInst(Value *, Value *, Value *,
const SimplifyQuery &, unsigned);
@@ -620,6 +621,10 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool IsNSW, bool IsNUW,
if (Constant *C = foldOrCommuteConstant(Instruction::Add, Op0, Op1, Q))
return C;
+ // X + poison -> poison
+ if (isa<PoisonValue>(Op1))
+ return Op1;
+
// X + undef -> undef
if (Q.isUndefValue(Op1))
return Op1;
@@ -1074,6 +1079,16 @@ static bool isDivZero(Value *X, Value *Y, const SimplifyQuery &Q,
}
// IsSigned == false.
+
+ // Is the unsigned dividend known to be less than a constant divisor?
+ // TODO: Convert this (and above) to range analysis
+ // ("computeConstantRangeIncludingKnownBits")?
+ const APInt *C;
+ if (match(Y, m_APInt(C)) &&
+ computeKnownBits(X, Q.DL, 0, Q.AC, Q.CxtI, Q.DT).getMaxValue().ult(*C))
+ return true;
+
+ // Try again for any divisor:
// Is the dividend unsigned less than the divisor?
return isICmpTrue(ICmpInst::ICMP_ULT, X, Y, Q, MaxRecurse);
}
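The extra case asks known-bits analysis whether the unsigned dividend's maximum possible value already sits below a constant divisor, before falling back to the generic icmp-based query. The plain-C++ intuition (illustrative):

#include <cstdint>

// (X & 7) is at most 7, so dividing by 16 is always 0; known bits prove this
// without any further reasoning about X.
static uint32_t alwaysZero(uint32_t X) { return (X & 7u) / 16u; }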
@@ -2254,14 +2269,21 @@ static Value *simplifyOrLogic(Value *X, Value *Y) {
match(Y, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
return NotA;
- // ~(A ^ B) | (A & B) --> ~(A & B)
- // ~(A ^ B) | (B & A) --> ~(A & B)
+ // ~(A ^ B) | (A & B) --> ~(A ^ B)
+ // ~(A ^ B) | (B & A) --> ~(A ^ B)
Value *NotAB;
if (match(X, m_CombineAnd(m_NotForbidUndef(m_Xor(m_Value(A), m_Value(B))),
m_Value(NotAB))) &&
match(Y, m_c_And(m_Specific(A), m_Specific(B))))
return NotAB;
+ // ~(A & B) | (A ^ B) --> ~(A & B)
+ // ~(A & B) | (B ^ A) --> ~(A & B)
+ if (match(X, m_CombineAnd(m_NotForbidUndef(m_And(m_Value(A), m_Value(B))),
+ m_Value(NotAB))) &&
+ match(Y, m_c_Xor(m_Specific(A), m_Specific(B))))
+ return NotAB;
+
return nullptr;
}
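The added pattern relies on the bitwise identity ~(A & B) | (A ^ B) == ~(A & B): the only bits cleared in ~(A & B) are those where A and B are both set, and A ^ B is zero there too, so the OR adds nothing. A quick exhaustive check over 8-bit values (illustrative):

#include <cassert>
#include <cstdint>

static void checkOrLogicFold() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      assert((uint8_t)(~(A & B) | (A ^ B)) == (uint8_t)~(A & B));
}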
@@ -2685,7 +2707,9 @@ computePointerICmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
// Fold comparisons for non-escaping pointer even if the allocation call
// cannot be elided. We cannot fold malloc comparison to null. Also, the
- // dynamic allocation call could be either of the operands.
+ // dynamic allocation call could be either of the operands. Note that
+ // the other operand can not be based on the alloc - if it were, then
+ // the cmp itself would be a capture.
Value *MI = nullptr;
if (isAllocLikeFn(LHS, TLI) &&
llvm::isKnownNonZero(RHS, DL, 0, nullptr, CxtI, DT))
@@ -2890,7 +2914,8 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS,
if (RHS_CR.isFullSet())
return ConstantInt::getTrue(ITy);
- ConstantRange LHS_CR = computeConstantRange(LHS, IIQ.UseInstrInfo);
+ ConstantRange LHS_CR =
+ computeConstantRange(LHS, CmpInst::isSigned(Pred), IIQ.UseInstrInfo);
if (!LHS_CR.isFullSet()) {
if (RHS_CR.contains(LHS_CR))
return ConstantInt::getTrue(ITy);
@@ -4057,9 +4082,9 @@ static Value *simplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
NewOps[1], Q, MaxRecurse - 1));
if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
- return PreventSelfSimplify(SimplifyGEPInst(GEP->getSourceElementType(),
- NewOps, GEP->isInBounds(), Q,
- MaxRecurse - 1));
+ return PreventSelfSimplify(SimplifyGEPInst(
+ GEP->getSourceElementType(), NewOps[0], makeArrayRef(NewOps).slice(1),
+ GEP->isInBounds(), Q, MaxRecurse - 1));
if (isa<SelectInst>(I))
return PreventSelfSimplify(
@@ -4417,45 +4442,52 @@ Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
/// Given operands for an GetElementPtrInst, see if we can fold the result.
/// If not, this returns null.
-static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds,
+static Value *SimplifyGEPInst(Type *SrcTy, Value *Ptr,
+ ArrayRef<Value *> Indices, bool InBounds,
const SimplifyQuery &Q, unsigned) {
// The type of the GEP pointer operand.
unsigned AS =
- cast<PointerType>(Ops[0]->getType()->getScalarType())->getAddressSpace();
+ cast<PointerType>(Ptr->getType()->getScalarType())->getAddressSpace();
// getelementptr P -> P.
- if (Ops.size() == 1)
- return Ops[0];
+ if (Indices.empty())
+ return Ptr;
// Compute the (pointer) type returned by the GEP instruction.
- Type *LastType = GetElementPtrInst::getIndexedType(SrcTy, Ops.slice(1));
+ Type *LastType = GetElementPtrInst::getIndexedType(SrcTy, Indices);
Type *GEPTy = PointerType::get(LastType, AS);
- for (Value *Op : Ops) {
- // If one of the operands is a vector, the result type is a vector of
- // pointers. All vector operands must have the same number of elements.
- if (VectorType *VT = dyn_cast<VectorType>(Op->getType())) {
- GEPTy = VectorType::get(GEPTy, VT->getElementCount());
- break;
+ if (VectorType *VT = dyn_cast<VectorType>(Ptr->getType()))
+ GEPTy = VectorType::get(GEPTy, VT->getElementCount());
+ else {
+ for (Value *Op : Indices) {
+ // If one of the operands is a vector, the result type is a vector of
+ // pointers. All vector operands must have the same number of elements.
+ if (VectorType *VT = dyn_cast<VectorType>(Op->getType())) {
+ GEPTy = VectorType::get(GEPTy, VT->getElementCount());
+ break;
+ }
}
}
// getelementptr poison, idx -> poison
// getelementptr baseptr, poison -> poison
- if (any_of(Ops, [](const auto *V) { return isa<PoisonValue>(V); }))
+ if (isa<PoisonValue>(Ptr) ||
+ any_of(Indices, [](const auto *V) { return isa<PoisonValue>(V); }))
return PoisonValue::get(GEPTy);
- if (Q.isUndefValue(Ops[0]))
- return UndefValue::get(GEPTy);
+ if (Q.isUndefValue(Ptr))
+ // If inbounds, we can choose an out-of-bounds pointer as a base pointer.
+ return InBounds ? PoisonValue::get(GEPTy) : UndefValue::get(GEPTy);
bool IsScalableVec =
- isa<ScalableVectorType>(SrcTy) || any_of(Ops, [](const Value *V) {
+ isa<ScalableVectorType>(SrcTy) || any_of(Indices, [](const Value *V) {
return isa<ScalableVectorType>(V->getType());
});
- if (Ops.size() == 2) {
+ if (Indices.size() == 1) {
// getelementptr P, 0 -> P.
- if (match(Ops[1], m_Zero()) && Ops[0]->getType() == GEPTy)
- return Ops[0];
+ if (match(Indices[0], m_Zero()) && Ptr->getType() == GEPTy)
+ return Ptr;
Type *Ty = SrcTy;
if (!IsScalableVec && Ty->isSized()) {
@@ -4463,37 +4495,37 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds,
uint64_t C;
uint64_t TyAllocSize = Q.DL.getTypeAllocSize(Ty);
// getelementptr P, N -> P if P points to a type of zero size.
- if (TyAllocSize == 0 && Ops[0]->getType() == GEPTy)
- return Ops[0];
+ if (TyAllocSize == 0 && Ptr->getType() == GEPTy)
+ return Ptr;
// The following transforms are only safe if the ptrtoint cast
// doesn't truncate the pointers.
- if (Ops[1]->getType()->getScalarSizeInBits() ==
+ if (Indices[0]->getType()->getScalarSizeInBits() ==
Q.DL.getPointerSizeInBits(AS)) {
- auto CanSimplify = [GEPTy, &P, V = Ops[0]]() -> bool {
+ auto CanSimplify = [GEPTy, &P, Ptr]() -> bool {
return P->getType() == GEPTy &&
- getUnderlyingObject(P) == getUnderlyingObject(V);
+ getUnderlyingObject(P) == getUnderlyingObject(Ptr);
};
// getelementptr V, (sub P, V) -> P if P points to a type of size 1.
if (TyAllocSize == 1 &&
- match(Ops[1], m_Sub(m_PtrToInt(m_Value(P)),
- m_PtrToInt(m_Specific(Ops[0])))) &&
+ match(Indices[0],
+ m_Sub(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Specific(Ptr)))) &&
CanSimplify())
return P;
// getelementptr V, (ashr (sub P, V), C) -> P if P points to a type of
// size 1 << C.
- if (match(Ops[1], m_AShr(m_Sub(m_PtrToInt(m_Value(P)),
- m_PtrToInt(m_Specific(Ops[0]))),
- m_ConstantInt(C))) &&
+ if (match(Indices[0], m_AShr(m_Sub(m_PtrToInt(m_Value(P)),
+ m_PtrToInt(m_Specific(Ptr))),
+ m_ConstantInt(C))) &&
TyAllocSize == 1ULL << C && CanSimplify())
return P;
// getelementptr V, (sdiv (sub P, V), C) -> P if P points to a type of
// size C.
- if (match(Ops[1], m_SDiv(m_Sub(m_PtrToInt(m_Value(P)),
- m_PtrToInt(m_Specific(Ops[0]))),
- m_SpecificInt(TyAllocSize))) &&
+ if (match(Indices[0], m_SDiv(m_Sub(m_PtrToInt(m_Value(P)),
+ m_PtrToInt(m_Specific(Ptr))),
+ m_SpecificInt(TyAllocSize))) &&
CanSimplify())
return P;
}
@@ -4501,29 +4533,28 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds,
}
if (!IsScalableVec && Q.DL.getTypeAllocSize(LastType) == 1 &&
- all_of(Ops.slice(1).drop_back(1),
+ all_of(Indices.drop_back(1),
[](Value *Idx) { return match(Idx, m_Zero()); })) {
unsigned IdxWidth =
- Q.DL.getIndexSizeInBits(Ops[0]->getType()->getPointerAddressSpace());
- if (Q.DL.getTypeSizeInBits(Ops.back()->getType()) == IdxWidth) {
+ Q.DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace());
+ if (Q.DL.getTypeSizeInBits(Indices.back()->getType()) == IdxWidth) {
APInt BasePtrOffset(IdxWidth, 0);
Value *StrippedBasePtr =
- Ops[0]->stripAndAccumulateInBoundsConstantOffsets(Q.DL,
- BasePtrOffset);
+ Ptr->stripAndAccumulateInBoundsConstantOffsets(Q.DL, BasePtrOffset);
// Avoid creating inttoptr of zero here: while LLVM's treatment of
// inttoptr is generally conservative, this particular case is folded to
// a null pointer, which will have incorrect provenance.
// gep (gep V, C), (sub 0, V) -> C
- if (match(Ops.back(),
+ if (match(Indices.back(),
m_Sub(m_Zero(), m_PtrToInt(m_Specific(StrippedBasePtr)))) &&
!BasePtrOffset.isZero()) {
auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset);
return ConstantExpr::getIntToPtr(CI, GEPTy);
}
// gep (gep V, C), (xor V, -1) -> C-1
- if (match(Ops.back(),
+ if (match(Indices.back(),
m_Xor(m_PtrToInt(m_Specific(StrippedBasePtr)), m_AllOnes())) &&
!BasePtrOffset.isOne()) {
auto *CI = ConstantInt::get(GEPTy->getContext(), BasePtrOffset - 1);
@@ -4533,17 +4564,18 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds,
}
// Check to see if this is constant foldable.
- if (!all_of(Ops, [](Value *V) { return isa<Constant>(V); }))
+ if (!isa<Constant>(Ptr) ||
+ !all_of(Indices, [](Value *V) { return isa<Constant>(V); }))
return nullptr;
- auto *CE = ConstantExpr::getGetElementPtr(SrcTy, cast<Constant>(Ops[0]),
- Ops.slice(1), InBounds);
+ auto *CE = ConstantExpr::getGetElementPtr(SrcTy, cast<Constant>(Ptr), Indices,
+ InBounds);
return ConstantFoldConstant(CE, Q.DL);
}
-Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops, bool InBounds,
- const SimplifyQuery &Q) {
- return ::SimplifyGEPInst(SrcTy, Ops, InBounds, Q, RecursionLimit);
+Value *llvm::SimplifyGEPInst(Type *SrcTy, Value *Ptr, ArrayRef<Value *> Indices,
+ bool InBounds, const SimplifyQuery &Q) {
+ return ::SimplifyGEPInst(SrcTy, Ptr, Indices, InBounds, Q, RecursionLimit);
}
/// Given operands for an InsertValueInst, see if we can fold the result.
@@ -5603,26 +5635,6 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0,
return nullptr;
}
-static APInt getMaxMinLimit(Intrinsic::ID IID, unsigned BitWidth) {
- switch (IID) {
- case Intrinsic::smax: return APInt::getSignedMaxValue(BitWidth);
- case Intrinsic::smin: return APInt::getSignedMinValue(BitWidth);
- case Intrinsic::umax: return APInt::getMaxValue(BitWidth);
- case Intrinsic::umin: return APInt::getMinValue(BitWidth);
- default: llvm_unreachable("Unexpected intrinsic");
- }
-}
-
-static ICmpInst::Predicate getMaxMinPredicate(Intrinsic::ID IID) {
- switch (IID) {
- case Intrinsic::smax: return ICmpInst::ICMP_SGE;
- case Intrinsic::smin: return ICmpInst::ICMP_SLE;
- case Intrinsic::umax: return ICmpInst::ICMP_UGE;
- case Intrinsic::umin: return ICmpInst::ICMP_ULE;
- default: llvm_unreachable("Unexpected intrinsic");
- }
-}
-
/// Given a min/max intrinsic, see if it can be removed based on having an
/// operand that is another min/max intrinsic with shared operand(s). The caller
/// is expected to swap the operand arguments to handle commutation.
@@ -5690,19 +5702,21 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
// Assume undef is the limit value.
if (Q.isUndefValue(Op1))
- return ConstantInt::get(ReturnType, getMaxMinLimit(IID, BitWidth));
+ return ConstantInt::get(
+ ReturnType, MinMaxIntrinsic::getSaturationPoint(IID, BitWidth));
const APInt *C;
if (match(Op1, m_APIntAllowUndef(C))) {
// Clamp to limit value. For example:
// umax(i8 %x, i8 255) --> 255
- if (*C == getMaxMinLimit(IID, BitWidth))
+ if (*C == MinMaxIntrinsic::getSaturationPoint(IID, BitWidth))
return ConstantInt::get(ReturnType, *C);
// If the constant op is the opposite of the limit value, the other must
// be larger/smaller or equal. For example:
// umin(i8 %x, i8 255) --> %x
- if (*C == getMaxMinLimit(getInverseMinMaxIntrinsic(IID), BitWidth))
+ if (*C == MinMaxIntrinsic::getSaturationPoint(
+ getInverseMinMaxIntrinsic(IID), BitWidth))
return Op0;
// Remove nested call if constant operands allow it. Example:
@@ -5713,10 +5727,9 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
Value *M00 = MinMax0->getOperand(0), *M01 = MinMax0->getOperand(1);
const APInt *InnerC;
if ((match(M00, m_APInt(InnerC)) || match(M01, m_APInt(InnerC))) &&
- ((IID == Intrinsic::smax && InnerC->sge(*C)) ||
- (IID == Intrinsic::smin && InnerC->sle(*C)) ||
- (IID == Intrinsic::umax && InnerC->uge(*C)) ||
- (IID == Intrinsic::umin && InnerC->ule(*C))))
+ ICmpInst::compare(*InnerC, *C,
+ ICmpInst::getNonStrictPredicate(
+ MinMaxIntrinsic::getPredicate(IID))))
return Op0;
}
}
@@ -5726,7 +5739,8 @@ static Value *simplifyBinaryIntrinsic(Function *F, Value *Op0, Value *Op1,
if (Value *V = foldMinMaxSharedOp(IID, Op1, Op0))
return V;
- ICmpInst::Predicate Pred = getMaxMinPredicate(IID);
+ ICmpInst::Predicate Pred =
+ ICmpInst::getNonStrictPredicate(MinMaxIntrinsic::getPredicate(IID));
if (isICmpTrue(Pred, Op0, Op1, Q.getWithoutUndef(), RecursionLimit))
return Op0;
if (isICmpTrue(Pred, Op1, Op0, Q.getWithoutUndef(), RecursionLimit))
@@ -6277,8 +6291,9 @@ static Value *simplifyInstructionWithOperands(Instruction *I,
break;
case Instruction::GetElementPtr: {
auto *GEPI = cast<GetElementPtrInst>(I);
- Result = SimplifyGEPInst(GEPI->getSourceElementType(), NewOps,
- GEPI->isInBounds(), Q);
+ Result =
+ SimplifyGEPInst(GEPI->getSourceElementType(), NewOps[0],
+ makeArrayRef(NewOps).slice(1), GEPI->isInBounds(), Q);
break;
}
case Instruction::InsertValue: {
@@ -6460,3 +6475,5 @@ const SimplifyQuery getBestSimplifyQuery(AnalysisManager<T, TArgs...> &AM,
template const SimplifyQuery getBestSimplifyQuery(AnalysisManager<Function> &,
Function &);
}
+
+void InstSimplifyFolder::anchor() {}
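The hunks above migrate InstructionSimplify's GEP folding from a single operand list to an explicit pointer operand plus index list. A minimal caller-side sketch of the new llvm::SimplifyGEPInst overload, mirroring the call sites updated in this patch (the helper name is illustrative):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Analysis/InstructionSimplify.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Split the GEP's operands: operand 0 is the pointer, the rest are indices.
  static Value *trySimplifyGEP(GetElementPtrInst *GEP, const SimplifyQuery &Q) {
    SmallVector<Value *, 8> Ops(GEP->op_begin(), GEP->op_end());
    return SimplifyGEPInst(GEP->getSourceElementType(), Ops[0],
                           makeArrayRef(Ops).slice(1), GEP->isInBounds(), Q);
  }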
diff --git a/llvm/lib/Analysis/LazyCallGraph.cpp b/llvm/lib/Analysis/LazyCallGraph.cpp
index 0007c54b16d0..e8e9593d7030 100644
--- a/llvm/lib/Analysis/LazyCallGraph.cpp
+++ b/llvm/lib/Analysis/LazyCallGraph.cpp
@@ -1503,7 +1503,7 @@ void LazyCallGraph::removeEdge(Node &SourceN, Node &TargetN) {
void LazyCallGraph::removeDeadFunction(Function &F) {
// FIXME: This is unnecessarily restrictive. We should be able to remove
// functions which recursively call themselves.
- assert(F.use_empty() &&
+ assert(F.hasZeroLiveUses() &&
"This routine should only be called on trivially dead functions!");
// We shouldn't remove library functions as they are never really dead while
@@ -1522,13 +1522,6 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
// Remove this from the entry edges if present.
EntryEdges.removeEdgeInternal(N);
- if (SCCMap.empty()) {
- // No SCCs have been formed, so removing this is fine and there is nothing
- // else necessary at this point but clearing out the node.
- N.clear();
- return;
- }
-
// Cannot remove a function which has yet to be visited in the DFS walk, so
// if we have a node at all then we must have an SCC and RefSCC.
auto CI = SCCMap.find(&N);
@@ -1544,15 +1537,9 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
assert(C.size() == 1 && "Dead functions must be in a singular SCC");
assert(RC.size() == 1 && "Dead functions must be in a singular RefSCC");
- auto RCIndexI = RefSCCIndices.find(&RC);
- int RCIndex = RCIndexI->second;
- PostOrderRefSCCs.erase(PostOrderRefSCCs.begin() + RCIndex);
- RefSCCIndices.erase(RCIndexI);
- for (int i = RCIndex, Size = PostOrderRefSCCs.size(); i < Size; ++i)
- RefSCCIndices[PostOrderRefSCCs[i]] = i;
-
// Finally clear out all the data structures from the node down through the
- // components.
+ // components. postorder_ref_scc_iterator will skip empty RefSCCs, so no need
+ // to adjust LazyCallGraph data structures.
N.clear();
N.G = nullptr;
N.F = nullptr;
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index 5b5d48bf6fe5..e311b40ab25c 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -395,7 +395,8 @@ class LazyValueInfoImpl {
/// if it exists in the module.
Function *GuardDecl;
- Optional<ValueLatticeElement> getBlockValue(Value *Val, BasicBlock *BB);
+ Optional<ValueLatticeElement> getBlockValue(Value *Val, BasicBlock *BB,
+ Instruction *CxtI);
Optional<ValueLatticeElement> getEdgeValue(Value *V, BasicBlock *F,
BasicBlock *T, Instruction *CxtI = nullptr);
@@ -533,15 +534,17 @@ void LazyValueInfoImpl::solve() {
}
}
-Optional<ValueLatticeElement> LazyValueInfoImpl::getBlockValue(Value *Val,
- BasicBlock *BB) {
+Optional<ValueLatticeElement> LazyValueInfoImpl::getBlockValue(
+ Value *Val, BasicBlock *BB, Instruction *CxtI) {
// If already a constant, there is nothing to compute.
if (Constant *VC = dyn_cast<Constant>(Val))
return ValueLatticeElement::get(VC);
if (Optional<ValueLatticeElement> OptLatticeVal =
- TheCache.getCachedValueInfo(Val, BB))
+ TheCache.getCachedValueInfo(Val, BB)) {
+ intersectAssumeOrGuardBlockValueConstantRange(Val, *OptLatticeVal, CxtI);
return OptLatticeVal;
+ }
// We have hit a cycle, assume overdefined.
if (!pushBlockValue({ BB, Val }))
@@ -792,31 +795,41 @@ void LazyValueInfoImpl::intersectAssumeOrGuardBlockValueConstantRange(
}
}
+static ConstantRange getConstantRangeOrFull(const ValueLatticeElement &Val,
+ Type *Ty, const DataLayout &DL) {
+ if (Val.isConstantRange())
+ return Val.getConstantRange();
+ return ConstantRange::getFull(DL.getTypeSizeInBits(Ty));
+}
+
Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueSelect(
SelectInst *SI, BasicBlock *BB) {
// Recurse on our inputs if needed
Optional<ValueLatticeElement> OptTrueVal =
- getBlockValue(SI->getTrueValue(), BB);
+ getBlockValue(SI->getTrueValue(), BB, SI);
if (!OptTrueVal)
return None;
ValueLatticeElement &TrueVal = *OptTrueVal;
Optional<ValueLatticeElement> OptFalseVal =
- getBlockValue(SI->getFalseValue(), BB);
+ getBlockValue(SI->getFalseValue(), BB, SI);
if (!OptFalseVal)
return None;
ValueLatticeElement &FalseVal = *OptFalseVal;
- if (TrueVal.isConstantRange() && FalseVal.isConstantRange()) {
- const ConstantRange &TrueCR = TrueVal.getConstantRange();
- const ConstantRange &FalseCR = FalseVal.getConstantRange();
+ if (TrueVal.isConstantRange() || FalseVal.isConstantRange()) {
+ const ConstantRange &TrueCR =
+ getConstantRangeOrFull(TrueVal, SI->getType(), DL);
+ const ConstantRange &FalseCR =
+ getConstantRangeOrFull(FalseVal, SI->getType(), DL);
Value *LHS = nullptr;
Value *RHS = nullptr;
SelectPatternResult SPR = matchSelectPattern(SI, LHS, RHS);
// Is this a min specifically of our two inputs? (Avoid the risk of
// ValueTracking getting smarter looking back past our immediate inputs.)
if (SelectPatternResult::isMinOrMax(SPR.Flavor) &&
- LHS == SI->getTrueValue() && RHS == SI->getFalseValue()) {
+ ((LHS == SI->getTrueValue() && RHS == SI->getFalseValue()) ||
+ (RHS == SI->getTrueValue() && LHS == SI->getFalseValue()))) {
ConstantRange ResultCR = [&]() {
switch (SPR.Flavor) {
default:
@@ -873,17 +886,10 @@ Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueSelect(
Optional<ConstantRange> LazyValueInfoImpl::getRangeFor(Value *V,
Instruction *CxtI,
BasicBlock *BB) {
- Optional<ValueLatticeElement> OptVal = getBlockValue(V, BB);
+ Optional<ValueLatticeElement> OptVal = getBlockValue(V, BB, CxtI);
if (!OptVal)
return None;
-
- ValueLatticeElement &Val = *OptVal;
- intersectAssumeOrGuardBlockValueConstantRange(V, Val, CxtI);
- if (Val.isConstantRange())
- return Val.getConstantRange();
-
- const unsigned OperandBitWidth = DL.getTypeSizeInBits(V->getType());
- return ConstantRange::getFull(OperandBitWidth);
+ return getConstantRangeOrFull(*OptVal, V->getType(), DL);
}
Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueCast(
@@ -1017,7 +1023,7 @@ Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueExtractValue(
if (Value *V = SimplifyExtractValueInst(
EVI->getAggregateOperand(), EVI->getIndices(),
EVI->getModule()->getDataLayout()))
- return getBlockValue(V, BB);
+ return getBlockValue(V, BB, EVI);
LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName()
<< "' - overdefined (unknown extractvalue).\n");
@@ -1126,14 +1132,16 @@ static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI,
}
// If (X urem Modulus) >= C, then X >= C.
+ // If trunc X >= C, then X >= C.
// TODO: An upper bound could be computed as well.
- if (match(LHS, m_URem(m_Specific(Val), m_Value())) &&
+ if (match(LHS, m_CombineOr(m_URem(m_Specific(Val), m_Value()),
+ m_Trunc(m_Specific(Val)))) &&
match(RHS, m_APInt(C))) {
// Use the icmp region so we don't have to deal with different predicates.
ConstantRange CR = ConstantRange::makeExactICmpRegion(EdgePred, *C);
if (!CR.isEmptySet())
return ValueLatticeElement::getRange(ConstantRange::getNonEmpty(
- CR.getUnsignedMin(), APInt(BitWidth, 0)));
+ CR.getUnsignedMin().zextOrSelf(BitWidth), APInt(BitWidth, 0)));
}
return ValueLatticeElement::getOverdefined();
@@ -1430,14 +1438,12 @@ Optional<ValueLatticeElement> LazyValueInfoImpl::getEdgeValue(
// Can't get any more precise here
return LocalResult;
- Optional<ValueLatticeElement> OptInBlock = getBlockValue(Val, BBFrom);
+ Optional<ValueLatticeElement> OptInBlock =
+ getBlockValue(Val, BBFrom, BBFrom->getTerminator());
if (!OptInBlock)
return None;
ValueLatticeElement &InBlock = *OptInBlock;
- // Try to intersect ranges of the BB and the constraint on the edge.
- intersectAssumeOrGuardBlockValueConstantRange(Val, InBlock,
- BBFrom->getTerminator());
// We can use the context instruction (generically the ultimate instruction
// the calling pass is trying to simplify) here, even though the result of
// this function is generally cached when called from the solve* functions
@@ -1457,15 +1463,14 @@ ValueLatticeElement LazyValueInfoImpl::getValueInBlock(Value *V, BasicBlock *BB,
<< BB->getName() << "'\n");
assert(BlockValueStack.empty() && BlockValueSet.empty());
- Optional<ValueLatticeElement> OptResult = getBlockValue(V, BB);
+ Optional<ValueLatticeElement> OptResult = getBlockValue(V, BB, CxtI);
if (!OptResult) {
solve();
- OptResult = getBlockValue(V, BB);
+ OptResult = getBlockValue(V, BB, CxtI);
assert(OptResult && "Value not available after solving");
}
- ValueLatticeElement Result = *OptResult;
- intersectAssumeOrGuardBlockValueConstantRange(V, Result, CxtI);
+ ValueLatticeElement Result = *OptResult;
LLVM_DEBUG(dbgs() << " Result = " << Result << "\n");
return Result;
}
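The `trunc X >= C` case above only yields a lower bound for X, obtained by zero-extending the bound from the truncated width back to X's width. A standalone sketch of that derivation with the same ConstantRange calls; returning the full range on an empty region is a conservative stand-in for the overdefined result used in the pass:

  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/ConstantRange.h"
  #include "llvm/IR/InstrTypes.h"
  using namespace llvm;

  // Given `trunc X pred C` known on an edge, compute a range for X itself.
  static ConstantRange rangeForTruncatedCmp(CmpInst::Predicate Pred,
                                            const APInt &C,
                                            unsigned XBitWidth) {
    // Region of truncated values satisfying the comparison.
    ConstantRange CR = ConstantRange::makeExactICmpRegion(Pred, C);
    if (CR.isEmptySet())
      return ConstantRange::getFull(XBitWidth); // conservative fallback
    // Keep only the lower bound; the wrapping range [min, 0) encodes X >= min.
    return ConstantRange::getNonEmpty(
        CR.getUnsignedMin().zextOrSelf(XBitWidth), APInt(XBitWidth, 0));
  }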
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 6444518dc70c..2ab78d2b7ee2 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -519,8 +519,7 @@ public:
AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
MemoryDepChecker::DepCandidates &DA,
PredicatedScalarEvolution &PSE)
- : TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA),
- IsRTCheckAnalysisNeeded(false), PSE(PSE) {}
+ : TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA), PSE(PSE) {}
/// Register a load and whether it is only read from.
void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
@@ -620,7 +619,7 @@ private:
/// memcheck analysis without dependency checking
/// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
/// cleared while this remains set if we have potentially dependent accesses.
- bool IsRTCheckAnalysisNeeded;
+ bool IsRTCheckAnalysisNeeded = false;
/// The SCEV predicate containing all the SCEV-related assumptions.
PredicatedScalarEvolution &PSE;
@@ -1055,7 +1054,6 @@ int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy,
bool ShouldCheckWrap) {
Type *Ty = Ptr->getType();
assert(Ty->isPointerTy() && "Unexpected non-ptr");
- assert(!AccessTy->isAggregateType() && "Bad stride - Not a pointer to a scalar type");
if (isa<ScalableVectorType>(AccessTy)) {
LLVM_DEBUG(dbgs() << "LAA: Bad stride - Scalable object: " << *AccessTy
@@ -2245,10 +2243,7 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
DominatorTree *DT, LoopInfo *LI)
: PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
PtrRtChecking(std::make_unique<RuntimePointerChecking>(SE)),
- DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
- NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
- HasConvergentOp(false),
- HasDependenceInvolvingLoopInvariantAddress(false) {
+ DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L) {
if (canAnalyzeLoop())
analyzeLoop(AA, LI, TLI, DT);
}
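The constructor cleanups in this file (and the similar ones in LoopInfo, LoopPass, MemorySSA, and ReplayInlineAdvisor below) all follow the same pattern: move flag defaults to in-class member initializers so the constructor init list only names members that actually need arguments. A small generic C++ sketch of the pattern:

  // In-class default member initialization: Flag no longer appears in the
  // constructor's init list, matching the cleanups in this patch.
  struct Example {
    bool Flag = false; // default lives at the declaration
    int &Ref;
    explicit Example(int &R) : Ref(R) {}
  };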
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
index 7b895d8a5dc2..ba014bd08c98 100644
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -477,9 +477,8 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const CacheCost &CC) {
CacheCost::CacheCost(const LoopVectorTy &Loops, const LoopInfo &LI,
ScalarEvolution &SE, TargetTransformInfo &TTI,
- AAResults &AA, DependenceInfo &DI,
- Optional<unsigned> TRT)
- : Loops(Loops), TripCounts(), LoopCosts(),
+ AAResults &AA, DependenceInfo &DI, Optional<unsigned> TRT)
+ : Loops(Loops),
TRT((TRT == None) ? Optional<unsigned>(TemporalReuseThreshold) : TRT),
LI(LI), SE(SE), TTI(TTI), AA(AA), DI(DI) {
assert(!Loops.empty() && "Expecting a non-empty loop vector.");
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index b35fb2a190f6..dd6958716127 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -695,11 +695,10 @@ class UnloopUpdater {
// Flag the presence of an irreducible backedge whose destination is a block
// directly contained by the original unloop.
- bool FoundIB;
+ bool FoundIB = false;
public:
- UnloopUpdater(Loop *UL, LoopInfo *LInfo)
- : Unloop(*UL), LI(LInfo), DFS(UL), FoundIB(false) {}
+ UnloopUpdater(Loop *UL, LoopInfo *LInfo) : Unloop(*UL), LI(LInfo), DFS(UL) {}
void updateBlockParents();
diff --git a/llvm/lib/Analysis/LoopPass.cpp b/llvm/lib/Analysis/LoopPass.cpp
index 9e470e998e67..b720bab454e9 100644
--- a/llvm/lib/Analysis/LoopPass.cpp
+++ b/llvm/lib/Analysis/LoopPass.cpp
@@ -69,8 +69,7 @@ char PrintLoopPassWrapper::ID = 0;
char LPPassManager::ID = 0;
-LPPassManager::LPPassManager()
- : FunctionPass(ID), PMDataManager() {
+LPPassManager::LPPassManager() : FunctionPass(ID) {
LI = nullptr;
CurrentLoop = nullptr;
}
diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp
index f5a65cd2b689..0480c1cd2842 100644
--- a/llvm/lib/Analysis/MLInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp
@@ -11,35 +11,34 @@
// 'release' mode) or a runtime-loaded model (the 'development' case).
//
//===----------------------------------------------------------------------===//
-#include "llvm/Config/config.h"
-#if defined(LLVM_HAVE_TF_AOT) || defined(LLVM_HAVE_TF_API)
-
-#include <limits>
-#include <unordered_map>
-#include <unordered_set>
-
+#include "llvm/Analysis/MLInlineAdvisor.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/FunctionPropertiesAnalysis.h"
#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/MLInlineAdvisor.h"
+#include "llvm/Analysis/InlineModelFeatureMaps.h"
+#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/MLModelRunner.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ReleaseModeModelRunner.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Config/config.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Path.h"
+#include <limits>
+#include <unordered_map>
+#include <unordered_set>
+
using namespace llvm;
-#ifdef LLVM_HAVE_TF_AOT
-#include "llvm/Analysis/ReleaseModeModelRunner.h"
+#if defined(LLVM_HAVE_TF_AOT_INLINERSIZEMODEL)
// codegen-ed file
#include "InlinerSizeModel.h" // NOLINT
-#include "llvm/Analysis/InlineModelFeatureMaps.h"
std::unique_ptr<InlineAdvisor>
llvm::getReleaseModeAdvisor(Module &M, ModuleAnalysisManager &MAM) {
@@ -90,7 +89,8 @@ MLInlineAdvisor::MLInlineAdvisor(Module &M, ModuleAnalysisManager &MAM,
std::unique_ptr<MLModelRunner> Runner)
: InlineAdvisor(
M, MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
- ModelRunner(std::move(Runner)), CG(new CallGraph(M)),
+ ModelRunner(std::move(Runner)),
+ CG(MAM.getResult<LazyCallGraphAnalysis>(M)),
InitialIRSize(getModuleIRSize()), CurrentIRSize(InitialIRSize) {
assert(ModelRunner);
@@ -100,7 +100,8 @@ MLInlineAdvisor::MLInlineAdvisor(Module &M, ModuleAnalysisManager &MAM,
// critical in behavioral cloning - i.e. training a model to mimic the manual
// heuristic's decisions - and, thus, equally important for training for
// improvement.
- for (auto I = scc_begin(CG.get()); !I.isAtEnd(); ++I) {
+ CallGraph CGraph(M);
+ for (auto I = scc_begin(&CGraph); !I.isAtEnd(); ++I) {
const std::vector<CallGraphNode *> &CGNodes = *I;
unsigned Level = 0;
for (auto *CGNode : CGNodes) {
@@ -110,7 +111,7 @@ MLInlineAdvisor::MLInlineAdvisor(Module &M, ModuleAnalysisManager &MAM,
for (auto &I : instructions(F)) {
if (auto *CS = getInlinableCS(I)) {
auto *Called = CS->getCalledFunction();
- auto Pos = FunctionLevels.find(Called);
+ auto Pos = FunctionLevels.find(&CG.get(*Called));
// In bottom up traversal, an inlinable callee is either in the
// same SCC, or to a function in a visited SCC. So not finding its
// level means we haven't visited it yet, meaning it's in this SCC.
@@ -123,24 +124,73 @@ MLInlineAdvisor::MLInlineAdvisor(Module &M, ModuleAnalysisManager &MAM,
for (auto *CGNode : CGNodes) {
Function *F = CGNode->getFunction();
if (F && !F->isDeclaration())
- FunctionLevels[F] = Level;
+ FunctionLevels[&CG.get(*F)] = Level;
}
}
+ for (auto KVP : FunctionLevels) {
+ AllNodes.insert(KVP.first);
+ EdgeCount += getLocalCalls(KVP.first->getFunction());
+ }
+ NodeCount = AllNodes.size();
+}
+
+unsigned MLInlineAdvisor::getInitialFunctionLevel(const Function &F) const {
+ return CG.lookup(F) ? FunctionLevels.at(CG.lookup(F)) : 0;
}
void MLInlineAdvisor::onPassEntry() {
// Function passes executed between InlinerPass runs may have changed the
// module-wide features.
- if (!Invalid)
- return;
- NodeCount = 0;
- EdgeCount = 0;
- for (auto &F : M)
- if (!F.isDeclaration()) {
- ++NodeCount;
- EdgeCount += getLocalCalls(F);
+ // The cgscc pass manager rules are such that:
+ // - if a pass leads to merging SCCs, then the pipeline is restarted on the
+ // merged SCC
+ // - if a pass leads to splitting the SCC, then we continue with one of the
+ // splits
+ // This means that the NodesInLastSCC is a superset (not strict) of the nodes
+ // that subsequent passes would have processed
+ // - in addition, if new Nodes were created by a pass (e.g. CoroSplit),
+ // they'd be adjacent to Nodes in the last SCC. So we just need to check the
+ // boundary of Nodes in NodesInLastSCC for Nodes we haven't seen. We don't
+ // care about the nature of the Edge (call or ref).
+ NodeCount -= static_cast<int64_t>(NodesInLastSCC.size());
+ while (!NodesInLastSCC.empty()) {
+ const auto *N = NodesInLastSCC.front();
+ NodesInLastSCC.pop_front();
+ // The Function wrapped by N could have been deleted since we last saw it.
+ if (N->isDead()) {
+ assert(!N->getFunction().isDeclaration());
+ continue;
+ }
+ ++NodeCount;
+ EdgeCount += getLocalCalls(N->getFunction());
+ for (const auto &E : *(*N)) {
+ const auto *AdjNode = &E.getNode();
+ assert(!AdjNode->isDead() && !AdjNode->getFunction().isDeclaration());
+ auto I = AllNodes.insert(AdjNode);
+ if (I.second)
+ NodesInLastSCC.push_back(AdjNode);
}
- Invalid = false;
+ }
+
+ EdgeCount -= EdgesOfLastSeenNodes;
+ EdgesOfLastSeenNodes = 0;
+}
+
+void MLInlineAdvisor::onPassExit(LazyCallGraph::SCC *LastSCC) {
+ if (!LastSCC)
+ return;
+ // Keep track of the nodes and edges we last saw. Then, in onPassEntry,
+ // we update the node count and edge count from the subset of these nodes that
+ // survived.
+ assert(NodesInLastSCC.empty());
+ assert(NodeCount >= LastSCC->size());
+ EdgesOfLastSeenNodes = 0;
+ for (const auto &N : *LastSCC) {
+ assert(!N.isDead());
+ EdgesOfLastSeenNodes += getLocalCalls(N.getFunction());
+ NodesInLastSCC.push_back(&N);
+ }
+ assert(EdgeCount >= EdgesOfLastSeenNodes);
}
int64_t MLInlineAdvisor::getLocalCalls(Function &F) {
@@ -192,7 +242,7 @@ void MLInlineAdvisor::onSuccessfulInlining(const MLInlineAdvice &Advice,
int64_t MLInlineAdvisor::getModuleIRSize() const {
int64_t Ret = 0;
- for (auto &F : CG->getModule())
+ for (auto &F : M)
if (!F.isDeclaration())
Ret += getIRSize(F);
return Ret;
@@ -263,7 +313,7 @@ std::unique_ptr<InlineAdvice> MLInlineAdvisor::getAdviceImpl(CallBase &CB) {
*ModelRunner->getTensor<int64_t>(FeatureIndex::CalleeBasicBlockCount) =
CalleeBefore.BasicBlockCount;
*ModelRunner->getTensor<int64_t>(FeatureIndex::CallSiteHeight) =
- FunctionLevels[&Caller];
+ getInitialFunctionLevel(Caller);
*ModelRunner->getTensor<int64_t>(FeatureIndex::NodeCount) = NodeCount;
*ModelRunner->getTensor<int64_t>(FeatureIndex::NrCtantParams) = NrCtantParams;
*ModelRunner->getTensor<int64_t>(FeatureIndex::EdgeCount) = EdgeCount;
@@ -361,4 +411,3 @@ void MLInlineAdvice::recordUnattemptedInliningImpl() {
return R;
});
}
-#endif // defined(LLVM_HAVE_TF_AOT) || defined(LLVM_HAVE_TF_API)
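FunctionLevels is now keyed by LazyCallGraph nodes rather than Function pointers, so callers resolve a Function to its node before reading the level and default to 0 for functions not in the graph. A hedged sketch of that lookup; the map type here is an assumption meant to mirror the advisor's member:

  #include "llvm/ADT/DenseMap.h"
  #include "llvm/Analysis/LazyCallGraph.h"
  #include "llvm/IR/Function.h"
  using namespace llvm;

  // Resolve F to its call graph node, then read its bottom-up SCC level.
  static unsigned lookupLevel(
      LazyCallGraph &CG,
      const DenseMap<const LazyCallGraph::Node *, unsigned> &FunctionLevels,
      const Function &F) {
    if (const LazyCallGraph::Node *N = CG.lookup(F))
      return FunctionLevels.lookup(N);
    return 0; // not in the graph: treat as level 0, as getInitialFunctionLevel does
  }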
diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp
index ffdd7a2cfd4b..208f93aa1ac6 100644
--- a/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -51,12 +51,13 @@ using namespace llvm;
enum AllocType : uint8_t {
OpNewLike = 1<<0, // allocates; never returns null
- MallocLike = 1<<1 | OpNewLike, // allocates; may return null
+ MallocLike = 1<<1, // allocates; may return null
AlignedAllocLike = 1<<2, // allocates with alignment; may return null
CallocLike = 1<<3, // allocates + bzero
ReallocLike = 1<<4, // reallocates
StrDupLike = 1<<5,
- MallocOrCallocLike = MallocLike | CallocLike | AlignedAllocLike,
+ MallocOrOpNewLike = MallocLike | OpNewLike,
+ MallocOrCallocLike = MallocLike | OpNewLike | CallocLike | AlignedAllocLike,
AllocLike = MallocOrCallocLike | StrDupLike,
AnyAlloc = AllocLike | ReallocLike
};
@@ -66,64 +67,59 @@ struct AllocFnsTy {
unsigned NumParams;
// First and Second size parameters (or -1 if unused)
int FstParam, SndParam;
+ // Alignment parameter for aligned_alloc and aligned new
+ int AlignParam;
};
// FIXME: certain users need more information. E.g., SimplifyLibCalls needs to
// know which functions are nounwind, noalias, nocapture parameters, etc.
static const std::pair<LibFunc, AllocFnsTy> AllocationFnData[] = {
- {LibFunc_malloc, {MallocLike, 1, 0, -1}},
- {LibFunc_vec_malloc, {MallocLike, 1, 0, -1}},
- {LibFunc_valloc, {MallocLike, 1, 0, -1}},
- {LibFunc_Znwj, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
- {LibFunc_ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
- {LibFunc_ZnwjSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new(unsigned int, align_val_t)
- {LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t, // new(unsigned int, align_val_t, nothrow)
- {MallocLike, 3, 0, -1}},
- {LibFunc_Znwm, {OpNewLike, 1, 0, -1}}, // new(unsigned long)
- {LibFunc_ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned long, nothrow)
- {LibFunc_ZnwmSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new(unsigned long, align_val_t)
- {LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t, // new(unsigned long, align_val_t, nothrow)
- {MallocLike, 3, 0, -1}},
- {LibFunc_Znaj, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
- {LibFunc_ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
- {LibFunc_ZnajSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new[](unsigned int, align_val_t)
- {LibFunc_ZnajSt11align_val_tRKSt9nothrow_t, // new[](unsigned int, align_val_t, nothrow)
- {MallocLike, 3, 0, -1}},
- {LibFunc_Znam, {OpNewLike, 1, 0, -1}}, // new[](unsigned long)
- {LibFunc_ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned long, nothrow)
- {LibFunc_ZnamSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new[](unsigned long, align_val_t)
- {LibFunc_ZnamSt11align_val_tRKSt9nothrow_t, // new[](unsigned long, align_val_t, nothrow)
- {MallocLike, 3, 0, -1}},
- {LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1}}, // new(unsigned int)
- {LibFunc_msvc_new_int_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow)
- {LibFunc_msvc_new_longlong, {OpNewLike, 1, 0, -1}}, // new(unsigned long long)
- {LibFunc_msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned long long, nothrow)
- {LibFunc_msvc_new_array_int, {OpNewLike, 1, 0, -1}}, // new[](unsigned int)
- {LibFunc_msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
- {LibFunc_msvc_new_array_longlong, {OpNewLike, 1, 0, -1}}, // new[](unsigned long long)
- {LibFunc_msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned long long, nothrow)
- {LibFunc_aligned_alloc, {AlignedAllocLike, 2, 1, -1}},
- {LibFunc_memalign, {AlignedAllocLike, 2, 1, -1}},
- {LibFunc_calloc, {CallocLike, 2, 0, 1}},
- {LibFunc_vec_calloc, {CallocLike, 2, 0, 1}},
- {LibFunc_realloc, {ReallocLike, 2, 1, -1}},
- {LibFunc_vec_realloc, {ReallocLike, 2, 1, -1}},
- {LibFunc_reallocf, {ReallocLike, 2, 1, -1}},
- {LibFunc_strdup, {StrDupLike, 1, -1, -1}},
- {LibFunc_strndup, {StrDupLike, 2, 1, -1}},
- {LibFunc___kmpc_alloc_shared, {MallocLike, 1, 0, -1}},
- // TODO: Handle "int posix_memalign(void **, size_t, size_t)"
+ {LibFunc_malloc, {MallocLike, 1, 0, -1, -1}},
+ {LibFunc_vec_malloc, {MallocLike, 1, 0, -1, -1}},
+ {LibFunc_valloc, {MallocLike, 1, 0, -1, -1}},
+ {LibFunc_Znwj, {OpNewLike, 1, 0, -1, -1}}, // new(unsigned int)
+ {LibFunc_ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1}}, // new(unsigned int, nothrow)
+ {LibFunc_ZnwjSt11align_val_t, {OpNewLike, 2, 0, -1, 1}}, // new(unsigned int, align_val_t)
+ {LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1}}, // new(unsigned int, align_val_t, nothrow)
+ {LibFunc_Znwm, {OpNewLike, 1, 0, -1, -1}}, // new(unsigned long)
+ {LibFunc_ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1}}, // new(unsigned long, nothrow)
+ {LibFunc_ZnwmSt11align_val_t, {OpNewLike, 2, 0, -1, 1}}, // new(unsigned long, align_val_t)
+ {LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1}}, // new(unsigned long, align_val_t, nothrow)
+ {LibFunc_Znaj, {OpNewLike, 1, 0, -1, -1}}, // new[](unsigned int)
+ {LibFunc_ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1}}, // new[](unsigned int, nothrow)
+ {LibFunc_ZnajSt11align_val_t, {OpNewLike, 2, 0, -1, 1}}, // new[](unsigned int, align_val_t)
+ {LibFunc_ZnajSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1}}, // new[](unsigned int, align_val_t, nothrow)
+ {LibFunc_Znam, {OpNewLike, 1, 0, -1, -1}}, // new[](unsigned long)
+ {LibFunc_ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1, -1}}, // new[](unsigned long, nothrow)
+ {LibFunc_ZnamSt11align_val_t, {OpNewLike, 2, 0, -1, 1}}, // new[](unsigned long, align_val_t)
+ {LibFunc_ZnamSt11align_val_tRKSt9nothrow_t, {MallocLike, 3, 0, -1, 1}}, // new[](unsigned long, align_val_t, nothrow)
+ {LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1, -1}}, // new(unsigned int)
+ {LibFunc_msvc_new_int_nothrow, {MallocLike, 2, 0, -1, -1}}, // new(unsigned int, nothrow)
+ {LibFunc_msvc_new_longlong, {OpNewLike, 1, 0, -1, -1}}, // new(unsigned long long)
+ {LibFunc_msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1, -1}}, // new(unsigned long long, nothrow)
+ {LibFunc_msvc_new_array_int, {OpNewLike, 1, 0, -1, -1}}, // new[](unsigned int)
+ {LibFunc_msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1, -1}}, // new[](unsigned int, nothrow)
+ {LibFunc_msvc_new_array_longlong, {OpNewLike, 1, 0, -1, -1}}, // new[](unsigned long long)
+ {LibFunc_msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1, -1}}, // new[](unsigned long long, nothrow)
+ {LibFunc_aligned_alloc, {AlignedAllocLike, 2, 1, -1, 0}},
+ {LibFunc_memalign, {AlignedAllocLike, 2, 1, -1, 0}},
+ {LibFunc_calloc, {CallocLike, 2, 0, 1, -1}},
+ {LibFunc_vec_calloc, {CallocLike, 2, 0, 1, -1}},
+ {LibFunc_realloc, {ReallocLike, 2, 1, -1, -1}},
+ {LibFunc_vec_realloc, {ReallocLike, 2, 1, -1, -1}},
+ {LibFunc_reallocf, {ReallocLike, 2, 1, -1, -1}},
+ {LibFunc_strdup, {StrDupLike, 1, -1, -1, -1}},
+ {LibFunc_strndup, {StrDupLike, 2, 1, -1, -1}},
+ {LibFunc___kmpc_alloc_shared, {MallocLike, 1, 0, -1, -1}},
+ // TODO: Handle "int posix_memalign(void **, size_t, size_t)"
};
-static const Function *getCalledFunction(const Value *V, bool LookThroughBitCast,
+static const Function *getCalledFunction(const Value *V,
bool &IsNoBuiltin) {
// Don't care about intrinsics in this case.
if (isa<IntrinsicInst>(V))
return nullptr;
- if (LookThroughBitCast)
- V = V->stripPointerCasts();
-
const auto *CB = dyn_cast<CallBase>(V);
if (!CB)
return nullptr;
@@ -175,11 +171,9 @@ getAllocationDataForFunction(const Function *Callee, AllocType AllocTy,
}
static Optional<AllocFnsTy> getAllocationData(const Value *V, AllocType AllocTy,
- const TargetLibraryInfo *TLI,
- bool LookThroughBitCast = false) {
+ const TargetLibraryInfo *TLI) {
bool IsNoBuiltinCall;
- if (const Function *Callee =
- getCalledFunction(V, LookThroughBitCast, IsNoBuiltinCall))
+ if (const Function *Callee = getCalledFunction(V, IsNoBuiltinCall))
if (!IsNoBuiltinCall)
return getAllocationDataForFunction(Callee, AllocTy, TLI);
return None;
@@ -187,11 +181,9 @@ static Optional<AllocFnsTy> getAllocationData(const Value *V, AllocType AllocTy,
static Optional<AllocFnsTy>
getAllocationData(const Value *V, AllocType AllocTy,
- function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
- bool LookThroughBitCast = false) {
+ function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
bool IsNoBuiltinCall;
- if (const Function *Callee =
- getCalledFunction(V, LookThroughBitCast, IsNoBuiltinCall))
+ if (const Function *Callee = getCalledFunction(V, IsNoBuiltinCall))
if (!IsNoBuiltinCall)
return getAllocationDataForFunction(
Callee, AllocTy, &GetTLI(const_cast<Function &>(*Callee)));
@@ -202,7 +194,7 @@ static Optional<AllocFnsTy> getAllocationSize(const Value *V,
const TargetLibraryInfo *TLI) {
bool IsNoBuiltinCall;
const Function *Callee =
- getCalledFunction(V, /*LookThroughBitCast=*/false, IsNoBuiltinCall);
+ getCalledFunction(V, IsNoBuiltinCall);
if (!Callee)
return None;
@@ -226,92 +218,57 @@ static Optional<AllocFnsTy> getAllocationSize(const Value *V,
Result.NumParams = Callee->getNumOperands();
Result.FstParam = Args.first;
Result.SndParam = Args.second.getValueOr(-1);
+ // Allocsize has no way to specify an alignment argument
+ Result.AlignParam = -1;
return Result;
}
-static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) {
- const auto *CB =
- dyn_cast<CallBase>(LookThroughBitCast ? V->stripPointerCasts() : V);
- return CB && CB->hasRetAttr(Attribute::NoAlias);
-}
-
/// Tests if a value is a call or invoke to a library function that
/// allocates or reallocates memory (either malloc, calloc, realloc, or strdup
/// like).
-bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, AnyAlloc, TLI, LookThroughBitCast).hasValue();
+bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI) {
+ return getAllocationData(V, AnyAlloc, TLI).hasValue();
}
bool llvm::isAllocationFn(
- const Value *V, function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, AnyAlloc, GetTLI, LookThroughBitCast).hasValue();
-}
-
-/// Tests if a value is a call or invoke to a function that returns a
-/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions).
-bool llvm::isNoAliasFn(const Value *V, const TargetLibraryInfo *TLI,
- bool LookThroughBitCast) {
- // it's safe to consider realloc as noalias since accessing the original
- // pointer is undefined behavior
- return isAllocationFn(V, TLI, LookThroughBitCast) ||
- hasNoAliasAttr(V, LookThroughBitCast);
+ const Value *V, function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
+ return getAllocationData(V, AnyAlloc, GetTLI).hasValue();
}
/// Tests if a value is a call or invoke to a library function that
/// allocates uninitialized memory (such as malloc).
-bool llvm::isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, MallocLike, TLI, LookThroughBitCast).hasValue();
-}
-bool llvm::isMallocLikeFn(
- const Value *V, function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, MallocLike, GetTLI, LookThroughBitCast)
- .hasValue();
+static bool isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) {
+ return getAllocationData(V, MallocOrOpNewLike, TLI).hasValue();
}
/// Tests if a value is a call or invoke to a library function that
/// allocates uninitialized memory with alignment (such as aligned_alloc).
-bool llvm::isAlignedAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, AlignedAllocLike, TLI, LookThroughBitCast)
- .hasValue();
-}
-bool llvm::isAlignedAllocLikeFn(
- const Value *V, function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, AlignedAllocLike, GetTLI, LookThroughBitCast)
+static bool isAlignedAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI) {
+ return getAllocationData(V, AlignedAllocLike, TLI)
.hasValue();
}
/// Tests if a value is a call or invoke to a library function that
/// allocates zero-filled memory (such as calloc).
-bool llvm::isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, CallocLike, TLI, LookThroughBitCast).hasValue();
+static bool isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) {
+ return getAllocationData(V, CallocLike, TLI).hasValue();
}
/// Tests if a value is a call or invoke to a library function that
/// allocates memory similar to malloc or calloc.
-bool llvm::isMallocOrCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, MallocOrCallocLike, TLI,
- LookThroughBitCast).hasValue();
+bool llvm::isMallocOrCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) {
+ return getAllocationData(V, MallocOrCallocLike, TLI).hasValue();
}
/// Tests if a value is a call or invoke to a library function that
/// allocates memory (either malloc, calloc, or strdup like).
-bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, AllocLike, TLI, LookThroughBitCast).hasValue();
+bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI) {
+ return getAllocationData(V, AllocLike, TLI).hasValue();
}
/// Tests if a value is a call or invoke to a library function that
/// reallocates memory (e.g., realloc).
-bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, ReallocLike, TLI, LookThroughBitCast).hasValue();
+bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) {
+ return getAllocationData(V, ReallocLike, TLI).hasValue();
}
/// Tests if a function is a call or invoke to a library function that
@@ -320,113 +277,122 @@ bool llvm::isReallocLikeFn(const Function *F, const TargetLibraryInfo *TLI) {
return getAllocationDataForFunction(F, ReallocLike, TLI).hasValue();
}
-/// Tests if a value is a call or invoke to a library function that
-/// allocates memory and throws if an allocation failed (e.g., new).
-bool llvm::isOpNewLikeFn(const Value *V, const TargetLibraryInfo *TLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, OpNewLike, TLI, LookThroughBitCast).hasValue();
-}
+bool llvm::isAllocRemovable(const CallBase *CB, const TargetLibraryInfo *TLI) {
+ assert(isAllocationFn(CB, TLI));
-/// Tests if a value is a call or invoke to a library function that
-/// allocates memory (strdup, strndup).
-bool llvm::isStrdupLikeFn(const Value *V, const TargetLibraryInfo *TLI,
- bool LookThroughBitCast) {
- return getAllocationData(V, StrDupLike, TLI, LookThroughBitCast).hasValue();
-}
+ // Note: Removability is highly dependent on the source language. For
+ // example, recent C++ requires direct calls to the global allocation
+ // [basic.stc.dynamic.allocation] to be observable unless part of a new
+ // expression [expr.new paragraph 13].
-/// extractMallocCall - Returns the corresponding CallInst if the instruction
-/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we
-/// ignore InvokeInst here.
-const CallInst *llvm::extractMallocCall(
- const Value *I,
- function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
- return isMallocLikeFn(I, GetTLI) ? dyn_cast<CallInst>(I) : nullptr;
+ // Historically we've treated the C family allocation routines as removable
+ return isAllocLikeFn(CB, TLI);
}
-static Value *computeArraySize(const CallInst *CI, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- bool LookThroughSExt = false) {
- if (!CI)
- return nullptr;
+Value *llvm::getAllocAlignment(const CallBase *V,
+ const TargetLibraryInfo *TLI) {
+ assert(isAllocationFn(V, TLI));
- // The size of the malloc's result type must be known to determine array size.
- Type *T = getMallocAllocatedType(CI, TLI);
- if (!T || !T->isSized())
+ const Optional<AllocFnsTy> FnData = getAllocationData(V, AnyAlloc, TLI);
+ if (!FnData.hasValue() || FnData->AlignParam < 0) {
return nullptr;
+ }
+ return V->getOperand(FnData->AlignParam);
+}
- unsigned ElementSize = DL.getTypeAllocSize(T);
- if (StructType *ST = dyn_cast<StructType>(T))
- ElementSize = DL.getStructLayout(ST)->getSizeInBytes();
+/// When we're compiling N-bit code, and the user uses parameters that are
+/// greater than N bits (e.g. uint64_t on a 32-bit build), we can run into
+/// trouble with APInt size issues. This function handles resizing + overflow
+/// checks for us. Check and zext or trunc \p I depending on IntTyBits and
+/// I's value.
+static bool CheckedZextOrTrunc(APInt &I, unsigned IntTyBits) {
+ // More bits than we can handle. Checking the bit width isn't necessary, but
+ // it's faster than checking active bits, and should give `false` in the
+ // vast majority of cases.
+ if (I.getBitWidth() > IntTyBits && I.getActiveBits() > IntTyBits)
+ return false;
+ if (I.getBitWidth() != IntTyBits)
+ I = I.zextOrTrunc(IntTyBits);
+ return true;
+}
- // If malloc call's arg can be determined to be a multiple of ElementSize,
- // return the multiple. Otherwise, return NULL.
- Value *MallocArg = CI->getArgOperand(0);
- Value *Multiple = nullptr;
- if (ComputeMultiple(MallocArg, ElementSize, Multiple, LookThroughSExt))
- return Multiple;
+Optional<APInt>
+llvm::getAllocSize(const CallBase *CB,
+ const TargetLibraryInfo *TLI,
+ std::function<const Value*(const Value*)> Mapper) {
+ // Note: This handles both explicitly listed allocation functions and
+ // allocsize. The code structure could stand to be cleaned up a bit.
+ Optional<AllocFnsTy> FnData = getAllocationSize(CB, TLI);
+ if (!FnData)
+ return None;
- return nullptr;
-}
+ // Get the index type for this address space, results and intermediate
+ // computations are performed at that width.
+ auto &DL = CB->getModule()->getDataLayout();
+ const unsigned IntTyBits = DL.getIndexTypeSizeInBits(CB->getType());
+
+ // Handle strdup-like functions separately.
+ if (FnData->AllocTy == StrDupLike) {
+ APInt Size(IntTyBits, GetStringLength(Mapper(CB->getArgOperand(0))));
+ if (!Size)
+ return None;
-/// getMallocType - Returns the PointerType resulting from the malloc call.
-/// The PointerType depends on the number of bitcast uses of the malloc call:
-/// 0: PointerType is the calls' return type.
-/// 1: PointerType is the bitcast's result type.
-/// >1: Unique PointerType cannot be determined, return NULL.
-PointerType *llvm::getMallocType(const CallInst *CI,
- const TargetLibraryInfo *TLI) {
- assert(isMallocLikeFn(CI, TLI) && "getMallocType and not malloc call");
-
- PointerType *MallocType = nullptr;
- unsigned NumOfBitCastUses = 0;
-
- // Determine if CallInst has a bitcast use.
- for (const User *U : CI->users())
- if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
- MallocType = cast<PointerType>(BCI->getDestTy());
- NumOfBitCastUses++;
+ // Strndup limits strlen.
+ if (FnData->FstParam > 0) {
+ const ConstantInt *Arg =
+ dyn_cast<ConstantInt>(Mapper(CB->getArgOperand(FnData->FstParam)));
+ if (!Arg)
+ return None;
+
+ APInt MaxSize = Arg->getValue().zextOrSelf(IntTyBits);
+ if (Size.ugt(MaxSize))
+ Size = MaxSize + 1;
}
+ return Size;
+ }
- // Malloc call has 1 bitcast use, so type is the bitcast's destination type.
- if (NumOfBitCastUses == 1)
- return MallocType;
+ const ConstantInt *Arg =
+ dyn_cast<ConstantInt>(Mapper(CB->getArgOperand(FnData->FstParam)));
+ if (!Arg)
+ return None;
- // Malloc call was not bitcast, so type is the malloc function's return type.
- if (NumOfBitCastUses == 0)
- return cast<PointerType>(CI->getType());
+ APInt Size = Arg->getValue();
+ if (!CheckedZextOrTrunc(Size, IntTyBits))
+ return None;
- // Type could not be determined.
- return nullptr;
-}
+ // Size is determined by just 1 parameter.
+ if (FnData->SndParam < 0)
+ return Size;
-/// getMallocAllocatedType - Returns the Type allocated by malloc call.
-/// The Type depends on the number of bitcast uses of the malloc call:
-/// 0: PointerType is the malloc calls' return type.
-/// 1: PointerType is the bitcast's result type.
-/// >1: Unique PointerType cannot be determined, return NULL.
-Type *llvm::getMallocAllocatedType(const CallInst *CI,
- const TargetLibraryInfo *TLI) {
- PointerType *PT = getMallocType(CI, TLI);
- return PT ? PT->getElementType() : nullptr;
-}
+ Arg = dyn_cast<ConstantInt>(Mapper(CB->getArgOperand(FnData->SndParam)));
+ if (!Arg)
+ return None;
+
+ APInt NumElems = Arg->getValue();
+ if (!CheckedZextOrTrunc(NumElems, IntTyBits))
+ return None;
-/// getMallocArraySize - Returns the array size of a malloc call. If the
-/// argument passed to malloc is a multiple of the size of the malloced type,
-/// then return that multiple. For non-array mallocs, the multiple is
-/// constant 1. Otherwise, return NULL for mallocs whose array size cannot be
-/// determined.
-Value *llvm::getMallocArraySize(CallInst *CI, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- bool LookThroughSExt) {
- assert(isMallocLikeFn(CI, TLI) && "getMallocArraySize and not malloc call");
- return computeArraySize(CI, DL, TLI, LookThroughSExt);
+ bool Overflow;
+ Size = Size.umul_ov(NumElems, Overflow);
+ if (Overflow)
+ return None;
+ return Size;
}
-/// extractCallocCall - Returns the corresponding CallInst if the instruction
-/// is a calloc call.
-const CallInst *llvm::extractCallocCall(const Value *I,
- const TargetLibraryInfo *TLI) {
- return isCallocLikeFn(I, TLI) ? cast<CallInst>(I) : nullptr;
+Constant *llvm::getInitialValueOfAllocation(const CallBase *Alloc,
+ const TargetLibraryInfo *TLI,
+ Type *Ty) {
+ assert(isAllocationFn(Alloc, TLI));
+
+ // malloc and aligned_alloc are uninitialized (undef)
+ if (isMallocLikeFn(Alloc, TLI) || isAlignedAllocLikeFn(Alloc, TLI))
+ return UndefValue::get(Ty);
+
+ // calloc zero initializes
+ if (isCallocLikeFn(Alloc, TLI))
+ return Constant::getNullValue(Ty);
+
+ return nullptr;
}
/// isLibFreeFunction - Returns true if the function is a builtin free()
@@ -485,8 +451,7 @@ bool llvm::isLibFreeFunction(const Function *F, const LibFunc TLIFn) {
/// isFreeCall - Returns non-null if the value is a call to the builtin free()
const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
bool IsNoBuiltinCall;
- const Function *Callee =
- getCalledFunction(I, /*LookThroughBitCast=*/false, IsNoBuiltinCall);
+ const Function *Callee = getCalledFunction(I, IsNoBuiltinCall);
if (Callee == nullptr || IsNoBuiltinCall)
return nullptr;
@@ -644,20 +609,8 @@ SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) {
return unknown();
}
-/// When we're compiling N-bit code, and the user uses parameters that are
-/// greater than N bits (e.g. uint64_t on a 32-bit build), we can run into
-/// trouble with APInt size issues. This function handles resizing + overflow
-/// checks for us. Check and zext or trunc \p I depending on IntTyBits and
-/// I's value.
bool ObjectSizeOffsetVisitor::CheckedZextOrTrunc(APInt &I) {
- // More bits than we can handle. Checking the bit width isn't necessary, but
- // it's faster than checking active bits, and should give `false` in the
- // vast majority of cases.
- if (I.getBitWidth() > IntTyBits && I.getActiveBits() > IntTyBits)
- return false;
- if (I.getBitWidth() != IntTyBits)
- I = I.zextOrTrunc(IntTyBits);
- return true;
+ return ::CheckedZextOrTrunc(I, IntTyBits);
}
SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) {
@@ -698,61 +651,10 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
}
SizeOffsetType ObjectSizeOffsetVisitor::visitCallBase(CallBase &CB) {
- Optional<AllocFnsTy> FnData = getAllocationSize(&CB, TLI);
- if (!FnData)
- return unknown();
-
- // Handle strdup-like functions separately.
- if (FnData->AllocTy == StrDupLike) {
- APInt Size(IntTyBits, GetStringLength(CB.getArgOperand(0)));
- if (!Size)
- return unknown();
-
- // Strndup limits strlen.
- if (FnData->FstParam > 0) {
- ConstantInt *Arg =
- dyn_cast<ConstantInt>(CB.getArgOperand(FnData->FstParam));
- if (!Arg)
- return unknown();
-
- APInt MaxSize = Arg->getValue().zextOrSelf(IntTyBits);
- if (Size.ugt(MaxSize))
- Size = MaxSize + 1;
- }
- return std::make_pair(Size, Zero);
- }
-
- ConstantInt *Arg = dyn_cast<ConstantInt>(CB.getArgOperand(FnData->FstParam));
- if (!Arg)
- return unknown();
-
- APInt Size = Arg->getValue();
- if (!CheckedZextOrTrunc(Size))
- return unknown();
-
- // Size is determined by just 1 parameter.
- if (FnData->SndParam < 0)
- return std::make_pair(Size, Zero);
-
- Arg = dyn_cast<ConstantInt>(CB.getArgOperand(FnData->SndParam));
- if (!Arg)
- return unknown();
-
- APInt NumElems = Arg->getValue();
- if (!CheckedZextOrTrunc(NumElems))
- return unknown();
-
- bool Overflow;
- Size = Size.umul_ov(NumElems, Overflow);
- return Overflow ? unknown() : std::make_pair(Size, Zero);
-
- // TODO: handle more standard functions (+ wchar cousins):
- // - strdup / strndup
- // - strcpy / strncpy
- // - strcat / strncat
- // - memcpy / memmove
- // - strcat / strncat
- // - memset
+ auto Mapper = [](const Value *V) { return V; };
+ if (Optional<APInt> Size = getAllocSize(&CB, TLI, Mapper))
+ return std::make_pair(*Size, Zero);
+ return unknown();
}
SizeOffsetType
@@ -976,7 +878,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallBase(CallBase &CB) {
// Handle strdup-like functions separately.
if (FnData->AllocTy == StrDupLike) {
- // TODO
+ // TODO: implement evaluation of strdup/strndup
return unknown();
}
@@ -989,14 +891,6 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallBase(CallBase &CB) {
SecondArg = Builder.CreateZExtOrTrunc(SecondArg, IntTy);
Value *Size = Builder.CreateMul(FirstArg, SecondArg);
return std::make_pair(Size, Zero);
-
- // TODO: handle more standard functions (+ wchar cousins):
- // - strdup / strndup
- // - strcpy / strncpy
- // - strcat / strncat
- // - memcpy / memmove
- // - strcat / strncat
- // - memset
}
SizeOffsetEvalType
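The MemoryBuiltins rewrite replaces the bitcast-driven getMallocType/getMallocArraySize helpers with call-site queries. A sketch of how a client could use the new entry points, assuming CB has already passed isAllocationFn and LoadTy is the type of a later load from the allocation; the identity mapper matches the one ObjectSizeOffsetVisitor::visitCallBase now passes:

  #include "llvm/Analysis/MemoryBuiltins.h"
  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/IR/InstrTypes.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  static void describeAllocation(const CallBase *CB, const TargetLibraryInfo *TLI,
                                 Type *LoadTy) {
    auto Identity = [](const Value *V) { return V; };
    // Constant allocation size in bytes, computed at index-type width.
    if (Optional<APInt> Size = getAllocSize(CB, TLI, Identity))
      errs() << "size: " << *Size << "\n";
    // Alignment operand of aligned_alloc / aligned operator new, if any.
    if (Value *AlignArg = getAllocAlignment(CB, TLI))
      errs() << "align operand: " << *AlignArg << "\n";
    // undef for malloc-like calls, zeroinitializer for calloc-like calls.
    if (Constant *Init = getInitialValueOfAllocation(CB, TLI, LoadTy))
      errs() << "initial value: " << *Init << "\n";
  }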
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index da6bb4c49cba..36df462c7a66 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -594,7 +594,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
// turn into undef. Note that we can bypass the allocation itself when
// looking for a clobber in many cases; that's an alias property and is
// handled by BasicAA.
- if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst, &TLI)) {
+ if (isa<AllocaInst>(Inst) || isNoAliasCall(Inst)) {
const Value *AccessPtr = getUnderlyingObject(MemLoc.Ptr);
if (AccessPtr == Inst || BatchAA.isMustAlias(Inst, AccessPtr))
return MemDepResult::getDef(Inst);
diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp
index ac20e20f0c0d..57f431ec21f5 100644
--- a/llvm/lib/Analysis/MemorySSA.cpp
+++ b/llvm/lib/Analysis/MemorySSA.cpp
@@ -1265,8 +1265,8 @@ void MemorySSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) {
}
MemorySSA::MemorySSA(Function &Func, AliasAnalysis *AA, DominatorTree *DT)
- : AA(nullptr), DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr),
- SkipWalker(nullptr), NextID(0) {
+ : DT(DT), F(Func), LiveOnEntryDef(nullptr), Walker(nullptr),
+ SkipWalker(nullptr) {
// Build MemorySSA using a batch alias analysis. This reuses the internal
// state that AA collects during an alias()/getModRefInfo() call. This is
// safe because there are no CFG changes while building MemorySSA and can
diff --git a/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp b/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp
index 941458f648bc..fab51d6a7aaf 100644
--- a/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp
+++ b/llvm/lib/Analysis/ModelUnderTrainingRunner.cpp
@@ -22,12 +22,13 @@ ModelUnderTrainingRunner::ModelUnderTrainingRunner(
LLVMContext &Ctx, const std::string &ModelPath,
const std::vector<TensorSpec> &InputSpecs,
const std::vector<LoggedFeatureSpec> &OutputSpecs)
- : MLModelRunner(Ctx), OutputSpecs(OutputSpecs) {
+ : MLModelRunner(Ctx, MLModelRunner::Kind::Development),
+ OutputSpecs(OutputSpecs) {
Evaluator = std::make_unique<TFModelEvaluator>(
ModelPath, InputSpecs, [&](size_t I) { return OutputSpecs[I].Spec; },
OutputSpecs.size());
if (!Evaluator || !Evaluator->isValid()) {
- Ctx.emitError("Failed to create inliner saved model evaluator");
+ Ctx.emitError("Failed to create saved model evaluator");
Evaluator.reset();
return;
}
@@ -46,4 +47,21 @@ void *ModelUnderTrainingRunner::getTensorUntyped(size_t Index) {
return Evaluator->getUntypedInput(Index);
}
+std::unique_ptr<ModelUnderTrainingRunner>
+ModelUnderTrainingRunner::createAndEnsureValid(
+ LLVMContext &Ctx, const std::string &ModelPath, StringRef DecisionName,
+ const std::vector<TensorSpec> &InputSpecs,
+ StringRef OutputSpecsPathOverride) {
+ std::unique_ptr<ModelUnderTrainingRunner> MUTR;
+ if (auto MaybeOutputSpecs = loadOutputSpecs(Ctx, DecisionName, ModelPath,
+ OutputSpecsPathOverride))
+ MUTR.reset(new ModelUnderTrainingRunner(Ctx, ModelPath, InputSpecs,
+ *MaybeOutputSpecs));
+ if (MUTR && MUTR->isValid())
+ return MUTR;
+
+ Ctx.emitError("Could not load the policy model from the provided path");
+ return nullptr;
+}
+
#endif // defined(LLVM_HAVE_TF_API)
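createAndEnsureValid folds output-spec loading and validity checking into one factory that emits an error on the context and returns null on failure. A hedged usage sketch, only meaningful in builds with LLVM_HAVE_TF_API; the header paths, model path, decision name, and input specs below are assumptions, not values taken from this patch:

  #include "llvm/Analysis/ModelUnderTrainingRunner.h"
  #include "llvm/Analysis/Utils/TFUtils.h" // TensorSpec (header path assumed)
  #include <memory>
  #include <vector>
  using namespace llvm;

  static std::unique_ptr<ModelUnderTrainingRunner>
  loadTrainingRunner(LLVMContext &Ctx, const std::vector<TensorSpec> &InputSpecs) {
    // Emits an error on Ctx and returns nullptr if the model cannot be loaded.
    return ModelUnderTrainingRunner::createAndEnsureValid(
        Ctx, "/path/to/saved_model", "inlining_decision", InputSpecs,
        /*OutputSpecsPathOverride=*/"");
  }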
diff --git a/llvm/lib/Analysis/NoInferenceModelRunner.cpp b/llvm/lib/Analysis/NoInferenceModelRunner.cpp
index 02ece6aa3900..7178120ebe4f 100644
--- a/llvm/lib/Analysis/NoInferenceModelRunner.cpp
+++ b/llvm/lib/Analysis/NoInferenceModelRunner.cpp
@@ -20,7 +20,7 @@ using namespace llvm;
NoInferenceModelRunner::NoInferenceModelRunner(
LLVMContext &Ctx, const std::vector<TensorSpec> &Inputs)
- : MLModelRunner(Ctx) {
+ : MLModelRunner(Ctx, MLModelRunner::Kind::NoOp) {
ValuesBuffer.reserve(Inputs.size());
for (const auto &TS : Inputs)
ValuesBuffer.push_back(std::make_unique<char[]>(TS.getElementCount() *
diff --git a/llvm/lib/Analysis/ObjCARCInstKind.cpp b/llvm/lib/Analysis/ObjCARCInstKind.cpp
index f74a9f7f104f..d177ee056a93 100644
--- a/llvm/lib/Analysis/ObjCARCInstKind.cpp
+++ b/llvm/lib/Analysis/ObjCARCInstKind.cpp
@@ -32,8 +32,8 @@ raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS,
return OS << "ARCInstKind::Retain";
case ARCInstKind::RetainRV:
return OS << "ARCInstKind::RetainRV";
- case ARCInstKind::ClaimRV:
- return OS << "ARCInstKind::ClaimRV";
+ case ARCInstKind::UnsafeClaimRV:
+ return OS << "ARCInstKind::UnsafeClaimRV";
case ARCInstKind::RetainBlock:
return OS << "ARCInstKind::RetainBlock";
case ARCInstKind::Release:
@@ -127,7 +127,7 @@ ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) {
case Intrinsic::objc_clang_arc_use:
return ARCInstKind::IntrinsicUser;
case Intrinsic::objc_unsafeClaimAutoreleasedReturnValue:
- return ARCInstKind::ClaimRV;
+ return ARCInstKind::UnsafeClaimRV;
case Intrinsic::objc_retainedObject:
return ARCInstKind::NoopCast;
case Intrinsic::objc_unretainedObject:
@@ -334,7 +334,7 @@ bool llvm::objcarc::IsUser(ARCInstKind Class) {
case ARCInstKind::StoreStrong:
case ARCInstKind::Call:
case ARCInstKind::None:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
return false;
}
llvm_unreachable("covered switch isn't covered?");
@@ -370,7 +370,7 @@ bool llvm::objcarc::IsRetain(ARCInstKind Class) {
case ARCInstKind::Call:
case ARCInstKind::User:
case ARCInstKind::None:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
return false;
}
llvm_unreachable("covered switch isn't covered?");
@@ -384,7 +384,7 @@ bool llvm::objcarc::IsAutorelease(ARCInstKind Class) {
return true;
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
case ARCInstKind::RetainBlock:
case ARCInstKind::Release:
case ARCInstKind::AutoreleasepoolPush:
@@ -416,7 +416,7 @@ bool llvm::objcarc::IsForwarding(ARCInstKind Class) {
switch (Class) {
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
case ARCInstKind::Autorelease:
case ARCInstKind::AutoreleaseRV:
case ARCInstKind::NoopCast:
@@ -451,7 +451,7 @@ bool llvm::objcarc::IsNoopOnNull(ARCInstKind Class) {
switch (Class) {
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
case ARCInstKind::Release:
case ARCInstKind::Autorelease:
case ARCInstKind::AutoreleaseRV:
@@ -486,7 +486,7 @@ bool llvm::objcarc::IsNoopOnGlobal(ARCInstKind Class) {
switch (Class) {
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
case ARCInstKind::Release:
case ARCInstKind::Autorelease:
case ARCInstKind::AutoreleaseRV:
@@ -522,7 +522,7 @@ bool llvm::objcarc::IsAlwaysTail(ARCInstKind Class) {
switch (Class) {
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
case ARCInstKind::AutoreleaseRV:
return true;
case ARCInstKind::Release:
@@ -563,7 +563,7 @@ bool llvm::objcarc::IsNeverTail(ARCInstKind Class) {
return true;
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
case ARCInstKind::AutoreleaseRV:
case ARCInstKind::Release:
case ARCInstKind::RetainBlock:
@@ -598,7 +598,7 @@ bool llvm::objcarc::IsNoThrow(ARCInstKind Class) {
switch (Class) {
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
case ARCInstKind::Release:
case ARCInstKind::Autorelease:
case ARCInstKind::AutoreleaseRV:
@@ -643,7 +643,7 @@ bool llvm::objcarc::CanInterruptRV(ARCInstKind Class) {
return true;
case ARCInstKind::Retain:
case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
case ARCInstKind::Release:
case ARCInstKind::AutoreleasepoolPush:
case ARCInstKind::RetainBlock:
@@ -696,7 +696,7 @@ bool llvm::objcarc::CanDecrementRefCount(ARCInstKind Kind) {
case ARCInstKind::StoreStrong:
case ARCInstKind::CallOrUser:
case ARCInstKind::Call:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
return true;
}
diff --git a/llvm/lib/Analysis/PHITransAddr.cpp b/llvm/lib/Analysis/PHITransAddr.cpp
index 4c80f6743411..02d084937ccb 100644
--- a/llvm/lib/Analysis/PHITransAddr.cpp
+++ b/llvm/lib/Analysis/PHITransAddr.cpp
@@ -226,7 +226,8 @@ Value *PHITransAddr::PHITranslateSubExpr(Value *V, BasicBlock *CurBB,
return GEP;
// Simplify the GEP to handle 'gep x, 0' -> x etc.
- if (Value *V = SimplifyGEPInst(GEP->getSourceElementType(), GEPOps,
+ if (Value *V = SimplifyGEPInst(GEP->getSourceElementType(), GEPOps[0],
+ ArrayRef<Value *>(GEPOps).slice(1),
GEP->isInBounds(), {DL, TLI, DT, AC})) {
for (unsigned i = 0, e = GEPOps.size(); i != e; ++i)
RemoveInstInputs(GEPOps[i], InstInputs);
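Editorial note on the SimplifyGEPInst change above: the simplifier now takes the pointer operand and the index list as separate arguments, so a caller that keeps a flat operand vector (pointer first, indices after) splits it with ArrayRef::slice, exactly as the call above does. A minimal sketch with hypothetical variable names and an assumed helper:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Value.h"

// Hypothetical flat operand vector: GEPOps[0] is the pointer, the rest are indices.
llvm::SmallVector<llvm::Value *, 8> GEPOps = collectGEPOperands(); // assumed helper
llvm::Value *BasePtr = GEPOps[0];
llvm::ArrayRef<llvm::Value *> Indices =
    llvm::ArrayRef<llvm::Value *>(GEPOps).slice(1); // drop the pointer, keep the indices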
diff --git a/llvm/lib/Analysis/RegionPass.cpp b/llvm/lib/Analysis/RegionPass.cpp
index c20ecff5f912..10c8569096c6 100644
--- a/llvm/lib/Analysis/RegionPass.cpp
+++ b/llvm/lib/Analysis/RegionPass.cpp
@@ -30,8 +30,7 @@ using namespace llvm;
char RGPassManager::ID = 0;
-RGPassManager::RGPassManager()
- : FunctionPass(ID), PMDataManager() {
+RGPassManager::RGPassManager() : FunctionPass(ID) {
RI = nullptr;
CurrentRegion = nullptr;
}
diff --git a/llvm/lib/Analysis/ReplayInlineAdvisor.cpp b/llvm/lib/Analysis/ReplayInlineAdvisor.cpp
index f83d8b0fd230..294bc38c17ad 100644
--- a/llvm/lib/Analysis/ReplayInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/ReplayInlineAdvisor.cpp
@@ -28,8 +28,7 @@ ReplayInlineAdvisor::ReplayInlineAdvisor(
std::unique_ptr<InlineAdvisor> OriginalAdvisor,
const ReplayInlinerSettings &ReplaySettings, bool EmitRemarks)
: InlineAdvisor(M, FAM), OriginalAdvisor(std::move(OriginalAdvisor)),
- HasReplayRemarks(false), ReplaySettings(ReplaySettings),
- EmitRemarks(EmitRemarks) {
+ ReplaySettings(ReplaySettings), EmitRemarks(EmitRemarks) {
auto BufferOrErr = MemoryBuffer::getFileOrSTDIN(ReplaySettings.ReplayFile);
std::error_code EC = BufferOrErr.getError();
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 0c3f32295ae1..07aac1523b47 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -301,7 +301,8 @@ void SCEV::print(raw_ostream &OS) const {
case scUMaxExpr:
case scSMaxExpr:
case scUMinExpr:
- case scSMinExpr: {
+ case scSMinExpr:
+ case scSequentialUMinExpr: {
const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(this);
const char *OpStr = nullptr;
switch (NAry->getSCEVType()) {
@@ -315,6 +316,9 @@ void SCEV::print(raw_ostream &OS) const {
case scSMinExpr:
OpStr = " smin ";
break;
+ case scSequentialUMinExpr:
+ OpStr = " umin_seq ";
+ break;
default:
llvm_unreachable("There are no other nary expression types.");
}
@@ -392,6 +396,8 @@ Type *SCEV::getType() const {
case scUMinExpr:
case scSMinExpr:
return cast<SCEVMinMaxExpr>(this)->getType();
+ case scSequentialUMinExpr:
+ return cast<SCEVSequentialMinMaxExpr>(this)->getType();
case scAddExpr:
return cast<SCEVAddExpr>(this)->getType();
case scUDivExpr:
@@ -774,7 +780,8 @@ CompareSCEVComplexity(EquivalenceClasses<const SCEV *> &EqCacheSCEV,
case scSMaxExpr:
case scUMaxExpr:
case scSMinExpr:
- case scUMinExpr: {
+ case scUMinExpr:
+ case scSequentialUMinExpr: {
const SCEVNAryExpr *LC = cast<SCEVNAryExpr>(LHS);
const SCEVNAryExpr *RC = cast<SCEVNAryExpr>(RHS);
@@ -2110,6 +2117,22 @@ ScalarEvolution::getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
return S;
}
+const SCEV *ScalarEvolution::getCastExpr(SCEVTypes Kind, const SCEV *Op,
+ Type *Ty) {
+ switch (Kind) {
+ case scTruncate:
+ return getTruncateExpr(Op, Ty);
+ case scZeroExtend:
+ return getZeroExtendExpr(Op, Ty);
+ case scSignExtend:
+ return getSignExtendExpr(Op, Ty);
+ case scPtrToInt:
+ return getPtrToIntExpr(Op, Ty);
+ default:
+ llvm_unreachable("Not a SCEV cast expression!");
+ }
+}
+
/// getAnyExtendExpr - Return a SCEV for the given operand extended with
/// unspecified bits out to the given type.
const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op,
@@ -3463,7 +3486,7 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
return S;
}
-static const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) {
+const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) {
APInt A = C1->getAPInt().abs();
APInt B = C2->getAPInt().abs();
uint32_t ABW = A.getBitWidth();
@@ -3721,6 +3744,7 @@ const SCEV *ScalarEvolution::getAbsExpr(const SCEV *Op, bool IsNSW) {
const SCEV *ScalarEvolution::getMinMaxExpr(SCEVTypes Kind,
SmallVectorImpl<const SCEV *> &Ops) {
+ assert(SCEVMinMaxExpr::isMinMaxType(Kind) && "Not a SCEVMinMaxExpr!");
assert(!Ops.empty() && "Cannot get empty (u|s)(min|max)!");
if (Ops.size() == 1) return Ops[0];
#ifndef NDEBUG
@@ -3857,6 +3881,209 @@ const SCEV *ScalarEvolution::getMinMaxExpr(SCEVTypes Kind,
return S;
}
+namespace {
+
+class SCEVSequentialMinMaxDeduplicatingVisitor final
+ : public SCEVVisitor<SCEVSequentialMinMaxDeduplicatingVisitor,
+ Optional<const SCEV *>> {
+ using RetVal = Optional<const SCEV *>;
+ using Base = SCEVVisitor<SCEVSequentialMinMaxDeduplicatingVisitor, RetVal>;
+
+ ScalarEvolution &SE;
+ const SCEVTypes RootKind; // Must be a sequential min/max expression.
+ const SCEVTypes NonSequentialRootKind; // Non-sequential variant of RootKind.
+ SmallPtrSet<const SCEV *, 16> SeenOps;
+
+ bool canRecurseInto(SCEVTypes Kind) const {
+ // We can only recurse into the SCEV expression of the same effective type
+ // as the type of our root SCEV expression.
+ return RootKind == Kind || NonSequentialRootKind == Kind;
+ };
+
+ RetVal visitAnyMinMaxExpr(const SCEV *S) {
+ assert((isa<SCEVMinMaxExpr>(S) || isa<SCEVSequentialMinMaxExpr>(S)) &&
+ "Only for min/max expressions.");
+ SCEVTypes Kind = S->getSCEVType();
+
+ if (!canRecurseInto(Kind))
+ return S;
+
+ auto *NAry = cast<SCEVNAryExpr>(S);
+ SmallVector<const SCEV *> NewOps;
+ bool Changed =
+ visit(Kind, makeArrayRef(NAry->op_begin(), NAry->op_end()), NewOps);
+
+ if (!Changed)
+ return S;
+ if (NewOps.empty())
+ return None;
+
+ return isa<SCEVSequentialMinMaxExpr>(S)
+ ? SE.getSequentialMinMaxExpr(Kind, NewOps)
+ : SE.getMinMaxExpr(Kind, NewOps);
+ }
+
+ RetVal visit(const SCEV *S) {
+ // Has the whole operand been seen already?
+ if (!SeenOps.insert(S).second)
+ return None;
+ return Base::visit(S);
+ }
+
+public:
+ SCEVSequentialMinMaxDeduplicatingVisitor(ScalarEvolution &SE,
+ SCEVTypes RootKind)
+ : SE(SE), RootKind(RootKind),
+ NonSequentialRootKind(
+ SCEVSequentialMinMaxExpr::getEquivalentNonSequentialSCEVType(
+ RootKind)) {}
+
+ bool /*Changed*/ visit(SCEVTypes Kind, ArrayRef<const SCEV *> OrigOps,
+ SmallVectorImpl<const SCEV *> &NewOps) {
+ bool Changed = false;
+ SmallVector<const SCEV *> Ops;
+ Ops.reserve(OrigOps.size());
+
+ for (const SCEV *Op : OrigOps) {
+ RetVal NewOp = visit(Op);
+ if (NewOp != Op)
+ Changed = true;
+ if (NewOp)
+ Ops.emplace_back(*NewOp);
+ }
+
+ if (Changed)
+ NewOps = std::move(Ops);
+ return Changed;
+ }
+
+ RetVal visitConstant(const SCEVConstant *Constant) { return Constant; }
+
+ RetVal visitPtrToIntExpr(const SCEVPtrToIntExpr *Expr) { return Expr; }
+
+ RetVal visitTruncateExpr(const SCEVTruncateExpr *Expr) { return Expr; }
+
+ RetVal visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) { return Expr; }
+
+ RetVal visitSignExtendExpr(const SCEVSignExtendExpr *Expr) { return Expr; }
+
+ RetVal visitAddExpr(const SCEVAddExpr *Expr) { return Expr; }
+
+ RetVal visitMulExpr(const SCEVMulExpr *Expr) { return Expr; }
+
+ RetVal visitUDivExpr(const SCEVUDivExpr *Expr) { return Expr; }
+
+ RetVal visitAddRecExpr(const SCEVAddRecExpr *Expr) { return Expr; }
+
+ RetVal visitSMaxExpr(const SCEVSMaxExpr *Expr) {
+ return visitAnyMinMaxExpr(Expr);
+ }
+
+ RetVal visitUMaxExpr(const SCEVUMaxExpr *Expr) {
+ return visitAnyMinMaxExpr(Expr);
+ }
+
+ RetVal visitSMinExpr(const SCEVSMinExpr *Expr) {
+ return visitAnyMinMaxExpr(Expr);
+ }
+
+ RetVal visitUMinExpr(const SCEVUMinExpr *Expr) {
+ return visitAnyMinMaxExpr(Expr);
+ }
+
+ RetVal visitSequentialUMinExpr(const SCEVSequentialUMinExpr *Expr) {
+ return visitAnyMinMaxExpr(Expr);
+ }
+
+ RetVal visitUnknown(const SCEVUnknown *Expr) { return Expr; }
+
+ RetVal visitCouldNotCompute(const SCEVCouldNotCompute *Expr) { return Expr; }
+};
+
+} // namespace
+
+const SCEV *
+ScalarEvolution::getSequentialMinMaxExpr(SCEVTypes Kind,
+ SmallVectorImpl<const SCEV *> &Ops) {
+ assert(SCEVSequentialMinMaxExpr::isSequentialMinMaxType(Kind) &&
+ "Not a SCEVSequentialMinMaxExpr!");
+ assert(!Ops.empty() && "Cannot get empty (u|s)(min|max)!");
+ if (Ops.size() == 1)
+ return Ops[0];
+ if (Ops.size() == 2 &&
+ any_of(Ops, [](const SCEV *Op) { return isa<SCEVConstant>(Op); }))
+ return getMinMaxExpr(
+ SCEVSequentialMinMaxExpr::getEquivalentNonSequentialSCEVType(Kind),
+ Ops);
+#ifndef NDEBUG
+ Type *ETy = getEffectiveSCEVType(Ops[0]->getType());
+ for (unsigned i = 1, e = Ops.size(); i != e; ++i) {
+ assert(getEffectiveSCEVType(Ops[i]->getType()) == ETy &&
+ "Operand types don't match!");
+ assert(Ops[0]->getType()->isPointerTy() ==
+ Ops[i]->getType()->isPointerTy() &&
+ "min/max should be consistently pointerish");
+ }
+#endif
+
+ // Note that SCEVSequentialMinMaxExpr is *NOT* commutative,
+ // so we can *NOT* do any kind of sorting of the expressions!
+
+ // Check if we have created the same expression before.
+ if (const SCEV *S = findExistingSCEVInCache(Kind, Ops))
+ return S;
+
+ // FIXME: there are *some* simplifications that we can do here.
+
+ // Keep only the first instance of an operand.
+ {
+ SCEVSequentialMinMaxDeduplicatingVisitor Deduplicator(*this, Kind);
+ bool Changed = Deduplicator.visit(Kind, Ops, Ops);
+ if (Changed)
+ return getSequentialMinMaxExpr(Kind, Ops);
+ }
+
+ // Check to see if one of the operands is of the same kind. If so, expand its
+ // operands onto our operand list, and recurse to simplify.
+ {
+ unsigned Idx = 0;
+ bool DeletedAny = false;
+ while (Idx < Ops.size()) {
+ if (Ops[Idx]->getSCEVType() != Kind) {
+ ++Idx;
+ continue;
+ }
+ const auto *SMME = cast<SCEVSequentialMinMaxExpr>(Ops[Idx]);
+ Ops.erase(Ops.begin() + Idx);
+ Ops.insert(Ops.begin() + Idx, SMME->op_begin(), SMME->op_end());
+ DeletedAny = true;
+ }
+
+ if (DeletedAny)
+ return getSequentialMinMaxExpr(Kind, Ops);
+ }
+
+ // Okay, it looks like we really DO need an expr. Check to see if we
+ // already have one, otherwise create a new one.
+ FoldingSetNodeID ID;
+ ID.AddInteger(Kind);
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ ID.AddPointer(Ops[i]);
+ void *IP = nullptr;
+ const SCEV *ExistingSCEV = UniqueSCEVs.FindNodeOrInsertPos(ID, IP);
+ if (ExistingSCEV)
+ return ExistingSCEV;
+
+ const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
+ std::uninitialized_copy(Ops.begin(), Ops.end(), O);
+ SCEV *S = new (SCEVAllocator)
+ SCEVSequentialMinMaxExpr(ID.Intern(SCEVAllocator), Kind, O, Ops.size());
+
+ UniqueSCEVs.InsertNode(S, IP);
+ registerUser(S, Ops);
+ return S;
+}
+
const SCEV *ScalarEvolution::getSMaxExpr(const SCEV *LHS, const SCEV *RHS) {
SmallVector<const SCEV *, 2> Ops = {LHS, RHS};
return getSMaxExpr(Ops);
@@ -3885,14 +4112,16 @@ const SCEV *ScalarEvolution::getSMinExpr(SmallVectorImpl<const SCEV *> &Ops) {
return getMinMaxExpr(scSMinExpr, Ops);
}
-const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS,
- const SCEV *RHS) {
+const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS, const SCEV *RHS,
+ bool Sequential) {
SmallVector<const SCEV *, 2> Ops = { LHS, RHS };
- return getUMinExpr(Ops);
+ return getUMinExpr(Ops, Sequential);
}
-const SCEV *ScalarEvolution::getUMinExpr(SmallVectorImpl<const SCEV *> &Ops) {
- return getMinMaxExpr(scUMinExpr, Ops);
+const SCEV *ScalarEvolution::getUMinExpr(SmallVectorImpl<const SCEV *> &Ops,
+ bool Sequential) {
+ return Sequential ? getSequentialMinMaxExpr(scSequentialUMinExpr, Ops)
+ : getMinMaxExpr(scUMinExpr, Ops);
}
const SCEV *
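Usage note for the widened getUMinExpr overloads above: the extra boolean selects between the familiar commutative umin and the new sequential (umin_seq) form, which preserves operand order. A minimal call-site sketch, assuming SE is a ScalarEvolution reference and A, B are same-typed SCEVs:

const SCEV *Plain = SE.getUMinExpr(A, B);                      // commutative umin, as before
const SCEV *Seq   = SE.getUMinExpr(A, B, /*Sequential=*/true); // sequential umin_seq form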
@@ -4375,13 +4604,15 @@ const SCEV *ScalarEvolution::getUMaxFromMismatchedTypes(const SCEV *LHS,
}
const SCEV *ScalarEvolution::getUMinFromMismatchedTypes(const SCEV *LHS,
- const SCEV *RHS) {
+ const SCEV *RHS,
+ bool Sequential) {
SmallVector<const SCEV *, 2> Ops = { LHS, RHS };
- return getUMinFromMismatchedTypes(Ops);
+ return getUMinFromMismatchedTypes(Ops, Sequential);
}
-const SCEV *ScalarEvolution::getUMinFromMismatchedTypes(
- SmallVectorImpl<const SCEV *> &Ops) {
+const SCEV *
+ScalarEvolution::getUMinFromMismatchedTypes(SmallVectorImpl<const SCEV *> &Ops,
+ bool Sequential) {
assert(!Ops.empty() && "At least one operand must be!");
// Trivial case.
if (Ops.size() == 1)
@@ -4402,7 +4633,7 @@ const SCEV *ScalarEvolution::getUMinFromMismatchedTypes(
PromotedOps.push_back(getNoopOrZeroExtend(S, MaxType));
// Generate umin.
- return getUMinExpr(PromotedOps);
+ return getUMinExpr(PromotedOps, Sequential);
}
const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) {
@@ -5513,6 +5744,7 @@ static bool IsAvailableOnEntry(const Loop *L, DominatorTree &DT, const SCEV *S,
case scSMaxExpr:
case scUMinExpr:
case scSMinExpr:
+ case scSequentialUMinExpr:
// These expressions are available if their operand(s) is/are.
return true;
@@ -6060,35 +6292,31 @@ ScalarEvolution::getRangeRef(const SCEV *S,
ConservativeResult.intersectWith(X, RangeType));
}
- if (const SCEVSMaxExpr *SMax = dyn_cast<SCEVSMaxExpr>(S)) {
- ConstantRange X = getRangeRef(SMax->getOperand(0), SignHint);
- for (unsigned i = 1, e = SMax->getNumOperands(); i != e; ++i)
- X = X.smax(getRangeRef(SMax->getOperand(i), SignHint));
- return setRange(SMax, SignHint,
- ConservativeResult.intersectWith(X, RangeType));
- }
-
- if (const SCEVUMaxExpr *UMax = dyn_cast<SCEVUMaxExpr>(S)) {
- ConstantRange X = getRangeRef(UMax->getOperand(0), SignHint);
- for (unsigned i = 1, e = UMax->getNumOperands(); i != e; ++i)
- X = X.umax(getRangeRef(UMax->getOperand(i), SignHint));
- return setRange(UMax, SignHint,
- ConservativeResult.intersectWith(X, RangeType));
- }
-
- if (const SCEVSMinExpr *SMin = dyn_cast<SCEVSMinExpr>(S)) {
- ConstantRange X = getRangeRef(SMin->getOperand(0), SignHint);
- for (unsigned i = 1, e = SMin->getNumOperands(); i != e; ++i)
- X = X.smin(getRangeRef(SMin->getOperand(i), SignHint));
- return setRange(SMin, SignHint,
- ConservativeResult.intersectWith(X, RangeType));
- }
+ if (isa<SCEVMinMaxExpr>(S) || isa<SCEVSequentialMinMaxExpr>(S)) {
+ Intrinsic::ID ID;
+ switch (S->getSCEVType()) {
+ case scUMaxExpr:
+ ID = Intrinsic::umax;
+ break;
+ case scSMaxExpr:
+ ID = Intrinsic::smax;
+ break;
+ case scUMinExpr:
+ case scSequentialUMinExpr:
+ ID = Intrinsic::umin;
+ break;
+ case scSMinExpr:
+ ID = Intrinsic::smin;
+ break;
+ default:
+ llvm_unreachable("Unknown SCEVMinMaxExpr/SCEVSequentialMinMaxExpr.");
+ }
- if (const SCEVUMinExpr *UMin = dyn_cast<SCEVUMinExpr>(S)) {
- ConstantRange X = getRangeRef(UMin->getOperand(0), SignHint);
- for (unsigned i = 1, e = UMin->getNumOperands(); i != e; ++i)
- X = X.umin(getRangeRef(UMin->getOperand(i), SignHint));
- return setRange(UMin, SignHint,
+ const auto *NAry = cast<SCEVNAryExpr>(S);
+ ConstantRange X = getRangeRef(NAry->getOperand(0), SignHint);
+ for (unsigned i = 1, e = NAry->getNumOperands(); i != e; ++i)
+ X = X.intrinsic(ID, {X, getRangeRef(NAry->getOperand(i), SignHint)});
+ return setRange(S, SignHint,
ConservativeResult.intersectWith(X, RangeType));
}
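The four per-kind range loops collapse into one that folds operand ranges through ConstantRange's intrinsic-based folding, keyed by the matching min/max intrinsic ID. A small standalone sketch of the underlying fold, assuming ConstantRange::intrinsic supports the min/max intrinsics as the unified loop above relies on:

#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Fold two operand ranges the same way the unified loop does, one operand at a time.
ConstantRange foldUMinRange(const ConstantRange &A, const ConstantRange &B) {
  return ConstantRange::intrinsic(Intrinsic::umin, {A, B});
}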
@@ -7368,11 +7596,6 @@ const SCEV *ScalarEvolution::getConstantMaxTripCountFromArray(const Loop *L) {
auto *ArrSize = dyn_cast<ConstantInt>(AllocateInst->getArraySize());
if (!Ty || !ArrSize || !ArrSize->isOne())
continue;
- // Also make sure step was increased the same with sizeof allocated
- // element type.
- const PointerType *GEPT = dyn_cast<PointerType>(GEP->getType());
- if (Ty->getElementType() != GEPT->getElementType())
- continue;
// FIXME: Since gep indices are silently zext to the indexing type,
// we will have a narrow gep index which wraps around rather than
@@ -8093,6 +8316,29 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeExitLimitFromCondImpl(
return getZero(CI->getType());
}
+ // If we're exiting based on the overflow flag of an x.with.overflow intrinsic
+ // with a constant step, we can form an equivalent icmp predicate and figure
+ // out how many iterations will be taken before we exit.
+ const WithOverflowInst *WO;
+ const APInt *C;
+ if (match(ExitCond, m_ExtractValue<1>(m_WithOverflowInst(WO))) &&
+ match(WO->getRHS(), m_APInt(C))) {
+ ConstantRange NWR =
+ ConstantRange::makeExactNoWrapRegion(WO->getBinaryOp(), *C,
+ WO->getNoWrapKind());
+ CmpInst::Predicate Pred;
+ APInt NewRHSC, Offset;
+ NWR.getEquivalentICmp(Pred, NewRHSC, Offset);
+ if (!ExitIfTrue)
+ Pred = ICmpInst::getInversePredicate(Pred);
+ auto *LHS = getSCEV(WO->getLHS());
+ if (Offset != 0)
+ LHS = getAddExpr(LHS, getConstant(Offset));
+ auto EL = computeExitLimitFromICmp(L, Pred, LHS, getConstant(NewRHSC),
+ ControlsExit, AllowPredicates);
+ if (EL.hasAnyInfo()) return EL;
+ }
+
// If it's not an integer or pointer comparison then compute it the hard way.
return computeExitCountExhaustively(L, ExitCond, ExitIfTrue);
}
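A worked instance of the no-wrap-region trick above, with hypothetical values: consider exiting on the overflow bit of llvm.uadd.with.overflow on i8 with a constant operand of 7.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Operator.h"
using namespace llvm;

ConstantRange uaddOvNoWrapRegion() {
  // i8 values %i for which "%i + 7" does not wrap unsigned: [0, 249).
  return ConstantRange::makeExactNoWrapRegion(
      Instruction::Add, APInt(8, 7), OverflowingBinaryOperator::NoUnsignedWrap);
}
// The overflow flag is therefore equivalent to "icmp ugt i8 %i, 248"; that is the
// predicate the getEquivalentICmp() call above recovers (modulo the Offset handling),
// and the exit limit is then computed from that icmp as usual.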
@@ -8134,26 +8380,11 @@ ScalarEvolution::computeExitLimitFromCondFromBinOp(
if (EitherMayExit) {
// Both conditions must be the same for the loop to continue executing.
// Choose the less conservative count.
- // If ExitCond is a short-circuit form (select), using
- // umin(EL0.ExactNotTaken, EL1.ExactNotTaken) is unsafe in general.
- // To see the detailed examples, please see
- // test/Analysis/ScalarEvolution/exit-count-select.ll
- bool PoisonSafe = isa<BinaryOperator>(ExitCond);
- if (!PoisonSafe)
- // Even if ExitCond is select, we can safely derive BECount using both
- // EL0 and EL1 in these cases:
- // (1) EL0.ExactNotTaken is non-zero
- // (2) EL1.ExactNotTaken is non-poison
- // (3) EL0.ExactNotTaken is zero (BECount should be simply zero and
- // it cannot be umin(0, ..))
- // The PoisonSafe assignment below is simplified and the assertion after
- // BECount calculation fully guarantees the condition (3).
- PoisonSafe = isa<SCEVConstant>(EL0.ExactNotTaken) ||
- isa<SCEVConstant>(EL1.ExactNotTaken);
if (EL0.ExactNotTaken != getCouldNotCompute() &&
- EL1.ExactNotTaken != getCouldNotCompute() && PoisonSafe) {
- BECount =
- getUMinFromMismatchedTypes(EL0.ExactNotTaken, EL1.ExactNotTaken);
+ EL1.ExactNotTaken != getCouldNotCompute()) {
+ BECount = getUMinFromMismatchedTypes(
+ EL0.ExactNotTaken, EL1.ExactNotTaken,
+ /*Sequential=*/!isa<BinaryOperator>(ExitCond));
// If EL0.ExactNotTaken was zero and ExitCond was a short-circuit form,
// it should have been simplified to zero (see the condition (3) above)
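Why this is now safe without the PoisonSafe bookkeeping: for a short-circuit exit such as "select i1 %c0, i1 %c1, i1 false", the combined not-taken count is taken as a sequential umin, whose semantics are roughly

umin_seq(EL0, EL1)  ==  (EL0 == 0) ? 0 : umin(EL0, EL1)

so when the first count is already zero, the second operand is never observed and its potential poison cannot leak into the backedge-taken count. A genuine "and i1" exit condition (a BinaryOperator) still takes the plain commutative umin, which is what the Sequential flag above encodes.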
@@ -8203,6 +8434,26 @@ ScalarEvolution::computeExitLimitFromICmp(const Loop *L,
const SCEV *LHS = getSCEV(ExitCond->getOperand(0));
const SCEV *RHS = getSCEV(ExitCond->getOperand(1));
+ ExitLimit EL = computeExitLimitFromICmp(L, Pred, LHS, RHS, ControlsExit,
+ AllowPredicates);
+ if (EL.hasAnyInfo()) return EL;
+
+ auto *ExhaustiveCount =
+ computeExitCountExhaustively(L, ExitCond, ExitIfTrue);
+
+ if (!isa<SCEVCouldNotCompute>(ExhaustiveCount))
+ return ExhaustiveCount;
+
+ return computeShiftCompareExitLimit(ExitCond->getOperand(0),
+ ExitCond->getOperand(1), L, OriginalPred);
+}
+ScalarEvolution::ExitLimit
+ScalarEvolution::computeExitLimitFromICmp(const Loop *L,
+ ICmpInst::Predicate Pred,
+ const SCEV *LHS, const SCEV *RHS,
+ bool ControlsExit,
+ bool AllowPredicates) {
+
// Try to evaluate any dependencies out of the loop.
LHS = getSCEVAtScope(LHS, L);
RHS = getSCEVAtScope(RHS, L);
@@ -8312,14 +8563,7 @@ ScalarEvolution::computeExitLimitFromICmp(const Loop *L,
break;
}
- auto *ExhaustiveCount =
- computeExitCountExhaustively(L, ExitCond, ExitIfTrue);
-
- if (!isa<SCEVCouldNotCompute>(ExhaustiveCount))
- return ExhaustiveCount;
-
- return computeShiftCompareExitLimit(ExitCond->getOperand(0),
- ExitCond->getOperand(1), L, OriginalPred);
+ return getCouldNotCompute();
}
ScalarEvolution::ExitLimit
@@ -8941,7 +9185,8 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
case scUMaxExpr:
case scSMinExpr:
case scUMinExpr:
- return nullptr; // TODO: smax, umax, smin, umax.
+ case scSequentialUMinExpr:
+ return nullptr; // TODO: smax, umax, smin, umin, umin_seq.
}
llvm_unreachable("Unknown SCEV kind!");
}
@@ -9070,7 +9315,8 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
return V;
}
- if (const SCEVCommutativeExpr *Comm = dyn_cast<SCEVCommutativeExpr>(V)) {
+ if (isa<SCEVCommutativeExpr>(V) || isa<SCEVSequentialMinMaxExpr>(V)) {
+ const auto *Comm = cast<SCEVNAryExpr>(V);
// Avoid performing the look-up in the common case where the specified
// expression has no loop-variant portions.
for (unsigned i = 0, e = Comm->getNumOperands(); i != e; ++i) {
@@ -9092,7 +9338,9 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
return getMulExpr(NewOps, Comm->getNoWrapFlags());
if (isa<SCEVMinMaxExpr>(Comm))
return getMinMaxExpr(Comm->getSCEVType(), NewOps);
- llvm_unreachable("Unknown commutative SCEV type!");
+ if (isa<SCEVSequentialMinMaxExpr>(Comm))
+ return getSequentialMinMaxExpr(Comm->getSCEVType(), NewOps);
+ llvm_unreachable("Unknown commutative / sequential min/max SCEV type!");
}
}
// If we got here, all operands are loop invariant.
@@ -9153,32 +9401,11 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
return AddRec;
}
- if (const SCEVZeroExtendExpr *Cast = dyn_cast<SCEVZeroExtendExpr>(V)) {
- const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L);
- if (Op == Cast->getOperand())
- return Cast; // must be loop invariant
- return getZeroExtendExpr(Op, Cast->getType());
- }
-
- if (const SCEVSignExtendExpr *Cast = dyn_cast<SCEVSignExtendExpr>(V)) {
- const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L);
- if (Op == Cast->getOperand())
- return Cast; // must be loop invariant
- return getSignExtendExpr(Op, Cast->getType());
- }
-
- if (const SCEVTruncateExpr *Cast = dyn_cast<SCEVTruncateExpr>(V)) {
+ if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(V)) {
const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L);
if (Op == Cast->getOperand())
return Cast; // must be loop invariant
- return getTruncateExpr(Op, Cast->getType());
- }
-
- if (const SCEVPtrToIntExpr *Cast = dyn_cast<SCEVPtrToIntExpr>(V)) {
- const SCEV *Op = getSCEVAtScope(Cast->getOperand(), L);
- if (Op == Cast->getOperand())
- return Cast; // must be loop invariant
- return getPtrToIntExpr(Op, Cast->getType());
+ return getCastExpr(Cast->getSCEVType(), Op, Cast->getType());
}
llvm_unreachable("Unknown SCEV type!");
@@ -11236,6 +11463,48 @@ bool ScalarEvolution::isImpliedViaMerge(ICmpInst::Predicate Pred,
return true;
}
+bool ScalarEvolution::isImpliedCondOperandsViaShift(ICmpInst::Predicate Pred,
+ const SCEV *LHS,
+ const SCEV *RHS,
+ const SCEV *FoundLHS,
+ const SCEV *FoundRHS) {
+ // We want to imply LHS < RHS from LHS < (shiftee >> shiftvalue) together with
+ // shiftee <=u RHS. First, make sure that we are dealing with the same LHS.
+ if (RHS == FoundRHS) {
+ std::swap(LHS, RHS);
+ std::swap(FoundLHS, FoundRHS);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+ if (LHS != FoundLHS)
+ return false;
+
+ auto *SUFoundRHS = dyn_cast<SCEVUnknown>(FoundRHS);
+ if (!SUFoundRHS)
+ return false;
+
+ Value *Shiftee, *ShiftValue;
+
+ using namespace PatternMatch;
+ if (match(SUFoundRHS->getValue(),
+ m_LShr(m_Value(Shiftee), m_Value(ShiftValue)))) {
+ auto *ShifteeS = getSCEV(Shiftee);
+ // Prove one of the following:
+ // LHS <u (shiftee >> shiftvalue) && shiftee <=u RHS ---> LHS <u RHS
+ // LHS <=u (shiftee >> shiftvalue) && shiftee <=u RHS ---> LHS <=u RHS
+ // LHS <s (shiftee >> shiftvalue) && shiftee <=s RHS && shiftee >=s 0
+ // ---> LHS <s RHS
+ // LHS <=s (shiftee >> shiftvalue) && shiftee <=s RHS && shiftee >=s 0
+ // ---> LHS <=s RHS
+ if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE)
+ return isKnownPredicate(ICmpInst::ICMP_ULE, ShifteeS, RHS);
+ if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
+ if (isKnownNonNegative(ShifteeS))
+ return isKnownPredicate(ICmpInst::ICMP_SLE, ShifteeS, RHS);
+ }
+
+ return false;
+}
+
bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS,
const SCEV *FoundLHS,
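A concrete instance of the first (unsigned) rule in the new helper, with arbitrary numbers: if shiftee = 100 and shiftvalue = 2, then FoundRHS = 100 >> 2 = 25, and from LHS <u 25 together with 100 <=u RHS it follows that LHS <u RHS, because a logical shift right never increases its operand:

LHS <u (shiftee >> shiftvalue) <=u shiftee <=u RHS   ==>   LHS <u RHS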
@@ -11247,6 +11516,9 @@ bool ScalarEvolution::isImpliedCondOperands(ICmpInst::Predicate Pred,
if (isImpliedCondOperandsViaNoOverflow(Pred, LHS, RHS, FoundLHS, FoundRHS))
return true;
+ if (isImpliedCondOperandsViaShift(Pred, LHS, RHS, FoundLHS, FoundRHS))
+ return true;
+
if (isImpliedCondOperandsViaAddRecStart(Pred, LHS, RHS, FoundLHS, FoundRHS,
CtxI))
return true;
@@ -11323,6 +11595,7 @@ static bool IsKnownPredicateViaMinOrMax(ScalarEvolution &SE,
case ICmpInst::ICMP_ULE:
return
// min(A, ...) <= A
+ // FIXME: what about umin_seq?
IsMinMaxConsistingOf<SCEVUMinExpr>(LHS, RHS) ||
// A <= max(A, ...)
IsMinMaxConsistingOf<SCEVUMaxExpr>(RHS, LHS);
@@ -12723,7 +12996,8 @@ ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) {
case scUMaxExpr:
case scSMaxExpr:
case scUMinExpr:
- case scSMinExpr: {
+ case scSMinExpr:
+ case scSequentialUMinExpr: {
bool HasVarying = false;
for (auto *Op : cast<SCEVNAryExpr>(S)->operands()) {
LoopDisposition D = getLoopDisposition(Op, L);
@@ -12813,7 +13087,8 @@ ScalarEvolution::computeBlockDisposition(const SCEV *S, const BasicBlock *BB) {
case scUMaxExpr:
case scSMaxExpr:
case scUMinExpr:
- case scSMinExpr: {
+ case scSMinExpr:
+ case scSequentialUMinExpr: {
const SCEVNAryExpr *NAry = cast<SCEVNAryExpr>(S);
bool Proper = true;
for (const SCEV *NAryOp : NAry->operands()) {
diff --git a/llvm/lib/Analysis/TFUtils.cpp b/llvm/lib/Analysis/TFUtils.cpp
index 3d10479c4544..26bc63983b4e 100644
--- a/llvm/lib/Analysis/TFUtils.cpp
+++ b/llvm/lib/Analysis/TFUtils.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/Utils/TFUtils.h"
+#include "llvm/Support/Base64.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/JSON.h"
@@ -22,6 +23,7 @@
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
+#include "google/protobuf/struct.pb.h"
#include "google/protobuf/text_format.h"
#include "tensorflow/c/c_api.h"
#include "tensorflow/c/c_api_experimental.h"
@@ -72,6 +74,14 @@ TFStatusPtr createTFStatus() {
TFSessionOptionsPtr createTFSessionOptions() {
return TFSessionOptionsPtr(TF_NewSessionOptions(), &TF_DeleteSessionOptions);
}
+
+void serialize(const Message &SE, std::string *OutStr) {
+ if (ProtobufTextMode) {
+ TextFormat::PrintToString(SE, OutStr);
+ } else {
+ *OutStr = SE.SerializeAsString();
+ }
+}
} // namespace
namespace llvm {
@@ -307,19 +317,13 @@ public:
IncludeReward(IncludeReward), FeatureLists(LoggedFeatureSpecs.size()) {}
// flush the logged info to a stream and clear the log contents.
- void flush(raw_ostream &OS) {
+ void flush(std::string *Str) {
size_t NrRecords = getNrRecords();
(void)NrRecords;
tensorflow::SequenceExample SE;
transferLog(SE);
assert(isSelfConsistent(SE, NrRecords));
- std::string OutStr;
- if (ProtobufTextMode)
- google::protobuf::TextFormat::PrintToString(SE, &OutStr);
- else
- OutStr = SE.SerializeAsString();
-
- OS << OutStr;
+ serialize(SE, Str);
}
char *addNewTensor(size_t FeatureID) {
@@ -567,5 +571,31 @@ char *Logger::addEntryAndGetFloatOrInt64Buffer(size_t FeatureID) {
return reinterpret_cast<char *>(LoggerData->addNewTensor(FeatureID));
}
-void Logger::flush(raw_ostream &OS) { LoggerData->flush(OS); }
+void Logger::flush(std::string *Str) { LoggerData->flush(Str); }
+
+void Logger::flush(raw_ostream &OS) {
+ std::string Buff;
+ LoggerData->flush(&Buff);
+ OS << Buff;
+}
+
+void Logger::flushLogs(raw_ostream &OS,
+ const StringMap<std::unique_ptr<Logger>> &Loggers) {
+ google::protobuf::Struct Msg;
+ for (const auto &NamedLogger : Loggers) {
+ tensorflow::SequenceExample SE;
+ const auto &Logger = NamedLogger.second;
+ std::string Unencoded;
+ if (Logger->LoggerData->getNrRecords() > 0)
+ Logger->flush(&Unencoded);
+
+ (*Msg.mutable_fields())[NamedLogger.first().str()]
+ .mutable_string_value()
+ ->append(ProtobufTextMode ? Unencoded : encodeBase64(Unencoded));
+ }
+
+ std::string OutStr;
+ serialize(Msg, &OutStr);
+ OS << OutStr;
+}
#endif // defined(LLVM_HAVE_TF_API)
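A minimal usage sketch of the new bundled flush, assuming flushLogs is a static member as the definition suggests and that the build defines LLVM_HAVE_TF_API; the map key becomes the field name in the emitted protobuf Struct:

#include "llvm/ADT/StringMap.h"
#include "llvm/Analysis/Utils/TFUtils.h"
#include "llvm/Support/raw_ostream.h"
#include <memory>
using namespace llvm;

void dumpAllTrainingLogs(raw_ostream &OS,
                         const StringMap<std::unique_ptr<Logger>> &Loggers) {
  // Each per-context log is serialized to a SequenceExample and stored under its
  // name; values are base64-encoded unless the text-mode option (ProtobufTextMode
  // above) is in effect.
  Logger::flushLogs(OS, Loggers);
}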
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 6aa9a77391dc..25e9dee98e13 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -408,6 +408,16 @@ bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType,
return TTIImpl->isLegalMaskedScatter(DataType, Alignment);
}
+bool TargetTransformInfo::forceScalarizeMaskedGather(VectorType *DataType,
+ Align Alignment) const {
+ return TTIImpl->forceScalarizeMaskedGather(DataType, Alignment);
+}
+
+bool TargetTransformInfo::forceScalarizeMaskedScatter(VectorType *DataType,
+ Align Alignment) const {
+ return TTIImpl->forceScalarizeMaskedScatter(DataType, Alignment);
+}
+
bool TargetTransformInfo::isLegalMaskedCompressStore(Type *DataType) const {
return TTIImpl->isLegalMaskedCompressStore(DataType);
}
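A sketch of how a client might combine the new hooks with the existing legality queries; shouldEmitMaskedGather is a hypothetical helper, not an API added by this patch:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Emit a masked gather only if the target declares it legal and does not ask for
// it to be scalarized at this type/alignment.
static bool shouldEmitMaskedGather(const TargetTransformInfo &TTI,
                                   VectorType *DataTy, Align Alignment) {
  return TTI.isLegalMaskedGather(DataTy, Alignment) &&
         !TTI.forceScalarizeMaskedGather(DataTy, Alignment);
}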
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index fc378f97de0b..34358739f9a8 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -396,10 +396,10 @@ unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL,
V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo));
}
-unsigned llvm::ComputeMinSignedBits(const Value *V, const DataLayout &DL,
- unsigned Depth, AssumptionCache *AC,
- const Instruction *CxtI,
- const DominatorTree *DT) {
+unsigned llvm::ComputeMaxSignificantBits(const Value *V, const DataLayout &DL,
+ unsigned Depth, AssumptionCache *AC,
+ const Instruction *CxtI,
+ const DominatorTree *DT) {
unsigned SignBits = ComputeNumSignBits(V, DL, Depth, AC, CxtI, DT);
return V->getType()->getScalarSizeInBits() - SignBits + 1;
}
@@ -1593,7 +1593,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// If we have a known 1, its position is our upper bound.
unsigned PossibleLZ = Known2.countMaxLeadingZeros();
- // If this call is undefined for 0, the result will be less than 2^n.
+ // If this call is poison for 0 input, the result will be less than 2^n.
if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
PossibleLZ = std::min(PossibleLZ, BitWidth - 1);
unsigned LowBits = Log2_32(PossibleLZ)+1;
@@ -1604,7 +1604,7 @@ static void computeKnownBitsFromOperator(const Operator *I,
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
// If we have a known 1, its position is our upper bound.
unsigned PossibleTZ = Known2.countMaxTrailingZeros();
- // If this call is undefined for 0, the result will be less than 2^n.
+ // If this call is poison for 0 input, the result will be less than 2^n.
if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
PossibleTZ = std::min(PossibleTZ, BitWidth - 1);
unsigned LowBits = Log2_32(PossibleTZ)+1;
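Worked instance of the tightened bound, for a hypothetical llvm.ctlz.i32(%x, i1 true): a zero input is poison, so the result is at most 31, and the computation above derives

result <= 31   =>   LowBits = Log2_32(31) + 1 = 5   =>   bits [5, 32) of the result are known zero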
@@ -3248,125 +3248,6 @@ static unsigned ComputeNumSignBitsImpl(const Value *V,
return std::max(FirstAnswer, Known.countMinSignBits());
}
-/// This function computes the integer multiple of Base that equals V.
-/// If successful, it returns true and returns the multiple in
-/// Multiple. If unsuccessful, it returns false. It looks
-/// through SExt instructions only if LookThroughSExt is true.
-bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple,
- bool LookThroughSExt, unsigned Depth) {
- assert(V && "No Value?");
- assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
- assert(V->getType()->isIntegerTy() && "Not integer or pointer type!");
-
- Type *T = V->getType();
-
- ConstantInt *CI = dyn_cast<ConstantInt>(V);
-
- if (Base == 0)
- return false;
-
- if (Base == 1) {
- Multiple = V;
- return true;
- }
-
- ConstantExpr *CO = dyn_cast<ConstantExpr>(V);
- Constant *BaseVal = ConstantInt::get(T, Base);
- if (CO && CO == BaseVal) {
- // Multiple is 1.
- Multiple = ConstantInt::get(T, 1);
- return true;
- }
-
- if (CI && CI->getZExtValue() % Base == 0) {
- Multiple = ConstantInt::get(T, CI->getZExtValue() / Base);
- return true;
- }
-
- if (Depth == MaxAnalysisRecursionDepth) return false;
-
- Operator *I = dyn_cast<Operator>(V);
- if (!I) return false;
-
- switch (I->getOpcode()) {
- default: break;
- case Instruction::SExt:
- if (!LookThroughSExt) return false;
- // otherwise fall through to ZExt
- LLVM_FALLTHROUGH;
- case Instruction::ZExt:
- return ComputeMultiple(I->getOperand(0), Base, Multiple,
- LookThroughSExt, Depth+1);
- case Instruction::Shl:
- case Instruction::Mul: {
- Value *Op0 = I->getOperand(0);
- Value *Op1 = I->getOperand(1);
-
- if (I->getOpcode() == Instruction::Shl) {
- ConstantInt *Op1CI = dyn_cast<ConstantInt>(Op1);
- if (!Op1CI) return false;
- // Turn Op0 << Op1 into Op0 * 2^Op1
- APInt Op1Int = Op1CI->getValue();
- uint64_t BitToSet = Op1Int.getLimitedValue(Op1Int.getBitWidth() - 1);
- APInt API(Op1Int.getBitWidth(), 0);
- API.setBit(BitToSet);
- Op1 = ConstantInt::get(V->getContext(), API);
- }
-
- Value *Mul0 = nullptr;
- if (ComputeMultiple(Op0, Base, Mul0, LookThroughSExt, Depth+1)) {
- if (Constant *Op1C = dyn_cast<Constant>(Op1))
- if (Constant *MulC = dyn_cast<Constant>(Mul0)) {
- if (Op1C->getType()->getPrimitiveSizeInBits().getFixedSize() <
- MulC->getType()->getPrimitiveSizeInBits().getFixedSize())
- Op1C = ConstantExpr::getZExt(Op1C, MulC->getType());
- if (Op1C->getType()->getPrimitiveSizeInBits().getFixedSize() >
- MulC->getType()->getPrimitiveSizeInBits().getFixedSize())
- MulC = ConstantExpr::getZExt(MulC, Op1C->getType());
-
- // V == Base * (Mul0 * Op1), so return (Mul0 * Op1)
- Multiple = ConstantExpr::getMul(MulC, Op1C);
- return true;
- }
-
- if (ConstantInt *Mul0CI = dyn_cast<ConstantInt>(Mul0))
- if (Mul0CI->getValue() == 1) {
- // V == Base * Op1, so return Op1
- Multiple = Op1;
- return true;
- }
- }
-
- Value *Mul1 = nullptr;
- if (ComputeMultiple(Op1, Base, Mul1, LookThroughSExt, Depth+1)) {
- if (Constant *Op0C = dyn_cast<Constant>(Op0))
- if (Constant *MulC = dyn_cast<Constant>(Mul1)) {
- if (Op0C->getType()->getPrimitiveSizeInBits().getFixedSize() <
- MulC->getType()->getPrimitiveSizeInBits().getFixedSize())
- Op0C = ConstantExpr::getZExt(Op0C, MulC->getType());
- if (Op0C->getType()->getPrimitiveSizeInBits().getFixedSize() >
- MulC->getType()->getPrimitiveSizeInBits().getFixedSize())
- MulC = ConstantExpr::getZExt(MulC, Op0C->getType());
-
- // V == Base * (Mul1 * Op0), so return (Mul1 * Op0)
- Multiple = ConstantExpr::getMul(MulC, Op0C);
- return true;
- }
-
- if (ConstantInt *Mul1CI = dyn_cast<ConstantInt>(Mul1))
- if (Mul1CI->getValue() == 1) {
- // V == Base * Op0, so return Op0
- Multiple = Op0;
- return true;
- }
- }
- }
- }
-
- // We could not determine if V is a multiple of Base.
- return false;
-}
-
Intrinsic::ID llvm::getIntrinsicForCallSite(const CallBase &CB,
const TargetLibraryInfo *TLI) {
const Function *F = CB.getCalledFunction();
@@ -6756,17 +6637,27 @@ Optional<bool> llvm::isImpliedByDomCondition(CmpInst::Predicate Pred,
}
static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower,
- APInt &Upper, const InstrInfoQuery &IIQ) {
+ APInt &Upper, const InstrInfoQuery &IIQ,
+ bool PreferSignedRange) {
unsigned Width = Lower.getBitWidth();
const APInt *C;
switch (BO.getOpcode()) {
case Instruction::Add:
if (match(BO.getOperand(1), m_APInt(C)) && !C->isZero()) {
- // FIXME: If we have both nuw and nsw, we should reduce the range further.
- if (IIQ.hasNoUnsignedWrap(cast<OverflowingBinaryOperator>(&BO))) {
+ bool HasNSW = IIQ.hasNoSignedWrap(&BO);
+ bool HasNUW = IIQ.hasNoUnsignedWrap(&BO);
+
+ // If the caller expects a signed compare, then try to use a signed range.
+ // Otherwise if both no-wraps are set, use the unsigned range because it
+ // is never larger than the signed range. Example:
+ // "add nuw nsw i8 X, -2" is unsigned [254,255] vs. signed [-128, 125].
+ if (PreferSignedRange && HasNSW && HasNUW)
+ HasNUW = false;
+
+ if (HasNUW) {
// 'add nuw x, C' produces [C, UINT_MAX].
Lower = *C;
- } else if (IIQ.hasNoSignedWrap(cast<OverflowingBinaryOperator>(&BO))) {
+ } else if (HasNSW) {
if (C->isNegative()) {
// 'add nsw x, -C' produces [SINT_MIN, SINT_MAX - C].
Lower = APInt::getSignedMinValue(Width);
@@ -7083,8 +6974,8 @@ static void setLimitForFPToI(const Instruction *I, APInt &Lower, APInt &Upper) {
}
}
-ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo,
- AssumptionCache *AC,
+ConstantRange llvm::computeConstantRange(const Value *V, bool ForSigned,
+ bool UseInstrInfo, AssumptionCache *AC,
const Instruction *CtxI,
const DominatorTree *DT,
unsigned Depth) {
@@ -7102,7 +6993,7 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo,
APInt Lower = APInt(BitWidth, 0);
APInt Upper = APInt(BitWidth, 0);
if (auto *BO = dyn_cast<BinaryOperator>(V))
- setLimitsForBinOp(*BO, Lower, Upper, IIQ);
+ setLimitsForBinOp(*BO, Lower, Upper, IIQ, ForSigned);
else if (auto *II = dyn_cast<IntrinsicInst>(V))
setLimitsForIntrinsic(*II, Lower, Upper);
else if (auto *SI = dyn_cast<SelectInst>(V))
@@ -7134,8 +7025,10 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo,
// Currently we just use information from comparisons.
if (!Cmp || Cmp->getOperand(0) != V)
continue;
- ConstantRange RHS = computeConstantRange(Cmp->getOperand(1), UseInstrInfo,
- AC, I, DT, Depth + 1);
+ // TODO: Set "ForSigned" parameter via Cmp->isSigned()?
+ ConstantRange RHS =
+ computeConstantRange(Cmp->getOperand(1), /* ForSigned */ false,
+ UseInstrInfo, AC, I, DT, Depth + 1);
CR = CR.intersectWith(
ConstantRange::makeAllowedICmpRegion(Cmp->getPredicate(), RHS));
}
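Call-site note for the new computeConstantRange signature: the ForSigned flag now sits immediately after the value, before UseInstrInfo. A minimal sketch, assuming the trailing analysis arguments keep their defaults in the header:

// V is the llvm::Value being ranged; prefer the signed range when the result will
// feed a signed comparison.
ConstantRange CR =
    computeConstantRange(V, /*ForSigned=*/true, /*UseInstrInfo=*/true);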
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 35c615522fe2..432ec151cf8a 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -133,14 +133,17 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
for (const auto &RAG : ForwardRefAttrGroups) {
Value *V = RAG.first;
const std::vector<unsigned> &Attrs = RAG.second;
- AttrBuilder B;
+ AttrBuilder B(Context);
- for (const auto &Attr : Attrs)
- B.merge(NumberedAttrBuilders[Attr]);
+ for (const auto &Attr : Attrs) {
+ auto R = NumberedAttrBuilders.find(Attr);
+ if (R != NumberedAttrBuilders.end())
+ B.merge(R->second);
+ }
if (Function *Fn = dyn_cast<Function>(V)) {
AttributeList AS = Fn->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttrs());
+ AttrBuilder FnAttrs(M->getContext(), AS.getFnAttrs());
AS = AS.removeFnAttributes(Context);
FnAttrs.merge(B);
@@ -156,27 +159,27 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
Fn->setAttributes(AS);
} else if (CallInst *CI = dyn_cast<CallInst>(V)) {
AttributeList AS = CI->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttrs());
+ AttrBuilder FnAttrs(M->getContext(), AS.getFnAttrs());
AS = AS.removeFnAttributes(Context);
FnAttrs.merge(B);
AS = AS.addFnAttributes(Context, FnAttrs);
CI->setAttributes(AS);
} else if (InvokeInst *II = dyn_cast<InvokeInst>(V)) {
AttributeList AS = II->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttrs());
+ AttrBuilder FnAttrs(M->getContext(), AS.getFnAttrs());
AS = AS.removeFnAttributes(Context);
FnAttrs.merge(B);
AS = AS.addFnAttributes(Context, FnAttrs);
II->setAttributes(AS);
} else if (CallBrInst *CBI = dyn_cast<CallBrInst>(V)) {
AttributeList AS = CBI->getAttributes();
- AttrBuilder FnAttrs(AS.getFnAttrs());
+ AttrBuilder FnAttrs(M->getContext(), AS.getFnAttrs());
AS = AS.removeFnAttributes(Context);
FnAttrs.merge(B);
AS = AS.addFnAttributes(Context, FnAttrs);
CBI->setAttributes(AS);
} else if (auto *GV = dyn_cast<GlobalVariable>(V)) {
- AttrBuilder Attrs(GV->getAttributes());
+ AttrBuilder Attrs(M->getContext(), GV->getAttributes());
Attrs.merge(B);
GV->setAttributes(AttributeSet::get(Context,Attrs));
} else {
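The repeated constructor changes in this file all stem from AttrBuilder losing its default constructor; every builder is now tied to an LLVMContext up front. A minimal sketch of the new pattern outside the parser (addFnAttrs is assumed to be the current Function API for merging a builder):

#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void markNoUnwind(Function &F) {
  AttrBuilder B(F.getContext());      // context argument is now mandatory
  B.addAttribute(Attribute::NoUnwind);
  F.addFnAttrs(B);                    // assumed API; merges the builder's attributes
}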
@@ -982,17 +985,18 @@ bool LLParser::parseAliasOrIFunc(const std::string &Name, LocTy NameLoc,
return error(AliaseeLoc, "An alias or ifunc must have pointer type");
unsigned AddrSpace = PTy->getAddressSpace();
- if (IsAlias && !PTy->isOpaqueOrPointeeTypeMatches(Ty)) {
- return error(
- ExplicitTypeLoc,
- typeComparisonErrorMessage(
- "explicit pointee type doesn't match operand's pointee type", Ty,
- PTy->getElementType()));
- }
-
- if (!IsAlias && !PTy->getElementType()->isFunctionTy()) {
- return error(ExplicitTypeLoc,
- "explicit pointee type should be a function type");
+ if (IsAlias) {
+ if (!PTy->isOpaqueOrPointeeTypeMatches(Ty))
+ return error(
+ ExplicitTypeLoc,
+ typeComparisonErrorMessage(
+ "explicit pointee type doesn't match operand's pointee type", Ty,
+ PTy->getNonOpaquePointerElementType()));
+ } else {
+ if (!PTy->isOpaque() &&
+ !PTy->getNonOpaquePointerElementType()->isFunctionTy())
+ return error(ExplicitTypeLoc,
+ "explicit pointee type should be a function type");
}
GlobalValue *GVal = nullptr;
@@ -1206,7 +1210,7 @@ bool LLParser::parseGlobal(const std::string &Name, LocTy NameLoc,
}
}
- AttrBuilder Attrs;
+ AttrBuilder Attrs(M->getContext());
LocTy BuiltinLoc;
std::vector<unsigned> FwdRefAttrGrps;
if (parseFnAttributeValuePairs(Attrs, FwdRefAttrGrps, false, BuiltinLoc))
@@ -1235,13 +1239,18 @@ bool LLParser::parseUnnamedAttrGrp() {
Lex.Lex();
if (parseToken(lltok::equal, "expected '=' here") ||
- parseToken(lltok::lbrace, "expected '{' here") ||
- parseFnAttributeValuePairs(NumberedAttrBuilders[VarID], unused, true,
- BuiltinLoc) ||
+ parseToken(lltok::lbrace, "expected '{' here"))
+ return true;
+
+ auto R = NumberedAttrBuilders.find(VarID);
+ if (R == NumberedAttrBuilders.end())
+ R = NumberedAttrBuilders.emplace(VarID, AttrBuilder(M->getContext())).first;
+
+ if (parseFnAttributeValuePairs(R->second, unused, true, BuiltinLoc) ||
parseToken(lltok::rbrace, "expected end of attribute group"))
return true;
- if (!NumberedAttrBuilders[VarID].hasAttributes())
+ if (!R->second.hasAttributes())
return error(AttrGrpLoc, "attribute group has no attributes");
return false;
@@ -1402,14 +1411,14 @@ static inline GlobalValue *createGlobalFwdRef(Module *M, PointerType *PTy) {
nullptr, GlobalVariable::NotThreadLocal,
PTy->getAddressSpace());
- if (auto *FT = dyn_cast<FunctionType>(PTy->getPointerElementType()))
+ Type *ElemTy = PTy->getNonOpaquePointerElementType();
+ if (auto *FT = dyn_cast<FunctionType>(ElemTy))
return Function::Create(FT, GlobalValue::ExternalWeakLinkage,
PTy->getAddressSpace(), "", M);
else
- return new GlobalVariable(*M, PTy->getPointerElementType(), false,
- GlobalValue::ExternalWeakLinkage, nullptr, "",
- nullptr, GlobalVariable::NotThreadLocal,
- PTy->getAddressSpace());
+ return new GlobalVariable(
+ *M, ElemTy, false, GlobalValue::ExternalWeakLinkage, nullptr, "",
+ nullptr, GlobalVariable::NotThreadLocal, PTy->getAddressSpace());
}
Value *LLParser::checkValidVariableType(LocTy Loc, const Twine &Name, Type *Ty,
@@ -2372,11 +2381,12 @@ bool LLParser::parseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
// parse the argument.
LocTy ArgLoc;
Type *ArgTy = nullptr;
- AttrBuilder ArgAttrs;
Value *V;
if (parseType(ArgTy, ArgLoc))
return true;
+ AttrBuilder ArgAttrs(M->getContext());
+
if (ArgTy->isMetadataTy()) {
if (parseMetadataAsValue(V, PFS))
return true;
@@ -2493,7 +2503,7 @@ bool LLParser::parseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
} else {
LocTy TypeLoc = Lex.getLoc();
Type *ArgTy = nullptr;
- AttrBuilder Attrs;
+ AttrBuilder Attrs(M->getContext());
std::string Name;
if (parseType(ArgTy) || parseOptionalParamAttrs(Attrs))
@@ -3579,7 +3589,7 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
ExplicitTypeLoc,
typeComparisonErrorMessage(
"explicit pointee type doesn't match operand's pointee type",
- Ty, BasePointerType->getElementType()));
+ Ty, BasePointerType->getNonOpaquePointerElementType()));
}
unsigned GEPWidth =
@@ -4541,16 +4551,17 @@ bool LLParser::parseDIStringType(MDNode *&Result, bool IsDistinct) {
OPTIONAL(name, MDStringField, ); \
OPTIONAL(stringLength, MDField, ); \
OPTIONAL(stringLengthExpression, MDField, ); \
+ OPTIONAL(stringLocationExpression, MDField, ); \
OPTIONAL(size, MDUnsignedField, (0, UINT64_MAX)); \
OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX)); \
OPTIONAL(encoding, DwarfAttEncodingField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
- Result = GET_OR_DISTINCT(DIStringType,
- (Context, tag.Val, name.Val, stringLength.Val,
- stringLengthExpression.Val, size.Val, align.Val,
- encoding.Val));
+ Result = GET_OR_DISTINCT(
+ DIStringType,
+ (Context, tag.Val, name.Val, stringLength.Val, stringLengthExpression.Val,
+ stringLocationExpression.Val, size.Val, align.Val, encoding.Val));
return false;
}
@@ -5462,7 +5473,7 @@ bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine) {
unsigned Visibility;
unsigned DLLStorageClass;
bool DSOLocal;
- AttrBuilder RetAttrs;
+ AttrBuilder RetAttrs(M->getContext());
unsigned CC;
bool HasLinkage;
Type *RetType = nullptr;
@@ -5525,7 +5536,7 @@ bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine) {
SmallVector<ArgInfo, 8> ArgList;
bool IsVarArg;
- AttrBuilder FuncAttrs;
+ AttrBuilder FuncAttrs(M->getContext());
std::vector<unsigned> FwdRefAttrGrps;
LocTy BuiltinLoc;
std::string Section;
@@ -5593,7 +5604,7 @@ bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine) {
if (FRVI != ForwardRefVals.end()) {
FwdFn = FRVI->second.first;
if (!FwdFn->getType()->isOpaque()) {
- if (!FwdFn->getType()->getPointerElementType()->isFunctionTy())
+ if (!FwdFn->getType()->getNonOpaquePointerElementType()->isFunctionTy())
return error(FRVI->second.second, "invalid forward reference to "
"function as global value!");
if (FwdFn->getType() != PFT)
@@ -6248,7 +6259,7 @@ bool LLParser::parseIndirectBr(Instruction *&Inst, PerFunctionState &PFS) {
/// OptionalAttrs 'to' TypeAndValue 'unwind' TypeAndValue
bool LLParser::parseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
LocTy CallLoc = Lex.getLoc();
- AttrBuilder RetAttrs, FnAttrs;
+ AttrBuilder RetAttrs(M->getContext()), FnAttrs(M->getContext());
std::vector<unsigned> FwdRefAttrGrps;
LocTy NoBuiltinLoc;
unsigned CC;
@@ -6558,7 +6569,7 @@ bool LLParser::parseUnaryOp(Instruction *&Inst, PerFunctionState &PFS,
/// '[' LabelList ']'
bool LLParser::parseCallBr(Instruction *&Inst, PerFunctionState &PFS) {
LocTy CallLoc = Lex.getLoc();
- AttrBuilder RetAttrs, FnAttrs;
+ AttrBuilder RetAttrs(M->getContext()), FnAttrs(M->getContext());
std::vector<unsigned> FwdRefAttrGrps;
LocTy NoBuiltinLoc;
unsigned CC;
@@ -6975,7 +6986,7 @@ bool LLParser::parseFreeze(Instruction *&Inst, PerFunctionState &PFS) {
/// OptionalAttrs Type Value ParameterList OptionalAttrs
bool LLParser::parseCall(Instruction *&Inst, PerFunctionState &PFS,
CallInst::TailCallKind TCK) {
- AttrBuilder RetAttrs, FnAttrs;
+ AttrBuilder RetAttrs(M->getContext()), FnAttrs(M->getContext());
std::vector<unsigned> FwdRefAttrGrps;
LocTy BuiltinLoc;
unsigned CallAddrSpace;
@@ -7196,7 +7207,7 @@ int LLParser::parseLoad(Instruction *&Inst, PerFunctionState &PFS) {
ExplicitTypeLoc,
typeComparisonErrorMessage(
"explicit pointee type doesn't match operand's pointee type", Ty,
- cast<PointerType>(Val->getType())->getElementType()));
+ Val->getType()->getNonOpaquePointerElementType()));
}
SmallPtrSet<Type *, 4> Visited;
if (!Alignment && !Ty->isSized(&Visited))
@@ -7456,7 +7467,7 @@ int LLParser::parseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
ExplicitTypeLoc,
typeComparisonErrorMessage(
"explicit pointee type doesn't match operand's pointee type", Ty,
- BasePointerType->getElementType()));
+ BasePointerType->getNonOpaquePointerElementType()));
}
SmallVector<Value*, 16> Indices;
diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
index 284e469a1d2f..99d2c8221281 100644
--- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
+++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
@@ -12,8 +12,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLForwardCompat.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/BinaryFormat/MsgPackDocument.h"
+
+#include <map>
+#include <utility>
namespace llvm {
namespace AMDGPU {
diff --git a/llvm/lib/BinaryFormat/ELF.cpp b/llvm/lib/BinaryFormat/ELF.cpp
index 2ede63f464d3..e2e601b6d90f 100644
--- a/llvm/lib/BinaryFormat/ELF.cpp
+++ b/llvm/lib/BinaryFormat/ELF.cpp
@@ -7,9 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/Error.h"
using namespace llvm;
using namespace ELF;
diff --git a/llvm/lib/BinaryFormat/Magic.cpp b/llvm/lib/BinaryFormat/Magic.cpp
index 8c7f7b7043a0..044e4840cb3b 100644
--- a/llvm/lib/BinaryFormat/Magic.cpp
+++ b/llvm/lib/BinaryFormat/Magic.cpp
@@ -10,10 +10,8 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/COFF.h"
-#include "llvm/BinaryFormat/ELF.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/Support/Endian.h"
-#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#if !defined(_MSC_VER) && !defined(__MINGW32__)
@@ -88,7 +86,10 @@ file_magic llvm::identify_magic(StringRef Magic) {
if (startswith(Magic, "!<arch>\n") || startswith(Magic, "!<thin>\n"))
return file_magic::archive;
break;
-
+ case '<':
+ if (startswith(Magic, "<bigaf>\n"))
+ return file_magic::archive;
+ break;
case '\177':
if (startswith(Magic, "\177ELF") && Magic.size() >= 18) {
bool Data2MSB = Magic[5] == 2;
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index a36b256c29b6..ffef35299981 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -781,7 +781,7 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel,
uint64_t MetadataIndexOffset = 0;
// Read all the records for this block.
- while (1) {
+ while (true) {
if (Stream.AtEndOfStream())
return reportError("Premature end of bitstream");
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index f5a878f8788a..720ab560f988 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -1349,7 +1349,7 @@ Error BitcodeReader::parseAttributeBlock() {
return error("Invalid record");
for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
- AttrBuilder B;
+ AttrBuilder B(Context);
decodeLLVMAttributesForBitcode(B, Record[i+1]);
Attrs.push_back(AttributeList::get(Context, Record[i], B));
}
@@ -1591,7 +1591,7 @@ Error BitcodeReader::parseAttributeGroupBlock() {
uint64_t GrpID = Record[0];
uint64_t Idx = Record[1]; // Index of the object this attribute refers to.
- AttrBuilder B;
+ AttrBuilder B(Context);
for (unsigned i = 2, e = Record.size(); i != e; ++i) {
if (Record[i] == 0) { // Enum attribute
Attribute::AttrKind Kind;
@@ -2702,7 +2702,7 @@ Error BitcodeReader::parseConstants() {
PointerType *OrigPtrTy = cast<PointerType>(Elt0FullTy->getScalarType());
if (!PointeeType)
- PointeeType = OrigPtrTy->getElementType();
+ PointeeType = OrigPtrTy->getPointerElementType();
else if (!OrigPtrTy->isOpaqueOrPointeeTypeMatches(PointeeType))
return error("Explicit gep operator type does not match pointee type "
"of pointer operand");
@@ -2824,9 +2824,9 @@ Error BitcodeReader::parseConstants() {
for (unsigned i = 0; i != ConstStrSize; ++i)
ConstrStr += (char)Record[3+AsmStrSize+i];
UpgradeInlineAsmString(&AsmStr);
- V = InlineAsm::get(
- cast<FunctionType>(cast<PointerType>(CurTy)->getElementType()),
- AsmStr, ConstrStr, HasSideEffects, IsAlignStack);
+ // FIXME: support upgrading in opaque pointers mode.
+ V = InlineAsm::get(cast<FunctionType>(CurTy->getPointerElementType()),
+ AsmStr, ConstrStr, HasSideEffects, IsAlignStack);
break;
}
// This version adds support for the asm dialect keywords (e.g.,
@@ -2850,37 +2850,74 @@ Error BitcodeReader::parseConstants() {
for (unsigned i = 0; i != ConstStrSize; ++i)
ConstrStr += (char)Record[3+AsmStrSize+i];
UpgradeInlineAsmString(&AsmStr);
- V = InlineAsm::get(
- cast<FunctionType>(cast<PointerType>(CurTy)->getElementType()),
- AsmStr, ConstrStr, HasSideEffects, IsAlignStack,
- InlineAsm::AsmDialect(AsmDialect));
+ // FIXME: support upgrading in opaque pointers mode.
+ V = InlineAsm::get(cast<FunctionType>(CurTy->getPointerElementType()),
+ AsmStr, ConstrStr, HasSideEffects, IsAlignStack,
+ InlineAsm::AsmDialect(AsmDialect));
break;
}
// This version adds support for the unwind keyword.
- case bitc::CST_CODE_INLINEASM: {
+ case bitc::CST_CODE_INLINEASM_OLD3: {
if (Record.size() < 2)
return error("Invalid record");
+ unsigned OpNum = 0;
std::string AsmStr, ConstrStr;
- bool HasSideEffects = Record[0] & 1;
- bool IsAlignStack = (Record[0] >> 1) & 1;
- unsigned AsmDialect = (Record[0] >> 2) & 1;
- bool CanThrow = (Record[0] >> 3) & 1;
- unsigned AsmStrSize = Record[1];
- if (2 + AsmStrSize >= Record.size())
+ bool HasSideEffects = Record[OpNum] & 1;
+ bool IsAlignStack = (Record[OpNum] >> 1) & 1;
+ unsigned AsmDialect = (Record[OpNum] >> 2) & 1;
+ bool CanThrow = (Record[OpNum] >> 3) & 1;
+ ++OpNum;
+ unsigned AsmStrSize = Record[OpNum];
+ ++OpNum;
+ if (OpNum + AsmStrSize >= Record.size())
return error("Invalid record");
- unsigned ConstStrSize = Record[2 + AsmStrSize];
- if (3 + AsmStrSize + ConstStrSize > Record.size())
+ unsigned ConstStrSize = Record[OpNum + AsmStrSize];
+ if (OpNum + 1 + AsmStrSize + ConstStrSize > Record.size())
return error("Invalid record");
for (unsigned i = 0; i != AsmStrSize; ++i)
- AsmStr += (char)Record[2 + i];
+ AsmStr += (char)Record[OpNum + i];
+ ++OpNum;
for (unsigned i = 0; i != ConstStrSize; ++i)
- ConstrStr += (char)Record[3 + AsmStrSize + i];
+ ConstrStr += (char)Record[OpNum + AsmStrSize + i];
UpgradeInlineAsmString(&AsmStr);
- V = InlineAsm::get(
- cast<FunctionType>(cast<PointerType>(CurTy)->getElementType()),
- AsmStr, ConstrStr, HasSideEffects, IsAlignStack,
- InlineAsm::AsmDialect(AsmDialect), CanThrow);
+ // FIXME: support upgrading in opaque pointers mode.
+ V = InlineAsm::get(cast<FunctionType>(CurTy->getPointerElementType()),
+ AsmStr, ConstrStr, HasSideEffects, IsAlignStack,
+ InlineAsm::AsmDialect(AsmDialect), CanThrow);
+ break;
+ }
+ // This version adds explicit function type.
+ case bitc::CST_CODE_INLINEASM: {
+ if (Record.size() < 3)
+ return error("Invalid record");
+ unsigned OpNum = 0;
+ auto *FnTy = dyn_cast_or_null<FunctionType>(getTypeByID(Record[OpNum]));
+ ++OpNum;
+ if (!FnTy)
+ return error("Invalid record");
+ std::string AsmStr, ConstrStr;
+ bool HasSideEffects = Record[OpNum] & 1;
+ bool IsAlignStack = (Record[OpNum] >> 1) & 1;
+ unsigned AsmDialect = (Record[OpNum] >> 2) & 1;
+ bool CanThrow = (Record[OpNum] >> 3) & 1;
+ ++OpNum;
+ unsigned AsmStrSize = Record[OpNum];
+ ++OpNum;
+ if (OpNum + AsmStrSize >= Record.size())
+ return error("Invalid record");
+ unsigned ConstStrSize = Record[OpNum + AsmStrSize];
+ if (OpNum + 1 + AsmStrSize + ConstStrSize > Record.size())
+ return error("Invalid record");
+
+ for (unsigned i = 0; i != AsmStrSize; ++i)
+ AsmStr += (char)Record[OpNum + i];
+ ++OpNum;
+ for (unsigned i = 0; i != ConstStrSize; ++i)
+ ConstrStr += (char)Record[OpNum + AsmStrSize + i];
+ UpgradeInlineAsmString(&AsmStr);
+ V = InlineAsm::get(FnTy, AsmStr, ConstrStr, HasSideEffects, IsAlignStack,
+ InlineAsm::AsmDialect(AsmDialect), CanThrow);
break;
}
case bitc::CST_CODE_BLOCKADDRESS:{
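For reference, the record shape the new CST_CODE_INLINEASM case decodes, reconstructed from the reads above; it differs from CST_CODE_INLINEASM_OLD3 only by the leading explicit function type, which avoids looking through the (possibly opaque) pointer type of the constant:

[0]                     type ID of the inline asm's FunctionType
[1]                     flags: bit0 sideeffect, bit1 alignstack, bit2 asmdialect, bit3 canthrow
[2]                     AsmStrSize
[3 .. 2+AsmStrSize]     asm string bytes
[3+AsmStrSize]          ConstStrSize
[4+AsmStrSize .. end]   constraint string bytes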
@@ -3242,7 +3279,7 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
if (!Ty->isPointerTy())
return error("Invalid type for value");
AddressSpace = cast<PointerType>(Ty)->getAddressSpace();
- Ty = cast<PointerType>(Ty)->getElementType();
+ Ty = Ty->getPointerElementType();
}
uint64_t RawLinkage = Record[3];
@@ -3335,7 +3372,7 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) {
if (!FTy)
return error("Invalid record");
if (auto *PTy = dyn_cast<PointerType>(FTy))
- FTy = PTy->getElementType();
+ FTy = PTy->getPointerElementType();
if (!isa<FunctionType>(FTy))
return error("Invalid type for value");
@@ -3376,7 +3413,7 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) {
Func->removeParamAttr(i, Kind);
Type *PTy = cast<FunctionType>(FTy)->getParamType(i);
- Type *PtrEltTy = cast<PointerType>(PTy)->getElementType();
+ Type *PtrEltTy = PTy->getPointerElementType();
Attribute NewAttr;
switch (Kind) {
case Attribute::ByVal:
@@ -3499,7 +3536,7 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord(
auto *PTy = dyn_cast<PointerType>(Ty);
if (!PTy)
return error("Invalid type for value");
- Ty = PTy->getElementType();
+ Ty = PTy->getPointerElementType();
AddrSpace = PTy->getAddressSpace();
} else {
AddrSpace = Record[OpNum++];
@@ -3795,6 +3832,11 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
if (Error Err = parseComdatRecord(Record))
return Err;
break;
+ // FIXME: BitcodeReader should handle {GLOBALVAR, FUNCTION, ALIAS, IFUNC}
+ // written by ThinLinkBitcodeWriter. See
+ // `ThinLinkBitcodeWriter::writeSimplifiedModuleInfo` for the format of each
+ // record
+ // (https://github.com/llvm/llvm-project/blob/b6a93967d9c11e79802b5e75cec1584d6c8aa472/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp#L4714)
case bitc::MODULE_CODE_GLOBALVAR:
if (Error Err = parseGlobalVarRecord(Record))
return Err;
@@ -3857,12 +3899,13 @@ void BitcodeReader::propagateAttributeTypes(CallBase *CB,
for (unsigned i = 0; i != CB->arg_size(); ++i) {
for (Attribute::AttrKind Kind : {Attribute::ByVal, Attribute::StructRet,
Attribute::InAlloca}) {
- if (!CB->paramHasAttr(i, Kind))
+ if (!CB->paramHasAttr(i, Kind) ||
+ CB->getParamAttr(i, Kind).getValueAsType())
continue;
CB->removeParamAttr(i, Kind);
- Type *PtrEltTy = cast<PointerType>(ArgsTys[i])->getElementType();
+ Type *PtrEltTy = ArgsTys[i]->getPointerElementType();
Attribute NewAttr;
switch (Kind) {
case Attribute::ByVal:
@@ -3882,11 +3925,28 @@ void BitcodeReader::propagateAttributeTypes(CallBase *CB,
}
}
+ if (CB->isInlineAsm()) {
+ const InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
+ unsigned ArgNo = 0;
+ for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+ if (!CI.hasArg())
+ continue;
+
+ if (CI.isIndirect && !CB->getAttributes().getParamElementType(ArgNo)) {
+ Type *ElemTy = ArgsTys[ArgNo]->getPointerElementType();
+ CB->addParamAttr(
+ ArgNo, Attribute::get(Context, Attribute::ElementType, ElemTy));
+ }
+
+ ArgNo++;
+ }
+ }
+
switch (CB->getIntrinsicID()) {
case Intrinsic::preserve_array_access_index:
case Intrinsic::preserve_struct_access_index:
if (!CB->getAttributes().getParamElementType(0)) {
- Type *ElTy = cast<PointerType>(ArgsTys[0])->getElementType();
+ Type *ElTy = ArgsTys[0]->getPointerElementType();
Attribute NewAttr = Attribute::get(Context, Attribute::ElementType, ElTy);
CB->addParamAttr(0, NewAttr);
}
@@ -4176,8 +4236,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
return error("Invalid record");
if (!Ty) {
- Ty = cast<PointerType>(BasePtr->getType()->getScalarType())
- ->getElementType();
+ Ty = BasePtr->getType()->getScalarType()->getPointerElementType();
} else if (!cast<PointerType>(BasePtr->getType()->getScalarType())
->isOpaqueOrPointeeTypeMatches(Ty)) {
return error(
@@ -4693,8 +4752,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (!CalleeTy)
return error("Callee is not a pointer");
if (!FTy) {
- FTy = dyn_cast<FunctionType>(
- cast<PointerType>(Callee->getType())->getElementType());
+ FTy =
+ dyn_cast<FunctionType>(Callee->getType()->getPointerElementType());
if (!FTy)
return error("Callee is not of pointer to function type");
} else if (!CalleeTy->isOpaqueOrPointeeTypeMatches(FTy))
@@ -4774,26 +4833,29 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (!OpTy)
return error("Callee is not a pointer type");
if (!FTy) {
- FTy = dyn_cast<FunctionType>(
- cast<PointerType>(Callee->getType())->getElementType());
+ FTy =
+ dyn_cast<FunctionType>(Callee->getType()->getPointerElementType());
if (!FTy)
return error("Callee is not of pointer to function type");
- } else if (cast<PointerType>(Callee->getType())->getElementType() != FTy)
+ } else if (!OpTy->isOpaqueOrPointeeTypeMatches(FTy))
return error("Explicit call type does not match pointee type of "
"callee operand");
if (Record.size() < FTy->getNumParams() + OpNum)
return error("Insufficient operands to call");
SmallVector<Value*, 16> Args;
+ SmallVector<Type *, 16> ArgsTys;
// Read the fixed params.
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
+ Value *Arg;
if (FTy->getParamType(i)->isLabelTy())
- Args.push_back(getBasicBlock(Record[OpNum]));
+ Arg = getBasicBlock(Record[OpNum]);
else
- Args.push_back(getValue(Record, OpNum, NextValueNo,
- FTy->getParamType(i)));
- if (!Args.back())
+ Arg = getValue(Record, OpNum, NextValueNo, FTy->getParamType(i));
+ if (!Arg)
return error("Invalid record");
+ Args.push_back(Arg);
+ ArgsTys.push_back(Arg->getType());
}
// Read type/value pairs for varargs params.
@@ -4806,6 +4868,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (getValueTypePair(Record, OpNum, NextValueNo, Op))
return error("Invalid record");
Args.push_back(Op);
+ ArgsTys.push_back(Op->getType());
}
}
@@ -4816,6 +4879,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
cast<CallBrInst>(I)->setCallingConv(
static_cast<CallingConv::ID>((0x7ff & CCInfo) >> bitc::CALL_CCONV));
cast<CallBrInst>(I)->setAttributes(PAL);
+ propagateAttributeTypes(cast<CallBase>(I), ArgsTys);
break;
}
case bitc::FUNC_CODE_INST_UNREACHABLE: // UNREACHABLE
@@ -4932,7 +4996,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
auto *PTy = dyn_cast_or_null<PointerType>(Ty);
if (!PTy)
return error("Old-style alloca with a non-pointer type");
- Ty = PTy->getElementType();
+ Ty = PTy->getPointerElementType();
}
Type *OpTy = getTypeByID(Record[1]);
Value *Size = getFnValueByID(Record[2], OpTy);
@@ -4977,7 +5041,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (OpNum + 3 == Record.size()) {
Ty = getTypeByID(Record[OpNum++]);
} else {
- Ty = cast<PointerType>(Op->getType())->getElementType();
+ Ty = Op->getType()->getPointerElementType();
}
if (Error Err = typeCheckLoadStoreInst(Ty, Op->getType()))
@@ -5010,7 +5074,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (OpNum + 5 == Record.size()) {
Ty = getTypeByID(Record[OpNum++]);
} else {
- Ty = cast<PointerType>(Op->getType())->getElementType();
+ Ty = Op->getType()->getPointerElementType();
}
if (Error Err = typeCheckLoadStoreInst(Ty, Op->getType()))
@@ -5042,8 +5106,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
(BitCode == bitc::FUNC_CODE_INST_STORE
? getValueTypePair(Record, OpNum, NextValueNo, Val)
: popValue(Record, OpNum, NextValueNo,
- cast<PointerType>(Ptr->getType())->getElementType(),
- Val)) ||
+ Ptr->getType()->getPointerElementType(), Val)) ||
OpNum + 2 != Record.size())
return error("Invalid record");
@@ -5071,8 +5134,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
(BitCode == bitc::FUNC_CODE_INST_STOREATOMIC
? getValueTypePair(Record, OpNum, NextValueNo, Val)
: popValue(Record, OpNum, NextValueNo,
- cast<PointerType>(Ptr->getType())->getElementType(),
- Val)) ||
+ Ptr->getType()->getPointerElementType(), Val)) ||
OpNum + 4 != Record.size())
return error("Invalid record");
@@ -5323,8 +5385,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (!OpTy)
return error("Callee is not a pointer type");
if (!FTy) {
- FTy = dyn_cast<FunctionType>(
- cast<PointerType>(Callee->getType())->getElementType());
+ FTy =
+ dyn_cast<FunctionType>(Callee->getType()->getPointerElementType());
if (!FTy)
return error("Callee is not of pointer to function type");
} else if (!OpTy->isOpaqueOrPointeeTypeMatches(FTy))
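
A minimal standalone sketch (not part of the patch) of how the new-style inline asm constant record above is laid out, with a made-up DecodedInlineAsm helper and the record already split into 64-bit fields. Field order and flag bits mirror the reader and writer hunks: [fnty, flags, asmstrsize, asmstr..., conststrsize, conststr...], where flags packs sideeffects | alignstack<<1 | dialect<<2 | canthrow<<3.

#include <cstdint>
#include <optional>
#include <string>
#include <vector>

struct DecodedInlineAsm {          // hypothetical helper type, for illustration
  uint64_t FnTypeID;
  bool HasSideEffects, IsAlignStack, CanThrow;
  unsigned AsmDialect;
  std::string AsmStr, ConstraintStr;
};

std::optional<DecodedInlineAsm> decode(const std::vector<uint64_t> &Record) {
  if (Record.size() < 3)
    return std::nullopt;                    // "Invalid record"
  DecodedInlineAsm D;
  D.FnTypeID = Record[0];                   // explicit function type ID
  uint64_t Flags = Record[1];
  D.HasSideEffects = Flags & 1;             // bit 0
  D.IsAlignStack = (Flags >> 1) & 1;        // bit 1
  D.AsmDialect = (Flags >> 2) & 1;          // bit 2
  D.CanThrow = (Flags >> 3) & 1;            // bit 3
  uint64_t AsmLen = Record[2];
  if (3 + AsmLen >= Record.size())
    return std::nullopt;                    // constraint-length slot missing
  uint64_t ConstrLen = Record[3 + AsmLen];
  if (4 + AsmLen + ConstrLen > Record.size())
    return std::nullopt;                    // constraint bytes missing
  for (uint64_t i = 0; i != AsmLen; ++i)
    D.AsmStr += char(Record[3 + i]);
  for (uint64_t i = 0; i != ConstrLen; ++i)
    D.ConstraintStr += char(Record[4 + AsmLen + i]);
  return D;
}
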
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index 60530d7f7a00..0f4111514057 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -1105,7 +1105,7 @@ void MetadataLoader::MetadataLoaderImpl::lazyLoadOneMetadata(
void MetadataLoader::MetadataLoaderImpl::resolveForwardRefsAndPlaceholders(
PlaceholderQueue &Placeholders) {
DenseSet<unsigned> Temporaries;
- while (1) {
+ while (true) {
// Populate Temporaries with the placeholders that haven't been loaded yet.
Placeholders.getTemporaries(MetadataList, Temporaries);
@@ -1423,15 +1423,21 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_STRING_TYPE: {
- if (Record.size() != 8)
+ if (Record.size() > 9 || Record.size() < 8)
return error("Invalid record");
IsDistinct = Record[0];
+ bool SizeIs8 = Record.size() == 8;
+ // StringLocationExp (i.e. Record[5]) is added at a later time
+ // than the other fields. The code here enables backward compatibility.
+ Metadata *StringLocationExp = SizeIs8 ? nullptr : getMDOrNull(Record[5]);
+ unsigned Offset = SizeIs8 ? 5 : 6;
MetadataList.assignValue(
GET_OR_DISTINCT(DIStringType,
(Context, Record[1], getMDString(Record[2]),
getMDOrNull(Record[3]), getMDOrNull(Record[4]),
- Record[5], Record[6], Record[7])),
+ StringLocationExp, Record[Offset], Record[Offset + 1],
+ Record[Offset + 2])),
NextMetadataNo);
NextMetadataNo++;
break;
@@ -1632,7 +1638,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
Record.size() <= 16 ? true : Record[16],
Record.size() <= 17 ? false : Record[17],
Record.size() <= 18 ? 0 : Record[18],
- Record.size() <= 19 ? 0 : Record[19],
+ Record.size() <= 19 ? false : Record[19],
Record.size() <= 20 ? nullptr : getMDString(Record[20]),
Record.size() <= 21 ? nullptr : getMDString(Record[21]));
@@ -1675,7 +1681,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
SPFlags = DISubprogram::toSPFlags(
/*IsLocalToUnit=*/Record[7], /*IsDefinition=*/Record[8],
/*IsOptimized=*/Record[14], /*Virtuality=*/Record[11],
- /*DIFlagMainSubprogram=*/HasOldMainSubprogramFlag);
+ /*IsMainSubprogram=*/HasOldMainSubprogramFlag);
// All definitions should be distinct.
IsDistinct = (Record[0] & 1) || (SPFlags & DISubprogram::SPFlagDefinition);
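
The METADATA_STRING_TYPE hunk above accepts both the old 8-field layout and the new 9-field one, which inserts StringLocationExp at index 5 and shifts the trailing scalar fields by one. A small standalone sketch of that mapping, with a hypothetical StringTypeFields struct standing in for the DIStringType constructor arguments:

#include <cstdint>
#include <vector>

struct StringTypeFields {                // hypothetical, for illustration only
  bool HasStringLocationExp;             // Record[5] when 9 fields are present
  uint64_t SizeInBits, AlignInBits, Encoding;
};

bool mapStringTypeRecord(const std::vector<uint64_t> &Record,
                         StringTypeFields &Out) {
  if (Record.size() < 8 || Record.size() > 9)
    return false;                        // "Invalid record"
  bool SizeIs8 = Record.size() == 8;     // old writers emitted 8 fields
  Out.HasStringLocationExp = !SizeIs8;
  unsigned Offset = SizeIs8 ? 5 : 6;     // first trailing scalar field
  Out.SizeInBits = Record[Offset];
  Out.AlignInBits = Record[Offset + 1];
  Out.Encoding = Record[Offset + 2];
  return true;
}
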
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index dc06bc10cf95..eb4e09ea3a26 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -948,7 +948,7 @@ void ModuleBitcodeWriter::writeTypeTable() {
} else {
// POINTER: [pointee type, address space]
Code = bitc::TYPE_CODE_POINTER;
- TypeVals.push_back(VE.getTypeID(PTy->getElementType()));
+ TypeVals.push_back(VE.getTypeID(PTy->getNonOpaquePointerElementType()));
TypeVals.push_back(AddressSpace);
if (AddressSpace == 0)
AbbrevToUse = PtrAbbrev;
@@ -1657,6 +1657,7 @@ void ModuleBitcodeWriter::writeDIStringType(const DIStringType *N,
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
Record.push_back(VE.getMetadataOrNullID(N->getStringLength()));
Record.push_back(VE.getMetadataOrNullID(N->getStringLengthExp()));
+ Record.push_back(VE.getMetadataOrNullID(N->getStringLocationExp()));
Record.push_back(N->getSizeInBits());
Record.push_back(N->getAlignInBits());
Record.push_back(N->getEncoding());
@@ -2458,6 +2459,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
}
if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
+ Record.push_back(VE.getTypeID(IA->getFunctionType()));
Record.push_back(
unsigned(IA->hasSideEffects()) | unsigned(IA->isAlignStack()) << 1 |
unsigned(IA->getDialect() & 1) << 2 | unsigned(IA->canThrow()) << 3);
diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
index df4f1a1873d7..01f7e85bd60e 100644
--- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -386,8 +386,10 @@ ValueEnumerator::ValueEnumerator(const Module &M,
}
// Enumerate the ifuncs.
- for (const GlobalIFunc &GIF : M.ifuncs())
+ for (const GlobalIFunc &GIF : M.ifuncs()) {
EnumerateValue(&GIF);
+ EnumerateType(GIF.getValueType());
+ }
// Remember what is the cutoff between globalvalue's and other constants.
unsigned FirstConstant = Values.size();
diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 5c64622c7245..bb71d72256d8 100644
--- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -120,8 +120,7 @@ bool AggressiveAntiDepState::IsLive(unsigned Reg) {
AggressiveAntiDepBreaker::AggressiveAntiDepBreaker(
MachineFunction &MFi, const RegisterClassInfo &RCI,
TargetSubtargetInfo::RegClassVector &CriticalPathRCs)
- : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()),
- TII(MF.getSubtarget().getInstrInfo()),
+ : MF(MFi), MRI(MF.getRegInfo()), TII(MF.getSubtarget().getInstrInfo()),
TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RCI) {
/* Collect a bitset of all registers that are only broken if they
are on the critical path. */
diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp
index 7e68e5e22879..e8fef505e43d 100644
--- a/llvm/lib/CodeGen/Analysis.cpp
+++ b/llvm/lib/CodeGen/Analysis.cpp
@@ -577,9 +577,9 @@ bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I,
bool &ADS = AllowDifferingSizes ? *AllowDifferingSizes : DummyADS;
ADS = true;
- AttrBuilder CallerAttrs(F->getAttributes(), AttributeList::ReturnIndex);
- AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(),
- AttributeList::ReturnIndex);
+ AttrBuilder CallerAttrs(F->getContext(), F->getAttributes().getRetAttrs());
+ AttrBuilder CalleeAttrs(F->getContext(),
+ cast<CallInst>(I)->getAttributes().getRetAttrs());
// Following attributes are completely benign as far as calling convention
// goes, they shouldn't affect whether the call is a tail call.
diff --git a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
index 964cef75d164..03e63321e3c4 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AIXException.cpp
@@ -23,6 +23,8 @@ namespace llvm {
AIXException::AIXException(AsmPrinter *A) : DwarfCFIExceptionBase(A) {}
+void AIXException::markFunctionEnd() { endFragment(); }
+
void AIXException::emitExceptionInfoTable(const MCSymbol *LSDA,
const MCSymbol *PerSym) {
// Generate EH Info Table.
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 533f20535655..4f3f798fe6f8 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -247,6 +247,11 @@ void AsmPrinter::emitInitialRawDwarfLocDirective(const MachineFunction &MF) {
if (DD) {
assert(OutStreamer->hasRawTextSupport() &&
"Expected assembly output mode.");
+ // This is NVPTX specific and it's unclear why.
+ // PR51079: If we have code without debug information we need to give up.
+ DISubprogram *MFSP = MF.getFunction().getSubprogram();
+ if (!MFSP)
+ return;
(void)DD->emitInitialLocDirective(MF, /*CUID=*/0);
}
}
@@ -2477,7 +2482,8 @@ void AsmPrinter::emitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
// two boundary. If a global value is specified, and if that global has
// an explicit alignment requested, it will override the alignment request
// if required for correctness.
-void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const {
+void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV,
+ unsigned MaxBytesToEmit) const {
if (GV)
Alignment = getGVAlignment(GV, GV->getParent()->getDataLayout(), Alignment);
@@ -2490,9 +2496,9 @@ void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const {
STI = &getSubtargetInfo();
else
STI = TM.getMCSubtargetInfo();
- OutStreamer->emitCodeAlignment(Alignment.value(), STI);
+ OutStreamer->emitCodeAlignment(Alignment.value(), STI, MaxBytesToEmit);
} else
- OutStreamer->emitValueToAlignment(Alignment.value());
+ OutStreamer->emitValueToAlignment(Alignment.value(), 0, 1, MaxBytesToEmit);
}
//===----------------------------------------------------------------------===//
@@ -3286,7 +3292,7 @@ void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
// Emit an alignment directive for this block, if needed.
const Align Alignment = MBB.getAlignment();
if (Alignment != Align(1))
- emitAlignment(Alignment);
+ emitAlignment(Alignment, nullptr, MBB.getMaxBytesForAlignment());
// Switch to a new section if this basic block must begin a section. The
// entry block is always placed in the function section and is handled
@@ -3648,6 +3654,12 @@ unsigned int AsmPrinter::getDwarfOffsetByteSize() const {
OutStreamer->getContext().getDwarfFormat());
}
+dwarf::FormParams AsmPrinter::getDwarfFormParams() const {
+ return {getDwarfVersion(), uint8_t(getPointerSize()),
+ OutStreamer->getContext().getDwarfFormat(),
+ MAI->doesDwarfUseRelocationsAcrossSections()};
+}
+
unsigned int AsmPrinter::getUnitLengthFieldByteSize() const {
return dwarf::getUnitLengthFieldByteSize(
OutStreamer->getContext().getDwarfFormat());
diff --git a/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h b/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
index 5e7db1f2f76c..bd2c60eadd61 100644
--- a/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
+++ b/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
@@ -33,6 +33,7 @@ class ByteStreamer {
virtual void emitSLEB128(uint64_t DWord, const Twine &Comment = "") = 0;
virtual void emitULEB128(uint64_t DWord, const Twine &Comment = "",
unsigned PadTo = 0) = 0;
+ virtual unsigned emitDIERef(const DIE &D) = 0;
};
class APByteStreamer final : public ByteStreamer {
@@ -54,15 +55,24 @@ public:
AP.OutStreamer->AddComment(Comment);
AP.emitULEB128(DWord, nullptr, PadTo);
}
+ unsigned emitDIERef(const DIE &D) override {
+ uint64_t Offset = D.getOffset();
+ static constexpr unsigned ULEB128PadSize = 4;
+    assert(Offset < (1ULL << (ULEB128PadSize * 7)) && "Offset won't fit");
+ emitULEB128(Offset, "", ULEB128PadSize);
+ // Return how many comments to skip in DwarfDebug::emitDebugLocEntry to keep
+ // comments aligned with debug loc entries.
+ return ULEB128PadSize;
+ }
};
class HashingByteStreamer final : public ByteStreamer {
private:
DIEHash &Hash;
public:
- HashingByteStreamer(DIEHash &H) : Hash(H) {}
- void emitInt8(uint8_t Byte, const Twine &Comment) override {
- Hash.update(Byte);
+ HashingByteStreamer(DIEHash &H) : Hash(H) {}
+ void emitInt8(uint8_t Byte, const Twine &Comment) override {
+ Hash.update(Byte);
}
void emitSLEB128(uint64_t DWord, const Twine &Comment) override {
Hash.addSLEB128(DWord);
@@ -71,6 +81,10 @@ class HashingByteStreamer final : public ByteStreamer {
unsigned PadTo) override {
Hash.addULEB128(DWord);
}
+ unsigned emitDIERef(const DIE &D) override {
+ Hash.hashRawTypeReference(D);
+ return 0; // Only used together with the APByteStreamer.
+ }
};
class BufferByteStreamer final : public ByteStreamer {
@@ -115,9 +129,15 @@ public:
// with each other.
for (size_t i = 1; i < Length; ++i)
Comments.push_back("");
-
}
}
+ unsigned emitDIERef(const DIE &D) override {
+ uint64_t Offset = D.getOffset();
+ static constexpr unsigned ULEB128PadSize = 4;
+    assert(Offset < (1ULL << (ULEB128PadSize * 7)) && "Offset won't fit");
+ emitULEB128(Offset, "", ULEB128PadSize);
+ return 0; // Only used together with the APByteStreamer.
+ }
};
}
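
emitDIERef above pads the ULEB128 to a fixed 4 bytes, which leaves room for DIE offsets below 1 << 28 (four bytes of 7 payload bits each); the in-tree path then goes through AP.emitULEB128 with that PadTo. A rough standalone sketch of such a padded encoding, under the same fixed-width assumption:

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<uint8_t> encodePaddedULEB128(uint64_t Value, unsigned PadTo = 4) {
  assert(Value < (1ULL << (PadTo * 7)) && "value does not fit in PadTo bytes");
  std::vector<uint8_t> Bytes;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0 || Bytes.size() + 1 < PadTo)
      Byte |= 0x80;                      // continuation bit keeps the padding
    Bytes.push_back(Byte);
  } while (Value != 0 || Bytes.size() < PadTo);
  return Bytes;                          // exactly PadTo bytes, given the assert
}
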
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index d621108408f0..52c74713551c 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -68,6 +68,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -600,6 +601,8 @@ static SourceLanguage MapDWLangToCVLang(unsigned DWLang) {
return SourceLanguage::D;
case dwarf::DW_LANG_Swift:
return SourceLanguage::Swift;
+ case dwarf::DW_LANG_Rust:
+ return SourceLanguage::Rust;
default:
// There's no CodeView representation for this language, and CV doesn't
// have an "unknown" option for the language field, so we'll use MASM,
@@ -843,6 +846,12 @@ void CodeViewDebug::emitCompilerInformation() {
if (MMI->getModule()->getProfileSummary(/*IsCS*/ false) != nullptr) {
Flags |= static_cast<uint32_t>(CompileSym3Flags::PGO);
}
+ using ArchType = llvm::Triple::ArchType;
+ ArchType Arch = Triple(MMI->getModule()->getTargetTriple()).getArch();
+ if (Asm->TM.Options.Hotpatch || Arch == ArchType::thumb ||
+ Arch == ArchType::aarch64) {
+ Flags |= static_cast<uint32_t>(CompileSym3Flags::HotPatch);
+ }
OS.AddComment("Flags and language");
OS.emitInt32(Flags);
@@ -857,8 +866,10 @@ void CodeViewDebug::emitCompilerInformation() {
StringRef CompilerVersion = CU->getProducer();
Version FrontVer = parseVersion(CompilerVersion);
OS.AddComment("Frontend version");
- for (int N : FrontVer.Part)
+ for (int N : FrontVer.Part) {
+ N = std::min<int>(N, std::numeric_limits<uint16_t>::max());
OS.emitInt16(N);
+ }
// Some Microsoft tools, like Binscope, expect a backend version number of at
// least 8.something, so we'll coerce the LLVM version into a form that
@@ -885,6 +896,34 @@ static TypeIndex getStringIdTypeIdx(GlobalTypeTableBuilder &TypeTable,
return TypeTable.writeLeafType(SIR);
}
+static std::string flattenCommandLine(ArrayRef<std::string> Args,
+ StringRef MainFilename) {
+ std::string FlatCmdLine;
+ raw_string_ostream OS(FlatCmdLine);
+ bool PrintedOneArg = false;
+ if (!StringRef(Args[0]).contains("-cc1")) {
+ llvm::sys::printArg(OS, "-cc1", /*Quote=*/true);
+ PrintedOneArg = true;
+ }
+ for (unsigned i = 0; i < Args.size(); i++) {
+ StringRef Arg = Args[i];
+ if (Arg.empty())
+ continue;
+ if (Arg == "-main-file-name" || Arg == "-o") {
+ i++; // Skip this argument and next one.
+ continue;
+ }
+ if (Arg.startswith("-object-file-name") || Arg == MainFilename)
+ continue;
+ if (PrintedOneArg)
+ OS << " ";
+ llvm::sys::printArg(OS, Arg, /*Quote=*/true);
+ PrintedOneArg = true;
+ }
+ OS.flush();
+ return FlatCmdLine;
+}
+
void CodeViewDebug::emitBuildInfo() {
// First, make LF_BUILDINFO. It's a sequence of strings with various bits of
// build info. The known prefix is:
@@ -905,8 +944,16 @@ void CodeViewDebug::emitBuildInfo() {
getStringIdTypeIdx(TypeTable, MainSourceFile->getDirectory());
BuildInfoArgs[BuildInfoRecord::SourceFile] =
getStringIdTypeIdx(TypeTable, MainSourceFile->getFilename());
- // FIXME: Path to compiler and command line. PDB is intentionally blank unless
- // we implement /Zi type servers.
+ // FIXME: PDB is intentionally blank unless we implement /Zi type servers.
+ BuildInfoArgs[BuildInfoRecord::TypeServerPDB] =
+ getStringIdTypeIdx(TypeTable, "");
+ if (Asm->TM.Options.MCOptions.Argv0 != nullptr) {
+ BuildInfoArgs[BuildInfoRecord::BuildTool] =
+ getStringIdTypeIdx(TypeTable, Asm->TM.Options.MCOptions.Argv0);
+ BuildInfoArgs[BuildInfoRecord::CommandLine] = getStringIdTypeIdx(
+ TypeTable, flattenCommandLine(Asm->TM.Options.MCOptions.CommandLineArgs,
+ MainSourceFile->getFilename()));
+ }
BuildInfoRecord BIR(BuildInfoArgs);
TypeIndex BuildInfoIndex = TypeTable.writeLeafType(BIR);
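
flattenCommandLine above rebuilds a cc1-style command line for the LF_BUILDINFO record, dropping -main-file-name/-o and their values, -object-file-name* arguments, and the main source file itself. A rough standalone approximation of that filtering (quoting via llvm::sys::printArg is left out, and the sample arguments below are made up):

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

std::string flattenForBuildInfo(const std::vector<std::string> &Args,
                                const std::string &MainFilename) {
  std::string Out;
  auto Append = [&Out](const std::string &A) {
    if (!Out.empty())
      Out += ' ';
    Out += A;
  };
  if (Args.empty() || Args[0].find("-cc1") == std::string::npos)
    Append("-cc1");                         // make sure the line starts as cc1
  for (std::size_t i = 0; i < Args.size(); ++i) {
    const std::string &Arg = Args[i];
    if (Arg.empty())
      continue;
    if (Arg == "-main-file-name" || Arg == "-o") {
      ++i;                                  // skip the flag and its value
      continue;
    }
    if (Arg.rfind("-object-file-name", 0) == 0 || Arg == MainFilename)
      continue;
    Append(Arg);
  }
  return Out;
}

int main() {
  // Prints: -cc1 -triple x86_64-pc-windows-msvc -gcodeview
  std::cout << flattenForBuildInfo({"-cc1", "-triple",
                                    "x86_64-pc-windows-msvc",
                                    "-main-file-name", "a.cpp", "-gcodeview",
                                    "-o", "a.obj", "a.cpp"},
                                   "a.cpp")
            << "\n";
}
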
diff --git a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
index 2834d9c3ebbf..1a0256f30d41 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -274,7 +274,7 @@ LLVM_DUMP_METHOD void DIE::dump() const {
}
#endif
-unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP,
+unsigned DIE::computeOffsetsAndAbbrevs(const dwarf::FormParams &FormParams,
DIEAbbrevSet &AbbrevSet,
unsigned CUOffset) {
// Unique the abbreviation and fill in the abbreviation number so this DIE
@@ -289,7 +289,7 @@ unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP,
// Add the byte size of all the DIE attribute values.
for (const auto &V : values())
- CUOffset += V.SizeOf(AP);
+ CUOffset += V.sizeOf(FormParams);
// Let the children compute their offsets and abbreviation numbers.
if (hasChildren()) {
@@ -297,7 +297,8 @@ unsigned DIE::computeOffsetsAndAbbrevs(const AsmPrinter *AP,
assert(Abbrev.hasChildren() && "Children flag not set");
for (auto &Child : children())
- CUOffset = Child.computeOffsetsAndAbbrevs(AP, AbbrevSet, CUOffset);
+ CUOffset =
+ Child.computeOffsetsAndAbbrevs(FormParams, AbbrevSet, CUOffset);
// Each child chain is terminated with a zero byte, adjust the offset.
CUOffset += sizeof(int8_t);
@@ -335,13 +336,13 @@ void DIEValue::emitValue(const AsmPrinter *AP) const {
}
}
-unsigned DIEValue::SizeOf(const AsmPrinter *AP) const {
+unsigned DIEValue::sizeOf(const dwarf::FormParams &FormParams) const {
switch (Ty) {
case isNone:
llvm_unreachable("Expected valid DIEValue");
#define HANDLE_DIEVALUE(T) \
case is##T: \
- return getDIE##T().SizeOf(AP, Form);
+ return getDIE##T().sizeOf(FormParams, Form);
#include "llvm/CodeGen/DIEValue.def"
}
llvm_unreachable("Unknown DIE kind");
@@ -407,7 +408,8 @@ void DIEInteger::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
case dwarf::DW_FORM_strp_sup:
case dwarf::DW_FORM_addr:
case dwarf::DW_FORM_ref_addr:
- Asm->OutStreamer->emitIntValue(Integer, SizeOf(Asm, Form));
+ Asm->OutStreamer->emitIntValue(Integer,
+ sizeOf(Asm->getDwarfFormParams(), Form));
return;
case dwarf::DW_FORM_GNU_str_index:
case dwarf::DW_FORM_GNU_addr_index:
@@ -425,15 +427,12 @@ void DIEInteger::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
}
}
-/// SizeOf - Determine size of integer value in bytes.
+/// sizeOf - Determine size of integer value in bytes.
///
-unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
- assert(AP && "AsmPrinter is required to set FormParams");
- dwarf::FormParams Params = {AP->getDwarfVersion(),
- uint8_t(AP->getPointerSize()),
- AP->OutStreamer->getContext().getDwarfFormat()};
-
- if (Optional<uint8_t> FixedSize = dwarf::getFixedFormByteSize(Form, Params))
+unsigned DIEInteger::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
+ if (Optional<uint8_t> FixedSize =
+ dwarf::getFixedFormByteSize(Form, FormParams))
return *FixedSize;
switch (Form) {
@@ -464,19 +463,20 @@ void DIEInteger::print(raw_ostream &O) const {
/// EmitValue - Emit expression value.
///
void DIEExpr::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
- AP->emitDebugValue(Expr, SizeOf(AP, Form));
+ AP->emitDebugValue(Expr, sizeOf(AP->getDwarfFormParams(), Form));
}
/// SizeOf - Determine size of expression value in bytes.
///
-unsigned DIEExpr::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+unsigned DIEExpr::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_data4:
return 4;
case dwarf::DW_FORM_data8:
return 8;
case dwarf::DW_FORM_sec_offset:
- return AP->getDwarfOffsetByteSize();
+ return FormParams.getDwarfOffsetByteSize();
default:
llvm_unreachable("DIE Value form not supported yet");
}
@@ -493,12 +493,14 @@ void DIEExpr::print(raw_ostream &O) const { O << "Expr: " << *Expr; }
///
void DIELabel::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
bool IsSectionRelative = Form != dwarf::DW_FORM_addr;
- AP->emitLabelReference(Label, SizeOf(AP, Form), IsSectionRelative);
+ AP->emitLabelReference(Label, sizeOf(AP->getDwarfFormParams(), Form),
+ IsSectionRelative);
}
-/// SizeOf - Determine size of label value in bytes.
+/// sizeOf - Determine size of label value in bytes.
///
-unsigned DIELabel::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+unsigned DIELabel::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_data4:
return 4;
@@ -506,9 +508,9 @@ unsigned DIELabel::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
return 8;
case dwarf::DW_FORM_sec_offset:
case dwarf::DW_FORM_strp:
- return AP->getDwarfOffsetByteSize();
+ return FormParams.getDwarfOffsetByteSize();
case dwarf::DW_FORM_addr:
- return AP->MAI->getCodePointerSize();
+ return FormParams.AddrSize;
default:
llvm_unreachable("DIE Value form not supported yet");
}
@@ -527,7 +529,7 @@ void DIEBaseTypeRef::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
AP->emitULEB128(Offset, nullptr, ULEB128PadSize);
}
-unsigned DIEBaseTypeRef::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+unsigned DIEBaseTypeRef::sizeOf(const dwarf::FormParams &, dwarf::Form) const {
return ULEB128PadSize;
}
@@ -541,19 +543,21 @@ void DIEBaseTypeRef::print(raw_ostream &O) const { O << "BaseTypeRef: " << Index
/// EmitValue - Emit delta value.
///
void DIEDelta::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
- AP->emitLabelDifference(LabelHi, LabelLo, SizeOf(AP, Form));
+ AP->emitLabelDifference(LabelHi, LabelLo,
+ sizeOf(AP->getDwarfFormParams(), Form));
}
/// SizeOf - Determine size of delta value in bytes.
///
-unsigned DIEDelta::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+unsigned DIEDelta::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_data4:
return 4;
case dwarf::DW_FORM_data8:
return 8;
case dwarf::DW_FORM_sec_offset:
- return AP->getDwarfOffsetByteSize();
+ return FormParams.getDwarfOffsetByteSize();
default:
llvm_unreachable("DIE Value form not supported yet");
}
@@ -592,9 +596,10 @@ void DIEString::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
}
}
-/// SizeOf - Determine size of delta value in bytes.
+/// sizeOf - Determine size of delta value in bytes.
///
-unsigned DIEString::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+unsigned DIEString::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
// Index of string in symbol table.
switch (Form) {
case dwarf::DW_FORM_GNU_str_index:
@@ -603,11 +608,11 @@ unsigned DIEString::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
case dwarf::DW_FORM_strx2:
case dwarf::DW_FORM_strx3:
case dwarf::DW_FORM_strx4:
- return DIEInteger(S.getIndex()).SizeOf(AP, Form);
+ return DIEInteger(S.getIndex()).sizeOf(FormParams, Form);
case dwarf::DW_FORM_strp:
- if (AP->MAI->doesDwarfUseRelocationsAcrossSections())
- return DIELabel(S.getSymbol()).SizeOf(AP, Form);
- return DIEInteger(S.getOffset()).SizeOf(AP, Form);
+ if (FormParams.DwarfUsesRelocationsAcrossSections)
+ return DIELabel(S.getSymbol()).sizeOf(FormParams, Form);
+ return DIEInteger(S.getOffset()).sizeOf(FormParams, Form);
default:
llvm_unreachable("Expected valid string form");
}
@@ -630,7 +635,7 @@ void DIEInlineString::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
llvm_unreachable("Expected valid string form");
}
-unsigned DIEInlineString::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+unsigned DIEInlineString::sizeOf(const dwarf::FormParams &, dwarf::Form) const {
// Emit string bytes + NULL byte.
return S.size() + 1;
}
@@ -653,7 +658,8 @@ void DIEEntry::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
case dwarf::DW_FORM_ref2:
case dwarf::DW_FORM_ref4:
case dwarf::DW_FORM_ref8:
- AP->OutStreamer->emitIntValue(Entry->getOffset(), SizeOf(AP, Form));
+ AP->OutStreamer->emitIntValue(Entry->getOffset(),
+ sizeOf(AP->getDwarfFormParams(), Form));
return;
case dwarf::DW_FORM_ref_udata:
@@ -665,11 +671,12 @@ void DIEEntry::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
uint64_t Addr = Entry->getDebugSectionOffset();
if (const MCSymbol *SectionSym =
Entry->getUnit()->getCrossSectionRelativeBaseAddress()) {
- AP->emitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true);
+ AP->emitLabelPlusOffset(SectionSym, Addr,
+ sizeOf(AP->getDwarfFormParams(), Form), true);
return;
}
- AP->OutStreamer->emitIntValue(Addr, SizeOf(AP, Form));
+ AP->OutStreamer->emitIntValue(Addr, sizeOf(AP->getDwarfFormParams(), Form));
return;
}
default:
@@ -677,7 +684,8 @@ void DIEEntry::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
}
}
-unsigned DIEEntry::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+unsigned DIEEntry::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_ref1:
return 1;
@@ -690,15 +698,7 @@ unsigned DIEEntry::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
case dwarf::DW_FORM_ref_udata:
return getULEB128Size(Entry->getOffset());
case dwarf::DW_FORM_ref_addr:
- if (AP->getDwarfVersion() == 2)
- return AP->MAI->getCodePointerSize();
- switch (AP->OutStreamer->getContext().getDwarfFormat()) {
- case dwarf::DWARF32:
- return 4;
- case dwarf::DWARF64:
- return 8;
- }
- llvm_unreachable("Invalid DWARF format");
+ return FormParams.getRefAddrByteSize();
default:
llvm_unreachable("Improper form for DIE reference");
@@ -714,12 +714,10 @@ void DIEEntry::print(raw_ostream &O) const {
// DIELoc Implementation
//===----------------------------------------------------------------------===//
-/// ComputeSize - calculate the size of the location expression.
-///
-unsigned DIELoc::ComputeSize(const AsmPrinter *AP) const {
+unsigned DIELoc::computeSize(const dwarf::FormParams &FormParams) const {
if (!Size) {
for (const auto &V : values())
- Size += V.SizeOf(AP);
+ Size += V.sizeOf(FormParams);
}
return Size;
@@ -743,9 +741,9 @@ void DIELoc::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
V.emitValue(Asm);
}
-/// SizeOf - Determine size of location data in bytes.
+/// sizeOf - Determine size of location data in bytes.
///
-unsigned DIELoc::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+unsigned DIELoc::sizeOf(const dwarf::FormParams &, dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_block1: return Size + sizeof(int8_t);
case dwarf::DW_FORM_block2: return Size + sizeof(int16_t);
@@ -766,12 +764,10 @@ void DIELoc::print(raw_ostream &O) const {
// DIEBlock Implementation
//===----------------------------------------------------------------------===//
-/// ComputeSize - calculate the size of the block.
-///
-unsigned DIEBlock::ComputeSize(const AsmPrinter *AP) const {
+unsigned DIEBlock::computeSize(const dwarf::FormParams &FormParams) const {
if (!Size) {
for (const auto &V : values())
- Size += V.SizeOf(AP);
+ Size += V.sizeOf(FormParams);
}
return Size;
@@ -797,9 +793,9 @@ void DIEBlock::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
V.emitValue(Asm);
}
-/// SizeOf - Determine size of block data in bytes.
+/// sizeOf - Determine size of block data in bytes.
///
-unsigned DIEBlock::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+unsigned DIEBlock::sizeOf(const dwarf::FormParams &, dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_block1: return Size + sizeof(int8_t);
case dwarf::DW_FORM_block2: return Size + sizeof(int16_t);
@@ -820,22 +816,23 @@ void DIEBlock::print(raw_ostream &O) const {
// DIELocList Implementation
//===----------------------------------------------------------------------===//
-unsigned DIELocList::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+unsigned DIELocList::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_loclistx:
return getULEB128Size(Index);
case dwarf::DW_FORM_data4:
- assert(!AP->isDwarf64() &&
+ assert(FormParams.Format != dwarf::DWARF64 &&
"DW_FORM_data4 is not suitable to emit a pointer to a location list "
"in the 64-bit DWARF format");
return 4;
case dwarf::DW_FORM_data8:
- assert(AP->isDwarf64() &&
+ assert(FormParams.Format == dwarf::DWARF64 &&
"DW_FORM_data8 is not suitable to emit a pointer to a location list "
"in the 32-bit DWARF format");
return 8;
case dwarf::DW_FORM_sec_offset:
- return AP->getDwarfOffsetByteSize();
+ return FormParams.getDwarfOffsetByteSize();
default:
llvm_unreachable("DIE Value form not supported yet");
}
@@ -860,9 +857,10 @@ void DIELocList::print(raw_ostream &O) const { O << "LocList: " << Index; }
// DIEAddrOffset Implementation
//===----------------------------------------------------------------------===//
-unsigned DIEAddrOffset::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
- return Addr.SizeOf(AP, dwarf::DW_FORM_addrx) +
- Offset.SizeOf(AP, dwarf::DW_FORM_data4);
+unsigned DIEAddrOffset::sizeOf(const dwarf::FormParams &FormParams,
+ dwarf::Form) const {
+ return Addr.sizeOf(FormParams, dwarf::DW_FORM_addrx) +
+ Offset.sizeOf(FormParams, dwarf::DW_FORM_data4);
}
/// EmitValue - Emit label value.
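
The DIE size computations above now consume a dwarf::FormParams rather than an AsmPrinter. A standalone sketch of what the two size queries used here reduce to, with a hypothetical FormParamsSketch mirroring the fields bundled by getDwarfFormParams (the relocation flag is omitted since it only matters for DIEString):

#include <cstdint>

enum class DwarfFormat { DWARF32, DWARF64 };

struct FormParamsSketch {          // stand-in for dwarf::FormParams
  uint16_t Version;
  uint8_t AddrSize;
  DwarfFormat Format;

  // DW_FORM_sec_offset, DW_FORM_strp, ...: 4 bytes in DWARF32, 8 in DWARF64.
  uint8_t dwarfOffsetByteSize() const {
    return Format == DwarfFormat::DWARF64 ? 8 : 4;
  }
  // DW_FORM_ref_addr was address-sized in DWARF 2; later versions use the
  // section-offset size (the logic the removed DIEEntry code spelled out by
  // hand).
  uint8_t refAddrByteSize() const {
    return Version == 2 ? AddrSize : dwarfOffsetByteSize();
  }
};
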
diff --git a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
index 5f4ee747fcca..e175854f7b93 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -207,6 +207,18 @@ void DIEHash::hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
computeHash(Entry);
}
+void DIEHash::hashRawTypeReference(const DIE &Entry) {
+ unsigned &DieNumber = Numbering[&Entry];
+ if (DieNumber) {
+ addULEB128('R');
+ addULEB128(DieNumber);
+ return;
+ }
+ DieNumber = Numbering.size();
+ addULEB128('T');
+ computeHash(Entry);
+}
+
// Hash all of the values in a block like set of values. This assumes that
// all of the data is going to be added as integers.
void DIEHash::hashBlockData(const DIE::const_value_range &Values) {
@@ -298,10 +310,10 @@ void DIEHash::hashAttribute(const DIEValue &Value, dwarf::Tag Tag) {
addULEB128(Attribute);
addULEB128(dwarf::DW_FORM_block);
if (Value.getType() == DIEValue::isBlock) {
- addULEB128(Value.getDIEBlock().ComputeSize(AP));
+ addULEB128(Value.getDIEBlock().computeSize(AP->getDwarfFormParams()));
hashBlockData(Value.getDIEBlock().values());
} else if (Value.getType() == DIEValue::isLoc) {
- addULEB128(Value.getDIELoc().ComputeSize(AP));
+ addULEB128(Value.getDIELoc().computeSize(AP->getDwarfFormParams()));
hashBlockData(Value.getDIELoc().values());
} else {
// We could add the block length, but that would take
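
hashRawTypeReference above hashes a referenced type DIE in full the first time it is seen ('T' marker plus its contents) and as a numbered back-reference ('R' plus the number) afterwards. A tiny standalone sketch of that numbering scheme, with Node and the output stream as made-up stand-ins for the DIE and the ULEB-based hash input:

#include <cstdint>
#include <map>
#include <vector>

struct Node { uint64_t Payload; };        // hypothetical DIE stand-in
using Stream = std::vector<uint64_t>;     // stands in for the hashed bytes

void hashNodeContents(const Node &N, Stream &Out) { Out.push_back(N.Payload); }

void hashTypeRef(const Node &N, Stream &Out,
                 std::map<const Node *, unsigned> &Numbering) {
  unsigned &Num = Numbering[&N];          // inserts 0 on first sight
  if (Num) {                              // already hashed once before
    Out.push_back('R');
    Out.push_back(Num);
    return;
  }
  Num = Numbering.size();                 // first node gets 1, and so on
  Out.push_back('T');
  hashNodeContents(N, Out);
}
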
diff --git a/llvm/lib/CodeGen/AsmPrinter/DIEHash.h b/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
index 29e1da4c5d60..24a973b39271 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -62,6 +62,8 @@ public:
/// Encodes and adds \param Value to the hash as a SLEB128.
void addSLEB128(int64_t Value);
+ void hashRawTypeReference(const DIE &Entry);
+
private:
/// Adds \param Str to the hash and includes a NULL byte.
void addString(StringRef Str);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 4df34d2c9402..18fc46c74eb4 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -155,7 +155,8 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DIType *Ty) {
if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
- Tag != dwarf::DW_TAG_restrict_type && Tag != dwarf::DW_TAG_atomic_type)
+ Tag != dwarf::DW_TAG_restrict_type && Tag != dwarf::DW_TAG_atomic_type &&
+ Tag != dwarf::DW_TAG_immutable_type)
return DDTy->getSizeInBits();
DIType *BaseType = DDTy->getBaseType();
@@ -210,7 +211,8 @@ bool DebugHandlerBase::isUnsignedDIType(const DIType *Ty) {
return true;
assert(T == dwarf::DW_TAG_typedef || T == dwarf::DW_TAG_const_type ||
T == dwarf::DW_TAG_volatile_type ||
- T == dwarf::DW_TAG_restrict_type || T == dwarf::DW_TAG_atomic_type);
+ T == dwarf::DW_TAG_restrict_type || T == dwarf::DW_TAG_atomic_type ||
+ T == dwarf::DW_TAG_immutable_type);
assert(DTy->getBaseType() && "Expected valid base type");
return isUnsignedDIType(DTy->getBaseType());
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 9b73f0ab2f05..5913c687db48 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -127,9 +127,14 @@ unsigned DwarfCompileUnit::getOrCreateSourceID(const DIFile *File) {
if (!File)
return Asm->OutStreamer->emitDwarfFileDirective(0, "", "", None, None,
CUID);
- return Asm->OutStreamer->emitDwarfFileDirective(
- 0, File->getDirectory(), File->getFilename(), DD->getMD5AsBytes(File),
- File->getSource(), CUID);
+
+ if (LastFile != File) {
+ LastFile = File;
+ LastFileID = Asm->OutStreamer->emitDwarfFileDirective(
+ 0, File->getDirectory(), File->getFilename(), DD->getMD5AsBytes(File),
+ File->getSource(), CUID);
+ }
+ return LastFileID;
}
DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
@@ -260,9 +265,20 @@ void DwarfCompileUnit::addLocationAttribute(
if (Global) {
const MCSymbol *Sym = Asm->getSymbol(Global);
- unsigned PointerSize = Asm->getDataLayout().getPointerSize();
- assert((PointerSize == 4 || PointerSize == 8) &&
- "Add support for other sizes if necessary");
+ // 16-bit platforms like MSP430 and AVR take this path, so sink this
+ // assert to platforms that use it.
+ auto GetPointerSizedFormAndOp = [this]() {
+ unsigned PointerSize = Asm->getDataLayout().getPointerSize();
+ assert((PointerSize == 4 || PointerSize == 8) &&
+ "Add support for other sizes if necessary");
+ struct FormAndOp {
+ dwarf::Form Form;
+ dwarf::LocationAtom Op;
+ };
+ return PointerSize == 4
+ ? FormAndOp{dwarf::DW_FORM_data4, dwarf::DW_OP_const4u}
+ : FormAndOp{dwarf::DW_FORM_data8, dwarf::DW_OP_const8u};
+ };
if (Global->isThreadLocal()) {
if (Asm->TM.useEmulatedTLS()) {
// TODO: add debug info for emulated thread local mode.
@@ -270,15 +286,12 @@ void DwarfCompileUnit::addLocationAttribute(
// FIXME: Make this work with -gsplit-dwarf.
// Based on GCC's support for TLS:
if (!DD->useSplitDwarf()) {
+ auto FormAndOp = GetPointerSizedFormAndOp();
// 1) Start with a constNu of the appropriate pointer size
- addUInt(*Loc, dwarf::DW_FORM_data1,
- PointerSize == 4 ? dwarf::DW_OP_const4u
- : dwarf::DW_OP_const8u);
+ addUInt(*Loc, dwarf::DW_FORM_data1, FormAndOp.Op);
// 2) containing the (relocated) offset of the TLS variable
// within the module's TLS block.
- addExpr(*Loc,
- PointerSize == 4 ? dwarf::DW_FORM_data4
- : dwarf::DW_FORM_data8,
+ addExpr(*Loc, FormAndOp.Form,
Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
} else {
addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
@@ -292,13 +305,11 @@ void DwarfCompileUnit::addLocationAttribute(
}
} else if (Asm->TM.getRelocationModel() == Reloc::RWPI ||
Asm->TM.getRelocationModel() == Reloc::ROPI_RWPI) {
+ auto FormAndOp = GetPointerSizedFormAndOp();
// Constant
- addUInt(*Loc, dwarf::DW_FORM_data1,
- PointerSize == 4 ? dwarf::DW_OP_const4u
- : dwarf::DW_OP_const8u);
+ addUInt(*Loc, dwarf::DW_FORM_data1, FormAndOp.Op);
// Relocation offset
- addExpr(*Loc, PointerSize == 4 ? dwarf::DW_FORM_data4
- : dwarf::DW_FORM_data8,
+ addExpr(*Loc, FormAndOp.Form,
Asm->getObjFileLowering().getIndirectSymViaRWPI(Sym));
// Base register
Register BaseReg = Asm->getObjFileLowering().getStaticBase();
@@ -1575,7 +1586,8 @@ void DwarfCompileUnit::createBaseTypeDIEs() {
Twine(dwarf::AttributeEncodingString(Btr.Encoding) +
"_" + Twine(Btr.BitSize)).toStringRef(Str));
addUInt(Die, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1, Btr.Encoding);
- addUInt(Die, dwarf::DW_AT_byte_size, None, Btr.BitSize / 8);
+ // Round up to smallest number of bytes that contains this number of bits.
+ addUInt(Die, dwarf::DW_AT_byte_size, None, divideCeil(Btr.BitSize, 8));
Btr.Die = &Die;
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index fb03982b5e4a..f2e1f6346803 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -86,6 +86,9 @@ class DwarfCompileUnit final : public DwarfUnit {
/// DWO ID for correlating skeleton and split units.
uint64_t DWOId = 0;
+ const DIFile *LastFile = nullptr;
+ unsigned LastFileID;
+
/// Construct a DIE for the given DbgVariable without initializing the
/// DbgVariable's DIE reference.
DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 48134f1fd774..680b9586228f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2539,12 +2539,10 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
if (Op.getDescription().Op[I] == Encoding::SizeNA)
continue;
if (Op.getDescription().Op[I] == Encoding::BaseTypeRef) {
- uint64_t Offset =
- CU->ExprRefedBaseTypes[Op.getRawOperand(I)].Die->getOffset();
- assert(Offset < (1ULL << (ULEB128PadSize * 7)) && "Offset wont fit");
- Streamer.emitULEB128(Offset, "", ULEB128PadSize);
+ unsigned Length =
+ Streamer.emitDIERef(*CU->ExprRefedBaseTypes[Op.getRawOperand(I)].Die);
// Make sure comments stay aligned.
- for (unsigned J = 0; J < ULEB128PadSize; ++J)
+ for (unsigned J = 0; J < Length; ++J)
if (Comment != End)
Comment++;
} else {
@@ -3369,7 +3367,8 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
// Fast path if we're building some type units and one has already used the
// address pool we know we're going to throw away all this work anyway, so
// don't bother building dependent types.
- if (!TypeUnitsUnderConstruction.empty() && AddrPool.hasBeenUsed())
+ if (!TypeUnitsUnderConstruction.empty() &&
+ (AddrPool.hasBeenUsed() || SeenLocalType))
return;
auto Ins = TypeSignatures.insert(std::make_pair(CTy, 0));
@@ -3380,6 +3379,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
bool TopLevelType = TypeUnitsUnderConstruction.empty();
AddrPool.resetUsedFlag();
+ SeenLocalType = false;
auto OwnedUnit = std::make_unique<DwarfTypeUnit>(CU, Asm, this, &InfoHolder,
getDwoLineTable(CU));
@@ -3423,7 +3423,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
// Types referencing entries in the address table cannot be placed in type
// units.
- if (AddrPool.hasBeenUsed()) {
+ if (AddrPool.hasBeenUsed() || SeenLocalType) {
// Remove all the types built while building this type.
// This is pessimistic as some of these types might not be dependent on
@@ -3451,14 +3451,18 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
DwarfDebug::NonTypeUnitContext::NonTypeUnitContext(DwarfDebug *DD)
: DD(DD),
- TypeUnitsUnderConstruction(std::move(DD->TypeUnitsUnderConstruction)), AddrPoolUsed(DD->AddrPool.hasBeenUsed()) {
+ TypeUnitsUnderConstruction(std::move(DD->TypeUnitsUnderConstruction)),
+ AddrPoolUsed(DD->AddrPool.hasBeenUsed()),
+ SeenLocalType(DD->SeenLocalType) {
DD->TypeUnitsUnderConstruction.clear();
DD->AddrPool.resetUsedFlag();
+ DD->SeenLocalType = false;
}
DwarfDebug::NonTypeUnitContext::~NonTypeUnitContext() {
DD->TypeUnitsUnderConstruction = std::move(TypeUnitsUnderConstruction);
DD->AddrPool.resetUsedFlag(AddrPoolUsed);
+ DD->SeenLocalType = SeenLocalType;
}
DwarfDebug::NonTypeUnitContext DwarfDebug::enterNonTypeUnitContext() {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 4e1a1b1e068d..0043000652e8 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -433,6 +433,7 @@ private:
DenseMap<const DIStringType *, unsigned> StringTypeLocMap;
AddressPool AddrPool;
+ bool SeenLocalType = false;
/// Accelerator tables.
AccelTable<DWARF5AccelTableData> AccelDebugNames;
@@ -671,6 +672,7 @@ public:
DwarfDebug *DD;
decltype(DwarfDebug::TypeUnitsUnderConstruction) TypeUnitsUnderConstruction;
bool AddrPoolUsed;
+ bool SeenLocalType;
friend class DwarfDebug;
NonTypeUnitContext(DwarfDebug *DD);
public:
@@ -679,6 +681,7 @@ public:
};
NonTypeUnitContext enterNonTypeUnitContext();
+ void seenLocalType() { SeenLocalType = true; }
/// Add a label so that arange data can be generated for it.
void addArangeLabel(SymbolCU SCU) { ArangeLabels.push_back(SCU); }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfException.h b/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
index 40898c9fc855..4defa8a30855 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -98,6 +98,8 @@ class LLVM_LIBRARY_VISIBILITY AIXException : public DwarfCFIExceptionBase {
public:
AIXException(AsmPrinter *A);
+ void markFunctionEnd() override;
+
void endModule() override {}
void beginFunction(const MachineFunction *MF) override {}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index 37407c98e75f..ee932d105107 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -681,9 +681,25 @@ void DwarfExpression::emitLegacySExt(unsigned FromBits) {
}
void DwarfExpression::emitLegacyZExt(unsigned FromBits) {
- // (X & (1 << FromBits - 1))
- emitOp(dwarf::DW_OP_constu);
- emitUnsigned((1ULL << FromBits) - 1);
+ // Heuristic to decide the most efficient encoding.
+ // A ULEB can encode 7 1-bits per byte.
+ if (FromBits / 7 < 1+1+1+1+1) {
+ // (X & (1 << FromBits - 1))
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned((1ULL << FromBits) - 1);
+ } else {
+ // Note that the DWARF 4 stack consists of pointer-sized elements,
+ // so technically it doesn't make sense to shift left more than 64
+ // bits. We leave that for the consumer to decide though. LLDB for
+ // example uses APInt for the stack elements and can still deal
+ // with this.
+ emitOp(dwarf::DW_OP_lit1);
+ emitOp(dwarf::DW_OP_constu);
+ emitUnsigned(FromBits);
+ emitOp(dwarf::DW_OP_shl);
+ emitOp(dwarf::DW_OP_lit1);
+ emitOp(dwarf::DW_OP_minus);
+ }
emitOp(dwarf::DW_OP_and);
}
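
A quick back-of-the-envelope check of the threshold above (FromBits / 7 < 5): the mask form costs one opcode byte plus a ULEB128 of (1 << FromBits) - 1, i.e. ceil(FromBits / 7) bytes, while the shift-based form is a fixed six bytes for any width below 128. A small standalone program comparing the two (not LLVM code):

#include <cstdio>
#include <initializer_list>

unsigned maskFormSize(unsigned FromBits) {
  return 1 + (FromBits + 6) / 7;      // DW_OP_constu + ULEB128 of the mask
}

unsigned shiftFormSize(unsigned FromBits) {
  // DW_OP_lit1, DW_OP_constu, ULEB128(FromBits), DW_OP_shl, DW_OP_lit1,
  // DW_OP_minus -- the ULEB128 is one byte while FromBits < 128.
  return 5 + (FromBits < 128 ? 1 : 2);
}

int main() {
  for (unsigned Bits : {8u, 16u, 32u, 34u, 35u, 64u})
    std::printf("FromBits=%u  mask=%u  shift=%u  -> %s form\n", Bits,
                maskFormSize(Bits), shiftFormSize(Bits),
                Bits / 7 < 5 ? "mask" : "shift");
}
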
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 838e1c9a10be..a67d0f032cf6 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -92,7 +92,8 @@ unsigned DwarfFile::computeSizeAndOffsetsForUnit(DwarfUnit *TheU) {
// Compute the size and offset of a DIE. The offset is relative to start of the
// CU. It returns the offset after laying out the DIE.
unsigned DwarfFile::computeSizeAndOffset(DIE &Die, unsigned Offset) {
- return Die.computeOffsetsAndAbbrevs(Asm, Abbrevs, Offset);
+ return Die.computeOffsetsAndAbbrevs(Asm->getDwarfFormParams(), Abbrevs,
+ Offset);
}
void DwarfFile::emitAbbrevs(MCSection *Section) { Abbrevs.Emit(Asm, Section); }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 6b6d63f14f87..15d90c54adfc 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -77,7 +77,7 @@ void DIEDwarfExpression::enableTemporaryBuffer() {
void DIEDwarfExpression::disableTemporaryBuffer() { IsBuffering = false; }
unsigned DIEDwarfExpression::getTemporaryBufferSize() {
- return TmpDIE.ComputeSize(&AP);
+ return TmpDIE.computeSize(AP.getDwarfFormParams());
}
void DIEDwarfExpression::commitTemporaryBuffer() { OutDIE.takeValues(TmpDIE); }
@@ -394,14 +394,14 @@ DIE &DwarfUnit::createAndAddDIE(dwarf::Tag Tag, DIE &Parent, const DINode *N) {
}
void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Loc) {
- Loc->ComputeSize(Asm);
+ Loc->computeSize(Asm->getDwarfFormParams());
DIELocs.push_back(Loc); // Memoize so we can call the destructor later on.
addAttribute(Die, Attribute, Loc->BestForm(DD->getDwarfVersion()), Loc);
}
void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form,
DIEBlock *Block) {
- Block->ComputeSize(Asm);
+ Block->computeSize(Asm->getDwarfFormParams());
DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on.
addAttribute(Die, Attribute, Form, Block);
}
@@ -597,10 +597,8 @@ DIE *DwarfUnit::createTypeDIE(const DIScope *Context, DIE &ContextDIE,
// Skip updating the accelerator tables since this is not the full type.
if (MDString *TypeId = CTy->getRawIdentifier())
DD->addDwarfTypeUnitType(getCU(), TypeId->getString(), TyDIE, CTy);
- else {
- auto X = DD->enterNonTypeUnitContext();
+ else
finishNonUnitTypeDIE(TyDIE, CTy);
- }
return &TyDIE;
}
constructTypeDIE(TyDIE, CTy);
@@ -744,6 +742,16 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIStringType *STy) {
addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size);
}
+ if (DIExpression *Expr = STy->getStringLocationExp()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
+ // This is to describe the memory location of the
+ // string, so lock it down as such.
+ DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.addExpression(Expr);
+ addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize());
+ }
+
if (STy->getEncoding()) {
// For eventual Unicode support.
addUInt(Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
@@ -1189,7 +1197,7 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP,
DefinitionArgs = SP->getType()->getTypeArray();
if (DeclArgs.size() && DefinitionArgs.size())
- if (DefinitionArgs[0] != NULL && DeclArgs[0] != DefinitionArgs[0])
+ if (DefinitionArgs[0] != nullptr && DeclArgs[0] != DefinitionArgs[0])
addType(SPDie, DefinitionArgs[0]);
DeclDie = getDIE(SPDecl);
@@ -1842,5 +1850,25 @@ void DwarfTypeUnit::finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) {
StringRef Name = CTy->getName();
if (!Name.empty())
addString(D, dwarf::DW_AT_name, Name);
+ if (Name.startswith("_STN") || !Name.contains('<'))
+ addTemplateParams(D, CTy->getTemplateParams());
+ // If the type is in an anonymous namespace, we can't reference it from a TU
+ // (since the type would be CU local and the TU doesn't specify which TU has
+ // the appropriate type definition) - so flag this emission as such and skip
+ // the rest of the emission now since we're going to throw out all this work
+ // and put the outer/referencing type in the CU instead.
+ // FIXME: Probably good to generalize this to a DICompositeType flag populated
+ // by the frontend, then we could use that to have types that can have
+ // decl+def merged by LTO but where the definition still doesn't go in a type
+ // unit because the type has only one definition.
+ for (DIScope *S = CTy->getScope(); S; S = S->getScope()) {
+ if (auto *NS = dyn_cast<DINamespace>(S)) {
+ if (NS->getName().empty()) {
+ DD->seenLocalType();
+ break;
+ }
+ }
+ }
+ auto X = DD->enterNonTypeUnitContext();
getCU().createTypeDIE(CTy);
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 54b0079dd7ce..330f3bacca43 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -25,9 +25,7 @@ namespace llvm {
class ConstantFP;
class ConstantInt;
-class DbgVariable;
class DwarfCompileUnit;
-class MachineOperand;
class MCDwarfDwoLineTable;
class MCSymbol;
diff --git a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
index 7d5e51218693..a92a89084cad 100644
--- a/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
+++ b/llvm/lib/CodeGen/AsmPrinter/PseudoProbePrinter.h
@@ -19,8 +19,6 @@
namespace llvm {
class AsmPrinter;
-class MCStreamer;
-class Module;
class DILocation;
class PseudoProbeHandler : public AsmPrinterHandler {
diff --git a/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp b/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp
index 1e3f33e70715..ad8432343a60 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/WinCFGuard.cpp
@@ -27,7 +27,7 @@
using namespace llvm;
-WinCFGuard::WinCFGuard(AsmPrinter *A) : AsmPrinterHandler(), Asm(A) {}
+WinCFGuard::WinCFGuard(AsmPrinter *A) : Asm(A) {}
WinCFGuard::~WinCFGuard() {}
diff --git a/llvm/lib/CodeGen/BranchFolding.h b/llvm/lib/CodeGen/BranchFolding.h
index 2a4ea92a92aa..95d5dcfbbd0f 100644
--- a/llvm/lib/CodeGen/BranchFolding.h
+++ b/llvm/lib/CodeGen/BranchFolding.h
@@ -23,7 +23,6 @@ class BasicBlock;
class MachineBranchProbabilityInfo;
class MachineFunction;
class MachineLoopInfo;
-class MachineModuleInfo;
class MachineRegisterInfo;
class MBFIWrapper;
class ProfileSummaryInfo;
diff --git a/llvm/lib/CodeGen/CFIInstrInserter.cpp b/llvm/lib/CodeGen/CFIInstrInserter.cpp
index 1c2e3f998449..de173a9dfd62 100644
--- a/llvm/lib/CodeGen/CFIInstrInserter.cpp
+++ b/llvm/lib/CodeGen/CFIInstrInserter.cpp
@@ -347,7 +347,7 @@ bool CFIInstrInserter::insertCFIInstrs(MachineFunction &MF) {
}
if (ForceFullCFA) {
- MF.getSubtarget().getFrameLowering()->emitCalleeSavedFrameMoves(
+ MF.getSubtarget().getFrameLowering()->emitCalleeSavedFrameMovesFullCFA(
*MBBInfo.MBB, MBBI);
InsertedCFIInstr = true;
PrevMBBInfo = &MBBInfo;
diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
index 5f9982cd155d..84a0e4142bb6 100644
--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -43,9 +43,9 @@ void VirtRegAuxInfo::calculateSpillWeightsAndHints() {
}
// Return the preferred allocation register for reg, given a COPY instruction.
-static Register copyHint(const MachineInstr *MI, unsigned Reg,
- const TargetRegisterInfo &TRI,
- const MachineRegisterInfo &MRI) {
+Register VirtRegAuxInfo::copyHint(const MachineInstr *MI, unsigned Reg,
+ const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI) {
unsigned Sub, HSub;
Register HReg;
if (MI->getOperand(0).getReg() == Reg) {
@@ -77,9 +77,10 @@ static Register copyHint(const MachineInstr *MI, unsigned Reg,
}
// Check if all values in LI are rematerializable
-static bool isRematerializable(const LiveInterval &LI, const LiveIntervals &LIS,
- const VirtRegMap &VRM,
- const TargetInstrInfo &TII) {
+bool VirtRegAuxInfo::isRematerializable(const LiveInterval &LI,
+ const LiveIntervals &LIS,
+ const VirtRegMap &VRM,
+ const TargetInstrInfo &TII) {
Register Reg = LI.reg();
Register Original = VRM.getOriginal(Reg);
for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 747f4e4fdecc..28f24e5ea908 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -4168,11 +4168,11 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
// We can get through binary operator, if it is legal. In other words, the
// binary operator must have a nuw or nsw flag.
- const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
- if (isa_and_nonnull<OverflowingBinaryOperator>(BinOp) &&
- ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
- (IsSExt && BinOp->hasNoSignedWrap())))
- return true;
+ if (const auto *BinOp = dyn_cast<BinaryOperator>(Inst))
+ if (isa<OverflowingBinaryOperator>(BinOp) &&
+ ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
+ (IsSExt && BinOp->hasNoSignedWrap())))
+ return true;
// ext(and(opnd, cst)) --> and(ext(opnd), ext(cst))
if ((Inst->getOpcode() == Instruction::And ||
@@ -4181,10 +4181,10 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
// ext(xor(opnd, cst)) --> xor(ext(opnd), ext(cst))
if (Inst->getOpcode() == Instruction::Xor) {
- const ConstantInt *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1));
// Make sure it is not a NOT.
- if (Cst && !Cst->getValue().isAllOnes())
- return true;
+ if (const auto *Cst = dyn_cast<ConstantInt>(Inst->getOperand(1)))
+ if (!Cst->getValue().isAllOnes())
+ return true;
}
// zext(shrl(opnd, cst)) --> shrl(zext(opnd), zext(cst))
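
Aside (not part of the patch): the nuw/nsw requirement above exists because an extension only distributes over a binary operator when the narrow operation cannot wrap. A minimal standalone C++ illustration of the unsigned case, using plain integers rather than LLVM types:

#include <cassert>
#include <cstdint>

int main() {
  // Wrapping case: zext(add a, b) differs from add(zext a, zext b).
  uint8_t A = 200, B = 100;                           // 8-bit sum wraps to 44
  assert(uint16_t(uint8_t(A + B)) != uint16_t(A) + uint16_t(B));

  // Non-wrapping case (the "nuw" situation): the two forms agree.
  uint8_t C = 20, D = 30;
  assert(uint16_t(uint8_t(C + D)) == uint16_t(C) + uint16_t(D));
  return 0;
}
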
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index 3bed81d5841d..1d50e1d22b95 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -90,7 +90,6 @@ CGOPT(bool, EnableAddrsig)
CGOPT(bool, EmitCallSiteInfo)
CGOPT(bool, EnableMachineFunctionSplitter)
CGOPT(bool, EnableDebugEntryValues)
-CGOPT_EXP(bool, ValueTrackingVariableLocations)
CGOPT(bool, ForceDwarfFrameSection)
CGOPT(bool, XRayOmitFunctionIndex)
CGOPT(bool, DebugStrictDwarf)
@@ -433,12 +432,6 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
cl::init(false));
CGBINDOPT(EnableDebugEntryValues);
- static cl::opt<bool> ValueTrackingVariableLocations(
- "experimental-debug-variable-locations",
- cl::desc("Use experimental new value-tracking variable locations"),
- cl::init(false));
- CGBINDOPT(ValueTrackingVariableLocations);
-
static cl::opt<bool> EnableMachineFunctionSplitter(
"split-machine-functions",
cl::desc("Split out cold basic blocks from machine functions based on "
@@ -539,12 +532,6 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
Options.DebugStrictDwarf = getDebugStrictDwarf();
Options.LoopAlignment = getAlignLoops();
- if (auto Opt = getExplicitValueTrackingVariableLocations())
- Options.ValueTrackingVariableLocations = *Opt;
- else
- Options.ValueTrackingVariableLocations =
- getDefaultValueTrackingVariableLocations(TheTriple);
-
Options.MCOptions = mc::InitMCTargetOptionsFromFlags();
Options.ThreadModel = getThreadModel();
@@ -620,7 +607,7 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
Function &F) {
auto &Ctx = F.getContext();
AttributeList Attrs = F.getAttributes();
- AttrBuilder NewAttrs;
+ AttrBuilder NewAttrs(Ctx);
if (!CPU.empty() && !F.hasFnAttribute("target-cpu"))
NewAttrs.addAttribute("target-cpu", CPU);
@@ -698,8 +685,3 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
setFunctionAttributes(CPU, Features, F);
}
-bool codegen::getDefaultValueTrackingVariableLocations(const llvm::Triple &T) {
- if (T.getArch() == llvm::Triple::x86_64)
- return true;
- return false;
-}
diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 901409ea9f8f..eb2d449bc4af 100644
--- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -40,8 +40,7 @@ using namespace llvm;
CriticalAntiDepBreaker::CriticalAntiDepBreaker(MachineFunction &MFi,
const RegisterClassInfo &RCI)
- : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()),
- TII(MF.getSubtarget().getInstrInfo()),
+ : MF(MFi), MRI(MF.getRegInfo()), TII(MF.getSubtarget().getInstrInfo()),
TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RCI),
Classes(TRI->getNumRegs(), nullptr), KillIndices(TRI->getNumRegs(), 0),
DefIndices(TRI->getNumRegs(), 0), KeepRegs(TRI->getNumRegs(), false) {}
diff --git a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
index 7300ea6b50ee..d9caa8ad42d0 100644
--- a/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -68,9 +68,16 @@ void ExpandPostRA::TransferImplicitOperands(MachineInstr *MI) {
MachineBasicBlock::iterator CopyMI = MI;
--CopyMI;
- for (const MachineOperand &MO : MI->implicit_operands())
- if (MO.isReg())
- CopyMI->addOperand(MO);
+ Register DstReg = MI->getOperand(0).getReg();
+ for (const MachineOperand &MO : MI->implicit_operands()) {
+ CopyMI->addOperand(MO);
+
+ // Be conservative about preserving kills when subregister defs are
+ // involved. If there was an implicit kill of a super-register overlapping
+ // the copy result, we would kill the subregisters that previous copies
+ // defined.
+ if (MO.isKill() && TRI->regsOverlap(DstReg, MO.getReg()))
+ CopyMI->getOperand(CopyMI->getNumOperands() - 1).setIsKill(false);
+ }
}
bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
index 2676becdd807..1a642e233a6a 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -191,10 +191,10 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
assert(DstOps.size() == 1 && "Invalid dsts");
if (SrcOps[0].getLLTTy(*getMRI()).isVector()) {
// Try to constant fold vector constants.
- auto VecCst = ConstantFoldVectorBinop(
+ Register VecCst = ConstantFoldVectorBinop(
Opc, SrcOps[0].getReg(), SrcOps[1].getReg(), *getMRI(), *this);
if (VecCst)
- return MachineInstrBuilder(getMF(), *VecCst);
+ return buildCopy(DstOps[0], VecCst);
break;
}
if (Optional<APInt> Cst = ConstantFoldBinOp(Opc, SrcOps[0].getReg(),
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index d061664e8c5d..1ec7868f2234 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -86,6 +86,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
CallLoweringInfo Info;
const DataLayout &DL = MIRBuilder.getDataLayout();
MachineFunction &MF = MIRBuilder.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
bool CanBeTailCalled = CB.isTailCall() &&
isInTailCallPosition(CB, MF.getTarget()) &&
(MF.getFunction()
@@ -109,6 +110,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
CanBeTailCalled = false;
}
+
// First step is to marshall all the function's parameters into the correct
// physregs and memory locations. Gather the sequence of argument types that
// we'll pass to the assigner function.
@@ -136,10 +138,23 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
else
Info.Callee = MachineOperand::CreateReg(GetCalleeReg(), false);
+ Register ReturnHintAlignReg;
+ Align ReturnHintAlign;
+
Info.OrigRet = ArgInfo{ResRegs, RetTy, 0, ISD::ArgFlagsTy{}};
- if (!Info.OrigRet.Ty->isVoidTy())
+
+ if (!Info.OrigRet.Ty->isVoidTy()) {
setArgFlags(Info.OrigRet, AttributeList::ReturnIndex, DL, CB);
+ if (MaybeAlign Alignment = CB.getRetAlign()) {
+ if (*Alignment > Align(1)) {
+ ReturnHintAlignReg = MRI.cloneVirtualRegister(ResRegs[0]);
+ Info.OrigRet.Regs[0] = ReturnHintAlignReg;
+ ReturnHintAlign = *Alignment;
+ }
+ }
+ }
+
Info.CB = &CB;
Info.KnownCallees = CB.getMetadata(LLVMContext::MD_callees);
Info.CallConv = CallConv;
@@ -147,7 +162,15 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
Info.IsMustTailCall = CB.isMustTailCall();
Info.IsTailCall = CanBeTailCalled;
Info.IsVarArg = IsVarArg;
- return lowerCall(MIRBuilder, Info);
+ if (!lowerCall(MIRBuilder, Info))
+ return false;
+
+ if (ReturnHintAlignReg && !Info.IsTailCall) {
+ MIRBuilder.buildAssertAlign(ResRegs[0], ReturnHintAlignReg,
+ ReturnHintAlign);
+ }
+
+ return true;
}
template <typename FuncInfoTy>
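
Aside (not part of the patch): the alignment recorded here comes from a return-alignment attribute at the call site. A hedged, standalone C++ sketch of the source-level situation that produces such a hint, using the GCC/Clang assume_aligned attribute; the function name is made up for illustration:

#include <cassert>
#include <cstdint>
#include <cstdlib>

// Hypothetical callee whose returned pointer is declared 16-byte aligned.
__attribute__((assume_aligned(16))) static void *make_buffer() {
  return std::aligned_alloc(16, 64);
}

int main() {
  void *P = make_buffer();
  // The call site carries a return alignment of 16, so the low four bits of
  // the returned address may be assumed to be zero, which is the fact that
  // the alignment assertion preserves after the call is lowered.
  assert((reinterpret_cast<std::uintptr_t>(P) & 15u) == 0);
  std::free(P);
  return 0;
}
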
@@ -509,7 +532,8 @@ static void buildCopyToRegs(MachineIRBuilder &B, ArrayRef<Register> DstRegs,
bool CallLowering::determineAndHandleAssignments(
ValueHandler &Handler, ValueAssigner &Assigner,
SmallVectorImpl<ArgInfo> &Args, MachineIRBuilder &MIRBuilder,
- CallingConv::ID CallConv, bool IsVarArg, Register ThisReturnReg) const {
+ CallingConv::ID CallConv, bool IsVarArg,
+ ArrayRef<Register> ThisReturnRegs) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
SmallVector<CCValAssign, 16> ArgLocs;
@@ -519,7 +543,7 @@ bool CallLowering::determineAndHandleAssignments(
return false;
return handleAssignments(Handler, Args, CCInfo, ArgLocs, MIRBuilder,
- ThisReturnReg);
+ ThisReturnRegs);
}
static unsigned extendOpFromFlags(llvm::ISD::ArgFlagsTy Flags) {
@@ -596,7 +620,7 @@ bool CallLowering::handleAssignments(ValueHandler &Handler,
CCState &CCInfo,
SmallVectorImpl<CCValAssign> &ArgLocs,
MachineIRBuilder &MIRBuilder,
- Register ThisReturnReg) const {
+ ArrayRef<Register> ThisReturnRegs) const {
MachineFunction &MF = MIRBuilder.getMF();
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
@@ -740,10 +764,10 @@ bool CallLowering::handleAssignments(ValueHandler &Handler,
assert(!VA.needsCustom() && "custom loc should have been handled already");
- if (i == 0 && ThisReturnReg.isValid() &&
+ if (i == 0 && !ThisReturnRegs.empty() &&
Handler.isIncomingArgumentHandler() &&
isTypeIsValidForThisReturn(ValVT)) {
- Handler.assignValueToReg(Args[i].Regs[i], ThisReturnReg, VA);
+ Handler.assignValueToReg(ArgReg, ThisReturnRegs[Part], VA);
continue;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
index dd1ef74e8ad0..30f8838805b5 100644
--- a/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Combiner.cpp
@@ -56,8 +56,7 @@ class WorkListMaintainer : public GISelChangeObserver {
SmallPtrSet<const MachineInstr *, 4> CreatedInstrs;
public:
- WorkListMaintainer(WorkListTy &WorkList)
- : GISelChangeObserver(), WorkList(WorkList) {}
+ WorkListMaintainer(WorkListTy &WorkList) : WorkList(WorkList) {}
virtual ~WorkListMaintainer() {
}
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index f7a634dad61a..d6a009744161 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1748,6 +1748,20 @@ void CombinerHelper::applyCombineUnmergeConstant(MachineInstr &MI,
MI.eraseFromParent();
}
+bool CombinerHelper::matchCombineUnmergeUndef(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ unsigned SrcIdx = MI.getNumOperands() - 1;
+ Register SrcReg = MI.getOperand(SrcIdx).getReg();
+ MatchInfo = [&MI](MachineIRBuilder &B) {
+ unsigned NumElems = MI.getNumOperands() - 1;
+ for (unsigned Idx = 0; Idx < NumElems; ++Idx) {
+ Register DstReg = MI.getOperand(Idx).getReg();
+ B.buildUndef(DstReg);
+ }
+ };
+ return isa<GImplicitDef>(MRI.getVRegDef(SrcReg));
+}
+
bool CombinerHelper::matchCombineUnmergeWithDeadLanesToTrunc(MachineInstr &MI) {
assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
"Expected an unmerge");
@@ -2025,16 +2039,19 @@ void CombinerHelper::applyCombineAddP2IToPtrAdd(
}
bool CombinerHelper::matchCombineConstPtrAddToI2P(MachineInstr &MI,
- int64_t &NewCst) {
+ APInt &NewCst) {
auto &PtrAdd = cast<GPtrAdd>(MI);
Register LHS = PtrAdd.getBaseReg();
Register RHS = PtrAdd.getOffsetReg();
MachineRegisterInfo &MRI = Builder.getMF().getRegInfo();
- if (auto RHSCst = getIConstantVRegSExtVal(RHS, MRI)) {
- int64_t Cst;
+ if (auto RHSCst = getIConstantVRegVal(RHS, MRI)) {
+ APInt Cst;
if (mi_match(LHS, MRI, m_GIntToPtr(m_ICst(Cst)))) {
- NewCst = Cst + *RHSCst;
+ auto DstTy = MRI.getType(PtrAdd.getReg(0));
+ // G_INTTOPTR uses zero-extension
+ NewCst = Cst.zextOrTrunc(DstTy.getSizeInBits());
+ NewCst += RHSCst->sextOrTrunc(DstTy.getSizeInBits());
return true;
}
}
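
A standalone arithmetic sketch (not from the patch) of the fold above: the inttoptr input is zero-extended to pointer width while the ptr_add offset is sign-extended, so the two constants combine as zext(C) + sext(Off). Plain fixed-width integers stand in for APInt here:

#include <cassert>
#include <cstdint>

int main() {
  int16_t C = -1;   // constant fed to G_INTTOPTR from a 16-bit register
  int32_t Off = -4; // ptr_add offset
  // 32-bit pointer width: zero-extend C, sign-extend Off, add modulo 2^32.
  uint32_t NewCst = uint32_t(uint16_t(C)) + uint32_t(Off);
  assert(NewCst == 0xFFFBu); // 0x0000FFFF - 4
  return 0;
}
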
@@ -2043,7 +2060,7 @@ bool CombinerHelper::matchCombineConstPtrAddToI2P(MachineInstr &MI,
}
void CombinerHelper::applyCombineConstPtrAddToI2P(MachineInstr &MI,
- int64_t &NewCst) {
+ APInt &NewCst) {
auto &PtrAdd = cast<GPtrAdd>(MI);
Register Dst = PtrAdd.getReg(0);
@@ -3875,39 +3892,48 @@ bool CombinerHelper::matchOrShiftToFunnelShift(MachineInstr &MI,
LLT Ty = MRI.getType(Dst);
unsigned BitWidth = Ty.getScalarSizeInBits();
- Register ShlSrc, ShlAmt, LShrSrc, LShrAmt;
+ Register ShlSrc, ShlAmt, LShrSrc, LShrAmt, Amt;
unsigned FshOpc = 0;
- // Match (or (shl x, amt), (lshr y, sub(bw, amt))).
- if (mi_match(
- Dst, MRI,
- // m_GOr() handles the commuted version as well.
- m_GOr(m_GShl(m_Reg(ShlSrc), m_Reg(ShlAmt)),
- m_GLShr(m_Reg(LShrSrc), m_GSub(m_SpecificICstOrSplat(BitWidth),
- m_Reg(LShrAmt)))))) {
+ // Match (or (shl ...), (lshr ...)).
+ if (!mi_match(Dst, MRI,
+ // m_GOr() handles the commuted version as well.
+ m_GOr(m_GShl(m_Reg(ShlSrc), m_Reg(ShlAmt)),
+ m_GLShr(m_Reg(LShrSrc), m_Reg(LShrAmt)))))
+ return false;
+
+ // Given constants C0 and C1 such that C0 + C1 is bit-width:
+ // (or (shl x, C0), (lshr y, C1)) -> (fshl x, y, C0) or (fshr x, y, C1)
+ // TODO: Match constant splat.
+ int64_t CstShlAmt, CstLShrAmt;
+ if (mi_match(ShlAmt, MRI, m_ICst(CstShlAmt)) &&
+ mi_match(LShrAmt, MRI, m_ICst(CstLShrAmt)) &&
+ CstShlAmt + CstLShrAmt == BitWidth) {
+ FshOpc = TargetOpcode::G_FSHR;
+ Amt = LShrAmt;
+
+ } else if (mi_match(LShrAmt, MRI,
+ m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) &&
+ ShlAmt == Amt) {
+ // (or (shl x, amt), (lshr y, (sub bw, amt))) -> (fshl x, y, amt)
FshOpc = TargetOpcode::G_FSHL;
- // Match (or (shl x, sub(bw, amt)), (lshr y, amt)).
- } else if (mi_match(Dst, MRI,
- m_GOr(m_GLShr(m_Reg(LShrSrc), m_Reg(LShrAmt)),
- m_GShl(m_Reg(ShlSrc),
- m_GSub(m_SpecificICstOrSplat(BitWidth),
- m_Reg(ShlAmt)))))) {
+ } else if (mi_match(ShlAmt, MRI,
+ m_GSub(m_SpecificICstOrSplat(BitWidth), m_Reg(Amt))) &&
+ LShrAmt == Amt) {
+ // (or (shl x, (sub bw, amt)), (lshr y, amt)) -> (fshr x, y, amt)
FshOpc = TargetOpcode::G_FSHR;
} else {
return false;
}
- if (ShlAmt != LShrAmt)
- return false;
-
- LLT AmtTy = MRI.getType(ShlAmt);
+ LLT AmtTy = MRI.getType(Amt);
if (!isLegalOrBeforeLegalizer({FshOpc, {Ty, AmtTy}}))
return false;
MatchInfo = [=](MachineIRBuilder &B) {
- B.buildInstr(FshOpc, {Dst}, {ShlSrc, LShrSrc, ShlAmt});
+ B.buildInstr(FshOpc, {Dst}, {ShlSrc, LShrSrc, Amt});
};
return true;
}
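
A standalone illustration (not from the patch) of the identity this combine relies on: when two constant shift amounts C0 and C1 sum to the bit width, (x << C0) | (y >> C1) is a funnel shift. Plain uint32_t arithmetic, no LLVM types:

#include <cassert>
#include <cstdint>

// Reference fshl for 32-bit values; Amt is taken modulo the bit width.
static uint32_t fshl32(uint32_t Hi, uint32_t Lo, unsigned Amt) {
  Amt &= 31;
  return Amt ? (Hi << Amt) | (Lo >> (32 - Amt)) : Hi;
}

int main() {
  uint32_t X = 0x12345678, Y = 0x9ABCDEF0;
  // C0 = 8 and C1 = 24 sum to 32, so the or-of-shifts is fshl(X, Y, 8),
  // equivalently fshr(X, Y, 24).
  assert(((X << 8) | (Y >> 24)) == fshl32(X, Y, 8));
  return 0;
}
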
@@ -4127,8 +4153,9 @@ bool CombinerHelper::matchBitfieldExtractFromAnd(
assert(MI.getOpcode() == TargetOpcode::G_AND);
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
- if (!getTargetLowering().isConstantUnsignedBitfieldExtactLegal(
- TargetOpcode::G_UBFX, Ty, Ty))
+ LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ if (!getTargetLowering().isConstantUnsignedBitfieldExtractLegal(
+ TargetOpcode::G_UBFX, Ty, ExtractTy))
return false;
int64_t AndImm, LSBImm;
@@ -4148,7 +4175,6 @@ bool CombinerHelper::matchBitfieldExtractFromAnd(
if (static_cast<uint64_t>(LSBImm) >= Size)
return false;
- LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
uint64_t Width = APInt(Size, AndImm).countTrailingOnes();
MatchInfo = [=](MachineIRBuilder &B) {
auto WidthCst = B.buildConstant(ExtractTy, Width);
@@ -4214,8 +4240,9 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd(
const Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
- if (!getTargetLowering().isConstantUnsignedBitfieldExtactLegal(
- TargetOpcode::G_UBFX, Ty, Ty))
+ LLT ExtractTy = getTargetLowering().getPreferredShiftAmountTy(Ty);
+ if (!getTargetLowering().isConstantUnsignedBitfieldExtractLegal(
+ TargetOpcode::G_UBFX, Ty, ExtractTy))
return false;
// Try to match shr (and x, c1), c2
@@ -4249,8 +4276,8 @@ bool CombinerHelper::matchBitfieldExtractFromShrAnd(
return false;
MatchInfo = [=](MachineIRBuilder &B) {
- auto WidthCst = B.buildConstant(Ty, Width);
- auto PosCst = B.buildConstant(Ty, Pos);
+ auto WidthCst = B.buildConstant(ExtractTy, Width);
+ auto PosCst = B.buildConstant(ExtractTy, Pos);
B.buildInstr(TargetOpcode::G_UBFX, {Dst}, {AndSrc, PosCst, WidthCst});
};
return true;
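
For reference (not part of the patch), the scalar operation G_UBFX encodes is the familiar shift-and-mask bitfield extract; a minimal standalone C++ version:

#include <cassert>
#include <cstdint>

// Extract Width bits of X starting at bit Lsb (Width < 32 assumed here).
static uint32_t ubfx(uint32_t X, unsigned Lsb, unsigned Width) {
  return (X >> Lsb) & ((1u << Width) - 1u);
}

int main() {
  uint32_t X = 0xDEADBEEF;
  assert(ubfx(X, 8, 8) == 0xBEu);    // second byte
  assert(ubfx(X, 20, 12) == 0xDEAu); // top twelve bits
  return 0;
}
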
@@ -4850,37 +4877,39 @@ bool CombinerHelper::matchCombineFAddFMulToFMadOrFMA(
if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
return false;
- MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
- MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
unsigned PreferredFusedOpcode =
HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
- if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) &&
- isContractableFMul(*RHS, AllowFusionGlobally)) {
- if (hasMoreUses(*LHS, *RHS, MRI))
+ if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+ if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
std::swap(LHS, RHS);
}
// fold (fadd (fmul x, y), z) -> (fma x, y, z)
- if (isContractableFMul(*LHS, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg()))) {
+ if (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
- {LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(),
- RHS->getOperand(0).getReg()});
+ {LHS.MI->getOperand(1).getReg(),
+ LHS.MI->getOperand(2).getReg(), RHS.Reg});
};
return true;
}
// fold (fadd x, (fmul y, z)) -> (fma y, z, x)
- if (isContractableFMul(*RHS, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg()))) {
+ if (isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
- {RHS->getOperand(1).getReg(), RHS->getOperand(2).getReg(),
- LHS->getOperand(0).getReg()});
+ {RHS.MI->getOperand(1).getReg(),
+ RHS.MI->getOperand(2).getReg(), LHS.Reg});
};
return true;
}
@@ -4897,8 +4926,10 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA(
return false;
const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering();
- MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
- MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
unsigned PreferredFusedOpcode =
@@ -4906,42 +4937,38 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMA(
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
- if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) &&
- isContractableFMul(*RHS, AllowFusionGlobally)) {
- if (hasMoreUses(*LHS, *RHS, MRI))
+ if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+ if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
std::swap(LHS, RHS);
}
// fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
MachineInstr *FpExtSrc;
- if (mi_match(LHS->getOperand(0).getReg(), MRI,
- m_GFPExt(m_MInstr(FpExtSrc))) &&
+ if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) &&
isContractableFMul(*FpExtSrc, AllowFusionGlobally) &&
TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
MRI.getType(FpExtSrc->getOperand(1).getReg()))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg());
auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg());
- B.buildInstr(
- PreferredFusedOpcode, {MI.getOperand(0).getReg()},
- {FpExtX.getReg(0), FpExtY.getReg(0), RHS->getOperand(0).getReg()});
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {FpExtX.getReg(0), FpExtY.getReg(0), RHS.Reg});
};
return true;
}
// fold (fadd z, (fpext (fmul x, y))) -> (fma (fpext x), (fpext y), z)
// Note: Commutes FADD operands.
- if (mi_match(RHS->getOperand(0).getReg(), MRI,
- m_GFPExt(m_MInstr(FpExtSrc))) &&
+ if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FpExtSrc))) &&
isContractableFMul(*FpExtSrc, AllowFusionGlobally) &&
TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
MRI.getType(FpExtSrc->getOperand(1).getReg()))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
auto FpExtX = B.buildFPExt(DstType, FpExtSrc->getOperand(1).getReg());
auto FpExtY = B.buildFPExt(DstType, FpExtSrc->getOperand(2).getReg());
- B.buildInstr(
- PreferredFusedOpcode, {MI.getOperand(0).getReg()},
- {FpExtX.getReg(0), FpExtY.getReg(0), LHS->getOperand(0).getReg()});
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {FpExtX.getReg(0), FpExtY.getReg(0), LHS.Reg});
};
return true;
}
@@ -4957,8 +4984,10 @@ bool CombinerHelper::matchCombineFAddFMAFMulToFMadOrFMA(
if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive, true))
return false;
- MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
- MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
unsigned PreferredFusedOpcode =
@@ -4966,31 +4995,31 @@ bool CombinerHelper::matchCombineFAddFMAFMulToFMadOrFMA(
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
- if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) &&
- isContractableFMul(*RHS, AllowFusionGlobally)) {
- if (hasMoreUses(*LHS, *RHS, MRI))
+ if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+ if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
std::swap(LHS, RHS);
}
MachineInstr *FMA = nullptr;
Register Z;
// fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))
- if (LHS->getOpcode() == PreferredFusedOpcode &&
- (MRI.getVRegDef(LHS->getOperand(3).getReg())->getOpcode() ==
+ if (LHS.MI->getOpcode() == PreferredFusedOpcode &&
+ (MRI.getVRegDef(LHS.MI->getOperand(3).getReg())->getOpcode() ==
TargetOpcode::G_FMUL) &&
- MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg()) &&
- MRI.hasOneNonDBGUse(LHS->getOperand(3).getReg())) {
- FMA = LHS;
- Z = RHS->getOperand(0).getReg();
+ MRI.hasOneNonDBGUse(LHS.MI->getOperand(0).getReg()) &&
+ MRI.hasOneNonDBGUse(LHS.MI->getOperand(3).getReg())) {
+ FMA = LHS.MI;
+ Z = RHS.Reg;
}
// fold (fadd z, (fma x, y, (fmul u, v))) -> (fma x, y, (fma u, v, z))
- else if (RHS->getOpcode() == PreferredFusedOpcode &&
- (MRI.getVRegDef(RHS->getOperand(3).getReg())->getOpcode() ==
+ else if (RHS.MI->getOpcode() == PreferredFusedOpcode &&
+ (MRI.getVRegDef(RHS.MI->getOperand(3).getReg())->getOpcode() ==
TargetOpcode::G_FMUL) &&
- MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg()) &&
- MRI.hasOneNonDBGUse(RHS->getOperand(3).getReg())) {
- Z = LHS->getOperand(0).getReg();
- FMA = RHS;
+ MRI.hasOneNonDBGUse(RHS.MI->getOperand(0).getReg()) &&
+ MRI.hasOneNonDBGUse(RHS.MI->getOperand(3).getReg())) {
+ Z = LHS.Reg;
+ FMA = RHS.MI;
}
if (FMA) {
@@ -5025,17 +5054,19 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
const auto &TLI = *MI.getMF()->getSubtarget().getTargetLowering();
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
- MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
- MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
unsigned PreferredFusedOpcode =
HasFMAD ? TargetOpcode::G_FMAD : TargetOpcode::G_FMA;
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
- if (Aggressive && isContractableFMul(*LHS, AllowFusionGlobally) &&
- isContractableFMul(*RHS, AllowFusionGlobally)) {
- if (hasMoreUses(*LHS, *RHS, MRI))
+ if (Aggressive && isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally)) {
+ if (hasMoreUses(*LHS.MI, *RHS.MI, MRI))
std::swap(LHS, RHS);
}
@@ -5054,16 +5085,17 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
MachineInstr *FMulMI, *FMAMI;
// fold (fadd (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y, (fma (fpext u), (fpext v), z))
- if (LHS->getOpcode() == PreferredFusedOpcode &&
- mi_match(LHS->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) &&
+ if (LHS.MI->getOpcode() == PreferredFusedOpcode &&
+ mi_match(LHS.MI->getOperand(3).getReg(), MRI,
+ m_GFPExt(m_MInstr(FMulMI))) &&
isContractableFMul(*FMulMI, AllowFusionGlobally) &&
TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
MRI.getType(FMulMI->getOperand(0).getReg()))) {
MatchInfo = [=](MachineIRBuilder &B) {
buildMatchInfo(FMulMI->getOperand(1).getReg(),
- FMulMI->getOperand(2).getReg(),
- RHS->getOperand(0).getReg(), LHS->getOperand(1).getReg(),
- LHS->getOperand(2).getReg(), B);
+ FMulMI->getOperand(2).getReg(), RHS.Reg,
+ LHS.MI->getOperand(1).getReg(),
+ LHS.MI->getOperand(2).getReg(), B);
};
return true;
}
@@ -5073,7 +5105,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
- if (mi_match(LHS->getOperand(0).getReg(), MRI, m_GFPExt(m_MInstr(FMAMI))) &&
+ if (mi_match(LHS.Reg, MRI, m_GFPExt(m_MInstr(FMAMI))) &&
FMAMI->getOpcode() == PreferredFusedOpcode) {
MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg());
if (isContractableFMul(*FMulMI, AllowFusionGlobally) &&
@@ -5085,8 +5117,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
X = B.buildFPExt(DstType, X).getReg(0);
Y = B.buildFPExt(DstType, Y).getReg(0);
buildMatchInfo(FMulMI->getOperand(1).getReg(),
- FMulMI->getOperand(2).getReg(),
- RHS->getOperand(0).getReg(), X, Y, B);
+ FMulMI->getOperand(2).getReg(), RHS.Reg, X, Y, B);
};
return true;
@@ -5095,16 +5126,17 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
// fold (fadd z, (fma x, y, (fpext (fmul u, v)))
// -> (fma x, y, (fma (fpext u), (fpext v), z))
- if (RHS->getOpcode() == PreferredFusedOpcode &&
- mi_match(RHS->getOperand(3).getReg(), MRI, m_GFPExt(m_MInstr(FMulMI))) &&
+ if (RHS.MI->getOpcode() == PreferredFusedOpcode &&
+ mi_match(RHS.MI->getOperand(3).getReg(), MRI,
+ m_GFPExt(m_MInstr(FMulMI))) &&
isContractableFMul(*FMulMI, AllowFusionGlobally) &&
TLI.isFPExtFoldable(MI, PreferredFusedOpcode, DstType,
MRI.getType(FMulMI->getOperand(0).getReg()))) {
MatchInfo = [=](MachineIRBuilder &B) {
buildMatchInfo(FMulMI->getOperand(1).getReg(),
- FMulMI->getOperand(2).getReg(),
- LHS->getOperand(0).getReg(), RHS->getOperand(1).getReg(),
- RHS->getOperand(2).getReg(), B);
+ FMulMI->getOperand(2).getReg(), LHS.Reg,
+ RHS.MI->getOperand(1).getReg(),
+ RHS.MI->getOperand(2).getReg(), B);
};
return true;
}
@@ -5114,7 +5146,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
// FIXME: This turns two single-precision and one double-precision
// operation into two double-precision operations, which might not be
// interesting for all targets, especially GPUs.
- if (mi_match(RHS->getOperand(0).getReg(), MRI, m_GFPExt(m_MInstr(FMAMI))) &&
+ if (mi_match(RHS.Reg, MRI, m_GFPExt(m_MInstr(FMAMI))) &&
FMAMI->getOpcode() == PreferredFusedOpcode) {
MachineInstr *FMulMI = MRI.getVRegDef(FMAMI->getOperand(3).getReg());
if (isContractableFMul(*FMulMI, AllowFusionGlobally) &&
@@ -5126,8 +5158,7 @@ bool CombinerHelper::matchCombineFAddFpExtFMulToFMadOrFMAAggressive(
X = B.buildFPExt(DstType, X).getReg(0);
Y = B.buildFPExt(DstType, Y).getReg(0);
buildMatchInfo(FMulMI->getOperand(1).getReg(),
- FMulMI->getOperand(2).getReg(),
- LHS->getOperand(0).getReg(), X, Y, B);
+ FMulMI->getOperand(2).getReg(), LHS.Reg, X, Y, B);
};
return true;
}
@@ -5144,16 +5175,18 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA(
if (!canCombineFMadOrFMA(MI, AllowFusionGlobally, HasFMAD, Aggressive))
return false;
- MachineInstr *LHS = MRI.getVRegDef(MI.getOperand(1).getReg());
- MachineInstr *RHS = MRI.getVRegDef(MI.getOperand(2).getReg());
+ Register Op1 = MI.getOperand(1).getReg();
+ Register Op2 = MI.getOperand(2).getReg();
+ DefinitionAndSourceRegister LHS = {MRI.getVRegDef(Op1), Op1};
+ DefinitionAndSourceRegister RHS = {MRI.getVRegDef(Op2), Op2};
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
// If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
// prefer to fold the multiply with fewer uses.
int FirstMulHasFewerUses = true;
- if (isContractableFMul(*LHS, AllowFusionGlobally) &&
- isContractableFMul(*RHS, AllowFusionGlobally) &&
- hasMoreUses(*LHS, *RHS, MRI))
+ if (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+ hasMoreUses(*LHS.MI, *RHS.MI, MRI))
FirstMulHasFewerUses = false;
unsigned PreferredFusedOpcode =
@@ -5161,24 +5194,24 @@ bool CombinerHelper::matchCombineFSubFMulToFMadOrFMA(
// fold (fsub (fmul x, y), z) -> (fma x, y, -z)
if (FirstMulHasFewerUses &&
- (isContractableFMul(*LHS, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(LHS->getOperand(0).getReg())))) {
+ (isContractableFMul(*LHS.MI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(LHS.Reg)))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
- Register NegZ = B.buildFNeg(DstTy, RHS->getOperand(0).getReg()).getReg(0);
- B.buildInstr(
- PreferredFusedOpcode, {MI.getOperand(0).getReg()},
- {LHS->getOperand(1).getReg(), LHS->getOperand(2).getReg(), NegZ});
+ Register NegZ = B.buildFNeg(DstTy, RHS.Reg).getReg(0);
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {LHS.MI->getOperand(1).getReg(),
+ LHS.MI->getOperand(2).getReg(), NegZ});
};
return true;
}
// fold (fsub x, (fmul y, z)) -> (fma -y, z, x)
- else if ((isContractableFMul(*RHS, AllowFusionGlobally) &&
- (Aggressive || MRI.hasOneNonDBGUse(RHS->getOperand(0).getReg())))) {
+ else if ((isContractableFMul(*RHS.MI, AllowFusionGlobally) &&
+ (Aggressive || MRI.hasOneNonDBGUse(RHS.Reg)))) {
MatchInfo = [=, &MI](MachineIRBuilder &B) {
- Register NegY = B.buildFNeg(DstTy, RHS->getOperand(1).getReg()).getReg(0);
- B.buildInstr(
- PreferredFusedOpcode, {MI.getOperand(0).getReg()},
- {NegY, RHS->getOperand(2).getReg(), LHS->getOperand(0).getReg()});
+ Register NegY =
+ B.buildFNeg(DstTy, RHS.MI->getOperand(1).getReg()).getReg(0);
+ B.buildInstr(PreferredFusedOpcode, {MI.getOperand(0).getReg()},
+ {NegY, RHS.MI->getOperand(2).getReg(), LHS.Reg});
};
return true;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 306af808659a..64c2f0d5f8e4 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -37,6 +37,11 @@ Align GISelKnownBits::computeKnownAlignment(Register R, unsigned Depth) {
switch (MI->getOpcode()) {
case TargetOpcode::COPY:
return computeKnownAlignment(MI->getOperand(1).getReg(), Depth);
+ case TargetOpcode::G_ASSERT_ALIGN: {
+ // TODO: Min with source
+ int64_t LogAlign = MI->getOperand(2).getImm();
+ return Align(1ull << LogAlign);
+ }
case TargetOpcode::G_FRAME_INDEX: {
int FrameIdx = MI->getOperand(1).getIndex();
return MF.getFrameInfo().getObjectAlign(FrameIdx);
@@ -466,6 +471,18 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
Known.Zero.setBitsFrom(SrcBitWidth);
break;
}
+ case TargetOpcode::G_ASSERT_ALIGN: {
+ int64_t LogOfAlign = MI.getOperand(2).getImm();
+ if (LogOfAlign == 0)
+ break;
+
+ // TODO: Should use maximum with source
+ // If a node is guaranteed to be aligned, set low zero bits accordingly as
+ // well as clearing the corresponding one bits.
+ Known.Zero.setLowBits(LogOfAlign);
+ Known.One.clearLowBits(LogOfAlign);
+ break;
+ }
case TargetOpcode::G_MERGE_VALUES: {
unsigned NumOps = MI.getNumOperands();
unsigned OpSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
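
A standalone illustration (not from the patch) of what the alignment assertion contributes to known bits: an address aligned to 1 << LogOfAlign has its low LogOfAlign bits known zero, which is what setLowBits on Known.Zero records. Plain integers stand in for the KnownBits type:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned LogOfAlign = 4;                       // align 16
  const uint64_t KnownZero = (1ull << LogOfAlign) - 1; // low bits known zero
  const uint64_t Addr = 0x1000 + 32;                   // any 16-byte aligned address
  assert((Addr & KnownZero) == 0);
  return 0;
}
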
diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
index 4ae427484945..e5f95ca5aa73 100644
--- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -297,10 +297,8 @@ bool InlineAsmLowering::lowerInlineAsm(
GISelAsmOperandInfo &OpInfo = ConstraintOperands.back();
// Compute the value type for each operand.
- if (OpInfo.Type == InlineAsm::isInput ||
- (OpInfo.Type == InlineAsm::isOutput && OpInfo.isIndirect)) {
-
- OpInfo.CallOperandVal = const_cast<Value *>(Call.getArgOperand(ArgNo++));
+ if (OpInfo.hasArg()) {
+ OpInfo.CallOperandVal = const_cast<Value *>(Call.getArgOperand(ArgNo));
if (isa<BasicBlock>(OpInfo.CallOperandVal)) {
LLVM_DEBUG(dbgs() << "Basic block input operands not supported yet\n");
@@ -312,10 +310,8 @@ bool InlineAsmLowering::lowerInlineAsm(
// If this is an indirect operand, the operand is a pointer to the
// accessed type.
if (OpInfo.isIndirect) {
- PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
- if (!PtrTy)
- report_fatal_error("Indirect operand for inline asm not a pointer!");
- OpTy = PtrTy->getElementType();
+ OpTy = Call.getAttributes().getParamElementType(ArgNo);
+ assert(OpTy && "Indirect operand must have elementtype attribute");
}
// FIXME: Support aggregate input operands
@@ -327,7 +323,7 @@ bool InlineAsmLowering::lowerInlineAsm(
OpInfo.ConstraintVT =
TLI->getAsmOperandValueType(DL, OpTy, true).getSimpleVT();
-
+ ++ArgNo;
} else if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) {
assert(!Call.getType()->isVoidTy() && "Bad inline asm!");
if (StructType *STy = dyn_cast<StructType>(Call.getType())) {
@@ -627,7 +623,8 @@ bool InlineAsmLowering::lowerInlineAsm(
Register SrcReg = OpInfo.Regs[0];
unsigned SrcSize = TRI->getRegSizeInBits(SrcReg, *MRI);
- if (MRI->getType(ResRegs[i]).getSizeInBits() < SrcSize) {
+ LLT ResTy = MRI->getType(ResRegs[i]);
+ if (ResTy.isScalar() && ResTy.getSizeInBits() < SrcSize) {
// First copy the non-typed virtual register into a generic virtual
// register
Register Tmp1Reg =
@@ -635,9 +632,14 @@ bool InlineAsmLowering::lowerInlineAsm(
MIRBuilder.buildCopy(Tmp1Reg, SrcReg);
// Need to truncate the result of the register
MIRBuilder.buildTrunc(ResRegs[i], Tmp1Reg);
- } else {
+ } else if (ResTy.getSizeInBits() == SrcSize) {
MIRBuilder.buildCopy(ResRegs[i], SrcReg);
+ } else {
+ LLVM_DEBUG(dbgs() << "Unhandled output operand with "
+ "mismatched register size\n");
+ return false;
}
+
break;
}
case TargetLowering::C_Immediate:
diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
index b10c9272a508..2bb5addefe48 100644
--- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -71,9 +71,10 @@ InstructionSelect::InstructionSelect()
void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetPassConfig>();
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+
if (OptLevel != CodeGenOpt::None) {
- AU.addRequired<GISelKnownBitsAnalysis>();
- AU.addPreserved<GISelKnownBitsAnalysis>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
}
@@ -97,9 +98,8 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
OptLevel = MF.getFunction().hasOptNone() ? CodeGenOpt::None
: MF.getTarget().getOptLevel();
- GISelKnownBits *KB = nullptr;
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
if (OptLevel != CodeGenOpt::None) {
- KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
if (PSI && PSI->hasProfileSummary())
BFI = &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI();
diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
index dc5a4d8f85aa..1d0c106fd5db 100644
--- a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -29,7 +29,7 @@
using namespace llvm;
InstructionSelector::MatcherState::MatcherState(unsigned MaxRenderers)
- : Renderers(MaxRenderers), MIs() {}
+ : Renderers(MaxRenderers) {}
InstructionSelector::InstructionSelector() = default;
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index e8a8efd5dad4..37bc8a65dc7c 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -564,7 +564,7 @@ static bool isLibCallInTailPosition(MachineInstr &MI,
// the return. Ignore NoAlias and NonNull because they don't affect the
// call sequence.
AttributeList CallerAttrs = F.getAttributes();
- if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
+ if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
.removeAttribute(Attribute::NoAlias)
.removeAttribute(Attribute::NonNull)
.hasAttributes())
@@ -1677,7 +1677,7 @@ LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
// Widen SrcTy to WideTy. This does not affect the result, but since the
// user requested this size, it is probably better handled than SrcTy and
- // should reduce the total number of legalization artifacts
+ // should reduce the total number of legalization artifacts.
if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
SrcTy = WideTy;
SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
@@ -3655,7 +3655,6 @@ static bool hasSameNumEltsOnAllVectorOperands(
if (!Ty.isVector()) {
if (!is_contained(NonVecOpIndices, OpIdx))
return false;
- is_contained(NonVecOpIndices, OpIdx);
continue;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
index a1acc4195840..328a278f3d68 100644
--- a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -124,14 +124,13 @@ bool Localizer::localizeInterBlock(MachineFunction &MF,
LocalizedInstrs.insert(LocalizedMI);
MachineInstr &UseMI = *MOUse.getParent();
if (MRI->hasOneUse(Reg) && !UseMI.isPHI())
- InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(UseMI), LocalizedMI);
+ InsertMBB->insert(UseMI, LocalizedMI);
else
InsertMBB->insert(InsertMBB->SkipPHIsAndLabels(InsertMBB->begin()),
LocalizedMI);
// Set a new register for the definition.
- Register NewReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
- MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg));
+ Register NewReg = MRI->cloneVirtualRegister(Reg);
LocalizedMI->getOperand(0).setReg(NewReg);
NewVRegIt =
MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first;
@@ -174,9 +173,10 @@ bool Localizer::localizeIntraBlock(LocalizedSetVecT &LocalizedInstrs) {
while (II != MBB.end() && !Users.count(&*II))
++II;
- LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *&*II
- << "\n");
assert(II != MBB.end() && "Didn't find the user in the MBB");
+ LLVM_DEBUG(dbgs() << "Intra-block: moving " << *MI << " before " << *II
+ << '\n');
+
MI->removeFromParent();
MBB.insert(II, MI);
Changed = true;
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 391251886fbb..c6720568b362 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -282,18 +282,6 @@ MachineInstrBuilder MachineIRBuilder::buildCopy(const DstOp &Res,
return buildInstr(TargetOpcode::COPY, Res, Op);
}
-MachineInstrBuilder MachineIRBuilder::buildAssertSExt(const DstOp &Res,
- const SrcOp &Op,
- unsigned Size) {
- return buildInstr(TargetOpcode::G_ASSERT_SEXT, Res, Op).addImm(Size);
-}
-
-MachineInstrBuilder MachineIRBuilder::buildAssertZExt(const DstOp &Res,
- const SrcOp &Op,
- unsigned Size) {
- return buildInstr(TargetOpcode::G_ASSERT_ZEXT, Res, Op).addImm(Size);
-}
-
MachineInstrBuilder MachineIRBuilder::buildConstant(const DstOp &Res,
const ConstantInt &Val) {
LLT Ty = Res.getLLTTy(*getMRI());
diff --git a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 937d94764be1..01af6bb51bb7 100644
--- a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -626,7 +626,8 @@ bool RegBankSelect::assignInstr(MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
if (isPreISelGenericOptimizationHint(Opc)) {
assert((Opc == TargetOpcode::G_ASSERT_ZEXT ||
- Opc == TargetOpcode::G_ASSERT_SEXT) &&
+ Opc == TargetOpcode::G_ASSERT_SEXT ||
+ Opc == TargetOpcode::G_ASSERT_ALIGN) &&
"Unexpected hint opcode!");
// The only correct mapping for these is to always use the source register
// bank.
@@ -856,7 +857,7 @@ void RegBankSelect::RepairingPlacement::addInsertPoint(
RegBankSelect::InstrInsertPoint::InstrInsertPoint(MachineInstr &Instr,
bool Before)
- : InsertPoint(), Instr(Instr), Before(Before) {
+ : Instr(Instr), Before(Before) {
// Since we do not support splitting, we do not need to update
// liveness and such, so do not do anything with P.
assert((!Before || !Instr.isPHI()) &&
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 4981a537dc7c..544af9a2954f 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -592,17 +592,17 @@ Optional<APFloat> llvm::ConstantFoldFPBinOp(unsigned Opcode, const Register Op1,
return None;
}
-Optional<MachineInstr *>
-llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1,
- const Register Op2,
- const MachineRegisterInfo &MRI,
- MachineIRBuilder &MIB) {
- auto *SrcVec1 = getOpcodeDef<GBuildVector>(Op1, MRI);
- if (!SrcVec1)
- return None;
+Register llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1,
+ const Register Op2,
+ const MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIB) {
auto *SrcVec2 = getOpcodeDef<GBuildVector>(Op2, MRI);
if (!SrcVec2)
- return None;
+ return Register();
+
+ auto *SrcVec1 = getOpcodeDef<GBuildVector>(Op1, MRI);
+ if (!SrcVec1)
+ return Register();
const LLT EltTy = MRI.getType(SrcVec1->getSourceReg(0));
@@ -611,14 +611,14 @@ llvm::ConstantFoldVectorBinop(unsigned Opcode, const Register Op1,
auto MaybeCst = ConstantFoldBinOp(Opcode, SrcVec1->getSourceReg(Idx),
SrcVec2->getSourceReg(Idx), MRI);
if (!MaybeCst)
- return None;
+ return Register();
auto FoldedCstReg = MIB.buildConstant(EltTy, *MaybeCst).getReg(0);
FoldedElements.emplace_back(FoldedCstReg);
}
// Create the new vector constant.
auto CstVec =
MIB.buildBuildVector(MRI.getType(SrcVec1->getReg(0)), FoldedElements);
- return &*CstVec;
+ return CstVec.getReg(0);
}
bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
@@ -704,8 +704,7 @@ Register llvm::getFunctionLiveInPhysReg(MachineFunction &MF,
const TargetInstrInfo &TII,
MCRegister PhysReg,
const TargetRegisterClass &RC,
- LLT RegTy) {
- DebugLoc DL; // FIXME: Is no location the right choice?
+ const DebugLoc &DL, LLT RegTy) {
MachineBasicBlock &EntryMBB = MF.front();
MachineRegisterInfo &MRI = MF.getRegInfo();
Register LiveIn = MRI.getLiveInVirtReg(PhysReg);
diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
index 9fabcfb1f326..2ee9379cb286 100644
--- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -185,7 +185,7 @@ class Polynomial {
APInt A;
public:
- Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V), B(), A() {
+ Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V) {
IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
if (Ty) {
ErrorMSBs = 0;
@@ -195,12 +195,12 @@ public:
}
Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
- : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(A) {}
+ : ErrorMSBs(ErrorMSBs), V(nullptr), A(A) {}
Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
- : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(BitWidth, A) {}
+ : ErrorMSBs(ErrorMSBs), V(nullptr), A(BitWidth, A) {}
- Polynomial() : ErrorMSBs((unsigned)-1), V(NULL), B(), A() {}
+ Polynomial() : ErrorMSBs((unsigned)-1), V(nullptr) {}
/// Increment and clamp the number of undefined bits.
void incErrorMSBs(unsigned amt) {
@@ -677,7 +677,7 @@ public:
FixedVectorType *const VTy;
VectorInfo(FixedVectorType *VTy)
- : BB(nullptr), PV(nullptr), LIs(), Is(), SVI(nullptr), VTy(VTy) {
+ : BB(nullptr), PV(nullptr), SVI(nullptr), VTy(VTy) {
EI = new ElementInfo[VTy->getNumElements()];
}
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
index e97dcca201e8..8a190e769941 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -251,9 +251,10 @@ public:
/// creates DBG_VALUEs and puts them in #Transfers, then prepares the other
/// object fields to track variable locations as we step through the block.
/// FIXME: could just examine mloctracker instead of passing in \p mlocs?
- void loadInlocs(MachineBasicBlock &MBB, ValueIDNum *MLocs,
- SmallVectorImpl<std::pair<DebugVariable, DbgValue>> &VLocs,
- unsigned NumLocs) {
+ void
+ loadInlocs(MachineBasicBlock &MBB, ValueIDNum *MLocs,
+ const SmallVectorImpl<std::pair<DebugVariable, DbgValue>> &VLocs,
+ unsigned NumLocs) {
ActiveMLocs.clear();
ActiveVLocs.clear();
VarLocs.clear();
@@ -272,7 +273,7 @@ public:
};
// Map of the preferred location for each value.
- std::map<ValueIDNum, LocIdx> ValueToLoc;
+ DenseMap<ValueIDNum, LocIdx> ValueToLoc;
ActiveMLocs.reserve(VLocs.size());
ActiveVLocs.reserve(VLocs.size());
@@ -283,6 +284,11 @@ public:
LocIdx Idx = Location.Idx;
ValueIDNum &VNum = MLocs[Idx.asU64()];
VarLocs.push_back(VNum);
+
+ // Short-circuit unnecessary preferred location update.
+ if (VLocs.empty())
+ continue;
+
auto it = ValueToLoc.find(VNum);
// In order of preference, pick:
// * Callee saved registers,
@@ -298,7 +304,7 @@ public:
}
// Now map variables to their picked LocIdxes.
- for (auto Var : VLocs) {
+ for (const auto &Var : VLocs) {
if (Var.second.Kind == DbgValue::Const) {
PendingDbgValues.push_back(
emitMOLoc(*Var.second.MO, Var.first, Var.second.Properties));
@@ -413,7 +419,8 @@ public:
return Reg != SP && Reg != FP;
}
- bool recoverAsEntryValue(const DebugVariable &Var, DbgValueProperties &Prop,
+ bool recoverAsEntryValue(const DebugVariable &Var,
+ const DbgValueProperties &Prop,
const ValueIDNum &Num) {
// Is this variable location a candidate to be an entry value. First,
// should we be trying this at all?
@@ -2799,31 +2806,28 @@ void InstrRefBasedLDV::emitLocations(
}
}
- // We have to insert DBG_VALUEs in a consistent order, otherwise they appeaer
- // in DWARF in different orders. Use the order that they appear when walking
- // through each block / each instruction, stored in AllVarsNumbering.
- auto OrderDbgValues = [&](const MachineInstr *A,
- const MachineInstr *B) -> bool {
- DebugVariable VarA(A->getDebugVariable(), A->getDebugExpression(),
- A->getDebugLoc()->getInlinedAt());
- DebugVariable VarB(B->getDebugVariable(), B->getDebugExpression(),
- B->getDebugLoc()->getInlinedAt());
- return AllVarsNumbering.find(VarA)->second <
- AllVarsNumbering.find(VarB)->second;
- };
-
// Go through all the transfers recorded in the TransferTracker -- this is
// both the live-ins to a block, and any movements of values that happen
// in the middle.
- for (auto &P : TTracker->Transfers) {
- // Sort them according to appearance order.
- llvm::sort(P.Insts, OrderDbgValues);
+ for (const auto &P : TTracker->Transfers) {
+ // We have to insert DBG_VALUEs in a consistent order, otherwise they
+ // appear in DWARF in different orders. Use the order that they appear
+ // when walking through each block / each instruction, stored in
+ // AllVarsNumbering.
+ SmallVector<std::pair<unsigned, MachineInstr *>> Insts;
+ for (MachineInstr *MI : P.Insts) {
+ DebugVariable Var(MI->getDebugVariable(), MI->getDebugExpression(),
+ MI->getDebugLoc()->getInlinedAt());
+ Insts.emplace_back(AllVarsNumbering.find(Var)->second, MI);
+ }
+ llvm::sort(Insts,
+ [](const auto &A, const auto &B) { return A.first < B.first; });
+
// Insert either before or after the designated point...
if (P.MBB) {
MachineBasicBlock &MBB = *P.MBB;
- for (auto *MI : P.Insts) {
- MBB.insert(P.Pos, MI);
- }
+ for (const auto &Pair : Insts)
+ MBB.insert(P.Pos, Pair.second);
} else {
// Terminators, like tail calls, can clobber things. Don't try and place
// transfers after them.
@@ -2831,9 +2835,8 @@ void InstrRefBasedLDV::emitLocations(
continue;
MachineBasicBlock &MBB = *P.Pos->getParent();
- for (auto *MI : P.Insts) {
- MBB.insertAfterBundle(P.Pos, MI);
- }
+ for (const auto &Pair : Insts)
+ MBB.insertAfterBundle(P.Pos, Pair.second);
}
}
}
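
The change above replaces a comparator that re-derives each DBG_VALUE's key inside the sort with a decorate-then-sort pass. A tiny standalone sketch of that idiom; the names are illustrative, not from the patch:

#include <algorithm>
#include <cassert>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::vector<std::string> Items = {"carol", "alice", "bob"};
  // Stand-in for the AllVarsNumbering lookup: compute each key exactly once.
  std::vector<std::pair<char, std::string>> Decorated;
  for (const std::string &S : Items)
    Decorated.emplace_back(S[0], S);
  // Sort by the precomputed key instead of looking it up in the comparator.
  std::sort(Decorated.begin(), Decorated.end(),
            [](const auto &A, const auto &B) { return A.first < B.first; });
  assert(Decorated.front().second == "alice");
  return 0;
}
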
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
index 789205e61cdb..9e9c0ce394fd 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.h
@@ -494,7 +494,7 @@ public:
return StackIdxesToPos.find(Idx)->second;
}
- unsigned getNumLocs(void) const { return LocIdxToIDNum.size(); }
+ unsigned getNumLocs() const { return LocIdxToIDNum.size(); }
/// Reset all locations to contain a PHI value at the designated block. Used
/// sometimes for actual PHI values, other times to indicate the block entry
@@ -516,7 +516,7 @@ public:
}
/// Wipe any un-necessary location records after traversing a block.
- void reset(void) {
+ void reset() {
// We could reset all the location values too; however either loadFromArray
// or setMPhis should be called before this object is re-used. Just
// clear Masks, they're definitely not needed.
@@ -525,7 +525,7 @@ public:
/// Clear all data. Destroys the LocID <=> LocIdx map, which makes most of
/// the information in this pass uninterpretable.
- void clear(void) {
+ void clear() {
reset();
LocIDToLocIdx.clear();
LocIdxToLocID.clear();
@@ -1082,7 +1082,9 @@ template <> struct DenseMapInfo<ValueIDNum> {
return ValueIDNum::TombstoneValue;
}
- static unsigned getHashValue(const ValueIDNum &Val) { return Val.asU64(); }
+ static unsigned getHashValue(const ValueIDNum &Val) {
+ return hash_value(Val.asU64());
+ }
static bool isEqual(const ValueIDNum &A, const ValueIDNum &B) {
return A == B;
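
Aside (not part of the patch) on why truncating a packed 64-bit key makes a poor DenseMap hash: keys that differ only in the high half collide unconditionally, so hashing the full value is preferable. A minimal standalone illustration, assuming a 32-bit unsigned type:

#include <cassert>
#include <cstdint>

int main() {
  // Two packed keys that differ only above bit 31 (say, different blocks but
  // the same slot); the field layout here is purely illustrative.
  uint64_t A = (uint64_t(1) << 32) | 7;
  uint64_t B = (uint64_t(2) << 32) | 7;
  assert(A != B);                     // distinct keys...
  assert(unsigned(A) == unsigned(B)); // ...but identical truncated hashes
  return 0;
}
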
diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
index 691977dc34e6..8f697611a82c 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.cpp
@@ -40,6 +40,10 @@ static cl::opt<bool>
"normal DBG_VALUE inputs"),
cl::init(false));
+static cl::opt<cl::boolOrDefault> ValueTrackingVariableLocations(
+ "experimental-debug-variable-locations",
+ cl::desc("Use experimental new value-tracking variable locations"));
+
// Options to prevent pathological compile-time behavior. If InputBBLimit and
// InputDbgValueLimit are both exceeded, range extension is disabled.
static cl::opt<unsigned> InputBBLimit(
@@ -117,3 +121,8 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
return TheImpl->ExtendRanges(MF, DomTree, TPC, InputBBLimit,
InputDbgValueLimit);
}
+
+bool llvm::debuginfoShouldUseDebugInstrRef(const Triple &T) {
+ // Enable if explicitly requested on command line.
+ return ValueTrackingVariableLocations == cl::boolOrDefault::BOU_TRUE;
+}
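
Aside (not from the patch): the cl::boolOrDefault option above is a tri-state, and the Triple parameter suggests a later per-target default. A small standalone sketch of that pattern, with made-up names:

#include <cassert>

// Illustrative tri-state mirroring cl::boolOrDefault: an unset value lets the
// caller fall back to a per-target default.
enum class Tristate { Unset, False, True };

static bool shouldUseInstrRef(Tristate Flag, bool TargetDefault) {
  if (Flag == Tristate::True)
    return true;
  if (Flag == Tristate::False)
    return false;
  return TargetDefault;
}

int main() {
  assert(shouldUseInstrRef(Tristate::Unset, true));   // falls back to default
  assert(!shouldUseInstrRef(Tristate::False, true));  // explicit off wins
  return 0;
}
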
diff --git a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h
index a5936c8a96f0..8f0b2ec3e1fc 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h
+++ b/llvm/lib/CodeGen/LiveDebugValues/LiveDebugValues.h
@@ -12,6 +12,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/ADT/Triple.h"
namespace llvm {
@@ -35,6 +36,9 @@ public:
// Factory functions for LiveDebugValues implementations.
extern LDVImpl *makeVarLocBasedLiveDebugValues();
extern LDVImpl *makeInstrRefBasedLiveDebugValues();
+
+extern bool debuginfoShouldUseDebugInstrRef(const Triple &T);
+
} // namespace llvm
#endif // LLVM_LIB_CODEGEN_LIVEDEBUGVALUES_LIVEDEBUGVALUES_H
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp
index e6661e5135c3..6d806135240e 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -152,7 +152,7 @@ public:
}
}
- DbgVariableValue() : LocNoCount(0), WasIndirect(0), WasList(0) {}
+ DbgVariableValue() : LocNoCount(0), WasIndirect(false), WasList(false) {}
DbgVariableValue(const DbgVariableValue &Other)
: LocNoCount(Other.LocNoCount), WasIndirect(Other.getWasIndirect()),
WasList(Other.getWasList()), Expression(Other.getExpression()) {
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 2f97386b6d18..9571afa434c1 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -827,6 +827,8 @@ CancelKill:
MachineBasicBlock*
LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const {
+ assert(!LI.empty() && "LiveInterval is empty.");
+
// A local live range must be fully contained inside the block, meaning it is
// defined and killed at instructions, not at block boundaries. It is not
// live in or out of any block.
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 1a04e1ca56a9..6477965bdc21 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -875,11 +875,11 @@ bool MIParser::parseBasicBlock(MachineBasicBlock &MBB,
// N.B: Multiple lists of successors and liveins are allowed and they're
// merged into one.
// Example:
- // liveins: %edi
- // liveins: %esi
+ // liveins: $edi
+ // liveins: $esi
//
// is equivalent to
- // liveins: %edi, %esi
+ // liveins: $edi, $esi
bool ExplicitSuccessors = false;
while (true) {
if (Token.is(MIToken::kw_successors)) {
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index d0323eaf3d78..f144639770bc 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -182,8 +182,7 @@ static void handleYAMLDiag(const SMDiagnostic &Diag, void *Context) {
MIRParserImpl::MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents,
StringRef Filename, LLVMContext &Context,
std::function<void(Function &)> Callback)
- : SM(),
- Context(Context),
+ : Context(Context),
In(SM.getMemoryBuffer(SM.AddNewSourceBuffer(std::move(Contents), SMLoc()))
->getBuffer(),
nullptr, handleYAMLDiag, this),
diff --git a/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp
new file mode 100644
index 000000000000..a74c57690640
--- /dev/null
+++ b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp
@@ -0,0 +1,862 @@
+//===- MLRegAllocEvictAdvisor.cpp - ML eviction advisor -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the ML eviction advisor and reward injection pass
+//
+//===----------------------------------------------------------------------===//
+
+#include "RegAllocEvictionAdvisor.h"
+#include "RegAllocGreedy.h"
+#include "RegAllocScore.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MLModelRunner.h"
+#include "llvm/Analysis/ModelUnderTrainingRunner.h"
+#include "llvm/Analysis/NoInferenceModelRunner.h"
+#include "llvm/Analysis/ReleaseModeModelRunner.h"
+#include "llvm/Analysis/Utils/TFUtils.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Config/config.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <array>
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ml-regalloc"
+
+// Generated header in release (AOT) mode
+#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL)
+#include "RegallocEvictModel.h"
+#endif
+
+// Options that only make sense in development mode
+#ifdef LLVM_HAVE_TF_API
+static cl::opt<std::string> TrainingLog(
+ "regalloc-training-log", cl::Hidden,
+ cl::desc("Training log for the register allocator eviction model"));
+
+static cl::opt<std::string> ModelUnderTraining(
+ "regalloc-model", cl::Hidden,
+ cl::desc("The model being trained for register allocation eviction"));
+
+#endif // #ifdef LLVM_HAVE_TF_API
+
+/// The score injection pass.
+/// This pass calculates the score for a function and inserts it in the log, but
+/// this happens only in development mode. It's a no-op otherwise.
+namespace llvm {
+class RegAllocScoring : public MachineFunctionPass {
+public:
+ static char ID;
+
+ RegAllocScoring() : MachineFunctionPass(ID) {
+ initializeRegAllocScoringPass(*PassRegistry::getPassRegistry());
+ }
+
+ ~RegAllocScoring() override = default;
+
+ StringRef getPassName() const override {
+ return "Register Allocation Pass Scoring";
+ }
+
+ /// RegAllocReward analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<RegAllocEvictionAdvisorAnalysis>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// Performs this pass
+ bool runOnMachineFunction(MachineFunction &) override;
+};
+
+char RegAllocScoring::ID = 0;
+FunctionPass *createRegAllocScoringPass() { return new RegAllocScoring(); }
+
+} // namespace llvm
+
+INITIALIZE_PASS(RegAllocScoring, "regallocscoringpass",
+ "Register Allocation Scoring Pass", false, false)
+
+// ===================================
+// Common ML Advisor declarations
+// ===================================
+namespace {
+// This is the maximum number of interfering ranges. That's the number of
+// distinct AllocationOrder values, which comes from MCRegisterClass::RegsSize.
+// For X86, that's 32.
+// TODO: find a way to get this, statically, in a programmatic way.
+static const int64_t MaxInterferences = 32;
+
+// Logically, we can think of the feature set given to the evaluator as a 2D
+// matrix. The rows are the features (see next). The columns correspond to the
+// interferences. We treat the candidate virt reg as an 'interference', too, as
+// its feature set is the same as that of the interfering ranges. So we'll have
+// MaxInterferences + 1 columns and by convention, we will use the last column
+// for the virt reg seeking allocation.
+static const int64_t CandidateVirtRegPos = MaxInterferences;
+static const int64_t NumberOfInterferences = CandidateVirtRegPos + 1;
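+// For example, with MaxInterferences == 32 the columns are laid out as
+// follows: columns 0..31 hold the features of the interferences at the
+// corresponding AllocationOrder position, and column 32 (CandidateVirtRegPos)
+// holds the features of the virt reg currently seeking allocation.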
+
+// Most features are as described above, so we'll reuse this vector in defining
+// them.
+static const std::vector<int64_t> PerLiveRangeShape{1, NumberOfInterferences};
+
+// --------------
+// Features table
+// --------------
+// For each interfering live range (incl. the candidate) we collect a number of
+// features. However, because the features are of different types (and because
+// of ML best practices), we organize the tensors per feature, not per
+// candidate. Each such tensor has a scalar value corresponding to the
+// interfering live range at that position, in the order given by AllocationOrder.
+// The last position corresponds to the virt reg seeking allocation.
+// Exception to all that is the progression feature, which is just a scalar (see
+// its documentation for details).
+// Note on naming: the "_by_max" features are normalized using the largest
+// value of that tensor, as observed in the current decision-making stage
+// (i.e. for the current call to the advisor's tryFindEvictionCandidate).
+//
+// The feature list format: type, name, shape, documentation.
+// Note: we only use int64 and float tensor types, hence the modeling of some
+// bools as int64 values.
+#define RA_EVICT_FEATURES_LIST(M) \
+ M(int64_t, mask, PerLiveRangeShape, \
+ "boolean values, 0 for unavailable candidates (i.e. if a position is 0, " \
+ "it " \
+ "can't be evicted)") \
+ M(int64_t, is_free, PerLiveRangeShape, \
+ "boolean values, 1 if this phys reg is actually free (no interferences)") \
+ M(float, nr_urgent, PerLiveRangeShape, \
+ "number of 'urgent' intervals, normalized. Urgent are those that are OK " \
+ "to break cascades") \
+ M(float, nr_broken_hints, PerLiveRangeShape, \
+ "if this position were evicted, how many broken hints would there be") \
+ M(int64_t, is_hint, PerLiveRangeShape, \
+ "is this a preferred phys reg for the candidate") \
+ M(int64_t, is_local, PerLiveRangeShape, \
+ "is this live range local to a basic block") \
+ M(float, nr_rematerializable, PerLiveRangeShape, \
+ "nr rematerializable ranges") \
+ M(float, nr_defs_and_uses, PerLiveRangeShape, \
+ "bb freq - weighed nr defs and uses") \
+ M(float, weighed_reads_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of reads, normalized") \
+ M(float, weighed_writes_by_max, PerLiveRangeShape, \
+ "bb feq - weighed nr of writes, normalized") \
+ M(float, weighed_read_writes_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of uses that are both read and writes, normalized") \
+ M(float, weighed_indvars_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of uses that are indvars, normalized") \
+ M(float, hint_weights_by_max, PerLiveRangeShape, \
+ "bb freq - weighed nr of uses that are hints, normalized") \
+ M(float, start_bb_freq_by_max, PerLiveRangeShape, \
+ "the freq in the start block, normalized") \
+ M(float, end_bb_freq_by_max, PerLiveRangeShape, \
+ "freq of end block, normalized") \
+ M(float, hottest_bb_freq_by_max, PerLiveRangeShape, \
+ "hottest BB freq, normalized") \
+ M(float, liverange_size, PerLiveRangeShape, \
+ "size (instr index diff) of the LR") \
+ M(float, use_def_density, PerLiveRangeShape, \
+ "the max weight, as computed by the manual heuristic") \
+ M(int64_t, max_stage, PerLiveRangeShape, \
+ "largest stage of an interval in this LR") \
+ M(int64_t, min_stage, PerLiveRangeShape, \
+ "lowest stage of an interval in this LR") \
+ M(float, progress, {1}, "ratio of current queue size to initial size")
+
+// The model learns to pick one of the mask == 1 interferences. This is the name
+// of the output tensor.
+// The contract with the model is that the output is guaranteed to be a
+// mask == 1 position.
+// We use a macro here to avoid 'unused' warnings (and to keep conditional
+// compilation to a minimum).
+#define DecisionName "index_to_evict"
+
+// Named features index.
+enum FeatureIDs {
+#define _FEATURE_IDX(_, name, __, ___) name,
+ RA_EVICT_FEATURES_LIST(_FEATURE_IDX)
+#undef _FEATURE_IDX
+ FeatureCount
+};
+
+// The ML advisor will typically have a sparse input to the evaluator, because
+// various phys regs won't be available. It's easier (maintenance-wise) to
+// bulk-reset the state of the evaluator each time we are about to use it again.
+template <typename T> size_t getTotalSize(const std::vector<int64_t> &Shape) {
+ size_t Ret = sizeof(T);
+ for (const auto V : Shape)
+ Ret *= V;
+ return Ret;
+}
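+// For example, getTotalSize<float>(PerLiveRangeShape) is sizeof(float) * 1 *
+// NumberOfInterferences bytes (132 bytes when float is 4 bytes), which is what
+// resetInputs below clears for each per-live-range float tensor.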
+
+void resetInputs(MLModelRunner &Runner) {
+#define _RESET(TYPE, NAME, SHAPE, __) \
+ std::memset(Runner.getTensorUntyped(FeatureIDs::NAME), 0, \
+ getTotalSize<TYPE>(SHAPE));
+ RA_EVICT_FEATURES_LIST(_RESET)
+#undef _RESET
+}
+
+using CandidateRegList =
+ std::array<std::pair<MCRegister, bool>, NumberOfInterferences>;
+using FeaturesListNormalizer = std::array<float, FeatureIDs::FeatureCount>;
+
+/// The ML evictor (commonalities between release and development mode)
+class MLEvictAdvisor : public RegAllocEvictionAdvisor {
+public:
+ MLEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA,
+ MLModelRunner *Runner, const MachineBlockFrequencyInfo &MBFI,
+ const MachineLoopInfo &Loops);
+
+protected:
+ const RegAllocEvictionAdvisor &getDefaultAdvisor() const {
+ return static_cast<const RegAllocEvictionAdvisor &>(DefaultAdvisor);
+ }
+
+  // The assumption is that if the Runner could not be constructed, we emitted
+  // an error, and we shouldn't be asking for it here.
+ const MLModelRunner &getRunner() const { return *Runner; }
+
+  /// This just calls Evaluate on the Runner. In development mode, if we're
+  /// only capturing the log of the default advisor, it needs to call the
+  /// latter instead, which is why all the necessary parameters are passed
+  /// through. In development mode it also logs.
+ virtual int64_t tryFindEvictionCandidatePosition(
+ LiveInterval &VirtReg, const AllocationOrder &Order, unsigned OrderLimit,
+ uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const;
+
+ /// Load the features of the given VirtReg (allocated or not) at column Pos,
+ /// but if that can't be evicted, return false instead.
+ bool
+ loadInterferenceFeatures(LiveInterval &VirtReg, MCRegister PhysReg,
+ bool IsHint, const SmallVirtRegSet &FixedRegisters,
+ std::array<float, FeatureIDs::FeatureCount> &Largest,
+ size_t Pos) const;
+
+private:
+ static float getInitialQueueSize(const MachineFunction &MF);
+
+ MCRegister tryFindEvictionCandidate(
+ LiveInterval &VirtReg, const AllocationOrder &Order,
+ uint8_t CostPerUseLimit,
+ const SmallVirtRegSet &FixedRegisters) const override;
+
+ void extractFeatures(const SmallVectorImpl<LiveInterval *> &Intervals,
+ std::array<float, FeatureIDs::FeatureCount> &Largest,
+ size_t Pos, int64_t IsHint, int64_t LocalIntfsCount,
+ float NrUrgent) const;
+
+  // At this point we haven't learned this, so we always delegate to the default.
+ bool canEvictHintInterference(
+ LiveInterval &VirtReg, MCRegister PhysReg,
+ const SmallVirtRegSet &FixedRegisters) const override {
+ return getDefaultAdvisor().canEvictHintInterference(VirtReg, PhysReg,
+ FixedRegisters);
+ }
+
+ // Hold on to a default advisor for:
+ // 1) the implementation of canEvictHintInterference, because we didn't learn
+ // that nuance yet;
+  // 2) bootstrapping (logging) in the development mode case.
+ const DefaultEvictionAdvisor DefaultAdvisor;
+ MLModelRunner *const Runner;
+ const MachineBlockFrequencyInfo &MBFI;
+ const MachineLoopInfo &Loops;
+
+ // Indices of those features we don't want to normalize.
+ // This could be static and shared, but its initialization is non-trivial.
+ std::bitset<FeatureIDs::FeatureCount> DoNotNormalize;
+ const float InitialQSize;
+};
+
+// ===================================
+// Release (AOT) - specifics
+// ===================================
+#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL)
+const std::array<std::string, FeatureIDs::FeatureCount> FeatureNames{
+#define _GETNAME(_, NAME, __, ___) #NAME,
+ RA_EVICT_FEATURES_LIST(_GETNAME)
+#undef _GETNAME
+};
+class ReleaseModeEvictionAdvisorAnalysis final
+ : public RegAllocEvictionAdvisorAnalysis {
+public:
+ ReleaseModeEvictionAdvisorAnalysis()
+ : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Release) {}
+ // support for isa<> and dyn_cast.
+ static bool classof(const RegAllocEvictionAdvisorAnalysis *R) {
+ return R->getAdvisorMode() == AdvisorMode::Release;
+ }
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineLoopInfo>();
+ RegAllocEvictionAdvisorAnalysis::getAnalysisUsage(AU);
+ }
+
+ std::unique_ptr<RegAllocEvictionAdvisor>
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override {
+ if (!Runner)
+ Runner = std::make_unique<ReleaseModeModelRunner<RegallocEvictModel>>(
+ MF.getFunction().getContext(), FeatureNames, DecisionName);
+ return std::make_unique<MLEvictAdvisor>(
+ MF, RA, Runner.get(), getAnalysis<MachineBlockFrequencyInfo>(),
+ getAnalysis<MachineLoopInfo>());
+ }
+ std::unique_ptr<ReleaseModeModelRunner<RegallocEvictModel>> Runner;
+};
+#endif
+
+// ===================================
+// Development mode-specifics
+// ===================================
+//
+// Features we log
+#ifdef LLVM_HAVE_TF_API
+#define _DECL_FEATURES(type, name, shape, _) \
+ TensorSpec::createSpec<type>(#name, shape),
+
+static const std::vector<TensorSpec> InputFeatures{
+ {RA_EVICT_FEATURES_LIST(_DECL_FEATURES)},
+};
+#undef _DECL_FEATURES
+static const TensorSpec Output =
+ TensorSpec::createSpec<int64_t>(DecisionName, {1});
+static const TensorSpec Reward = TensorSpec::createSpec<float>("reward", {1});
+
+// Features we bind to the model. The tensor names have a prefix, and we also
+// need to include some tensors that the training algorithm expects to be
+// present.
+// TODO: can we just get rid of these?
+#define _DECL_TRAIN_FEATURES(type, name, shape, _) \
+ TensorSpec::createSpec<type>(std::string("action_") + #name, shape),
+
+static const std::vector<TensorSpec> TrainingInputFeatures{
+ {RA_EVICT_FEATURES_LIST(_DECL_TRAIN_FEATURES)
+ TensorSpec::createSpec<float>("action_discount", {1}),
+ TensorSpec::createSpec<int32_t>("action_step_type", {1}),
+ TensorSpec::createSpec<float>("action_reward", {1})}};
+#undef _DECL_TRAIN_FEATURES
+
+class DevelopmentModeEvictAdvisor : public MLEvictAdvisor {
+public:
+ DevelopmentModeEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA,
+ MLModelRunner *Runner,
+ const MachineBlockFrequencyInfo &MBFI,
+ const MachineLoopInfo &Loops, Logger *Log)
+ : MLEvictAdvisor(MF, RA, Runner, MBFI, Loops), Log(Log) {}
+
+private:
+ int64_t tryFindEvictionCandidatePosition(
+ LiveInterval &VirtReg, const AllocationOrder &Order, unsigned OrderLimit,
+ uint8_t CostPerUseLimit,
+ const SmallVirtRegSet &FixedRegisters) const override;
+
+ Logger *const Log;
+};
+
+class DevelopmentModeEvictionAdvisorAnalysis final
+ : public RegAllocEvictionAdvisorAnalysis {
+public:
+ DevelopmentModeEvictionAdvisorAnalysis()
+ : RegAllocEvictionAdvisorAnalysis(AdvisorMode::Development) {}
+ // support for isa<> and dyn_cast.
+ static bool classof(const RegAllocEvictionAdvisorAnalysis *R) {
+ return R->getAdvisorMode() == AdvisorMode::Development;
+ }
+
+  /// Get the logger for the given function, or nullptr if we didn't collect
+  /// one. This is used by the RegAllocScoring pass to inject the score.
+ Logger *getLogger(const MachineFunction &MF) const {
+ auto I = LogMap.find(MF.getName());
+ if (I == LogMap.end())
+ return nullptr;
+ return I->second.get();
+ }
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineLoopInfo>();
+ RegAllocEvictionAdvisorAnalysis::getAnalysisUsage(AU);
+ }
+
+ // Save all the logs (when requested).
+ bool doFinalization(Module &M) override {
+ if (TrainingLog.empty())
+ return false;
+ std::error_code EC;
+ auto OS = std::make_unique<raw_fd_ostream>(TrainingLog, EC);
+ if (EC) {
+ M.getContext().emitError(EC.message() + ":" + TrainingLog);
+ return false;
+ }
+ Logger::flushLogs(*OS, LogMap);
+ return false;
+ }
+
+ std::unique_ptr<RegAllocEvictionAdvisor>
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override {
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ if (ModelUnderTraining.empty() && TrainingLog.empty()) {
+ Ctx.emitError("Regalloc development mode should be requested with at "
+ "least logging enabled and/or a training model");
+ return nullptr;
+ }
+ if (!Runner) {
+ if (ModelUnderTraining.empty())
+ Runner = std::make_unique<NoInferenceModelRunner>(Ctx, InputFeatures);
+ else
+ Runner = ModelUnderTrainingRunner::createAndEnsureValid(
+ Ctx, ModelUnderTraining, DecisionName, TrainingInputFeatures);
+ if (!Runner) {
+ Ctx.emitError("Regalloc: could not set up the model runner");
+ return nullptr;
+ }
+ }
+
+ Logger *Log = nullptr;
+ if (!TrainingLog.empty()) {
+ std::vector<LoggedFeatureSpec> LFS;
+ for (const auto &FS : InputFeatures)
+ LFS.push_back({FS, None});
+ if (auto *MUTR = dyn_cast<ModelUnderTrainingRunner>(Runner.get()))
+ if (MUTR->outputLoggedFeatureSpecs().size() > 1)
+ append_range(LFS, drop_begin(MUTR->outputLoggedFeatureSpecs()));
+ // We always log the output; in particular, if we're not evaluating, we
+ // don't have an output spec json file. That's why we handle the
+ // 'normal' output separately.
+ LFS.push_back({Output, None});
+ auto I = LogMap.insert(std::make_pair(
+ MF.getFunction().getName(),
+ std::make_unique<Logger>(LFS, Reward, /*IncludeReward*/ true)));
+ assert(I.second);
+ Log = I.first->second.get();
+ }
+ return std::make_unique<DevelopmentModeEvictAdvisor>(
+ MF, RA, Runner.get(), getAnalysis<MachineBlockFrequencyInfo>(),
+ getAnalysis<MachineLoopInfo>(), Log);
+ }
+
+ std::unique_ptr<MLModelRunner> Runner;
+ StringMap<std::unique_ptr<Logger>> LogMap;
+};
+#endif // #ifdef LLVM_HAVE_TF_API
+} // namespace
+
+float MLEvictAdvisor::getInitialQueueSize(const MachineFunction &MF) {
+ auto &MRI = MF.getRegInfo();
+ float Ret = 0.0;
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ Register Reg = Register::index2VirtReg(I);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+ ++Ret;
+ }
+ return Ret;
+}
+
+MLEvictAdvisor::MLEvictAdvisor(const MachineFunction &MF, const RAGreedy &RA,
+ MLModelRunner *Runner,
+ const MachineBlockFrequencyInfo &MBFI,
+ const MachineLoopInfo &Loops)
+ : RegAllocEvictionAdvisor(MF, RA), DefaultAdvisor(MF, RA),
+ Runner(std::move(Runner)), MBFI(MBFI), Loops(Loops),
+ InitialQSize(MLEvictAdvisor::getInitialQueueSize(MF)) {
+ assert(this->Runner);
+ DoNotNormalize.set(FeatureIDs::mask);
+ DoNotNormalize.set(FeatureIDs::is_free);
+ DoNotNormalize.set(FeatureIDs::is_hint);
+ DoNotNormalize.set(FeatureIDs::is_local);
+ DoNotNormalize.set(FeatureIDs::min_stage);
+ DoNotNormalize.set(FeatureIDs::max_stage);
+ DoNotNormalize.set(FeatureIDs::progress);
+}
+
+int64_t MLEvictAdvisor::tryFindEvictionCandidatePosition(
+ LiveInterval &, const AllocationOrder &, unsigned, uint8_t,
+ const SmallVirtRegSet &) const {
+ int64_t Ret = Runner->evaluate<int64_t>();
+ assert(Ret >= 0);
+ assert(Ret <= CandidateVirtRegPos);
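+  // E.g., with MaxInterferences == 32, Ret is in [0, 32]; the caller treats
+  // CandidateVirtRegPos (32) as "do not evict anything".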
+ return Ret;
+}
+
+bool MLEvictAdvisor::loadInterferenceFeatures(
+ LiveInterval &VirtReg, MCRegister PhysReg, bool IsHint,
+ const SmallVirtRegSet &FixedRegisters, FeaturesListNormalizer &Largest,
+ size_t Pos) const {
+ // It is only possible to evict virtual register interference.
+ if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg) {
+ // leave unavailable
+ return false;
+ }
+
+ const bool IsLocal = LIS->intervalIsInOneMBB(VirtReg);
+ int64_t LocalIntfs = 0;
+ float NrUrgent = 0.0f;
+
+ // The cascade tracking is the same as in the default advisor
+ unsigned Cascade = RA.getExtraInfo().getCascadeOrCurrentNext(VirtReg.reg());
+
+ SmallVector<LiveInterval *, MaxInterferences> InterferingIntervals;
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
+    // Unlike the default heuristic, we don't make any assumptions about what
+    // having more than 10 results in the query may mean.
+ const auto &IFIntervals = Q.interferingVRegs();
+ if (IFIntervals.empty() && InterferingIntervals.empty())
+ continue;
+ InterferingIntervals.append(IFIntervals.begin(), IFIntervals.end());
+ for (LiveInterval *Intf : reverse(IFIntervals)) {
+ assert(Register::isVirtualRegister(Intf->reg()) &&
+ "Only expecting virtual register interference from query");
+ // This is the same set of legality checks as in the default case: don't
+ // try to evict fixed regs or 'done' ones. Also don't break cascades,
+ // except in the urgent case, with the same nuances used in the default
+ // heuristic.
+ // We could try sharing this between the advisors, but it may end up
+ // more complex than it is right now.
+ if (FixedRegisters.count(Intf->reg()))
+ return false;
+ if (RA.getExtraInfo().getStage(*Intf) == RS_Done)
+ return false;
+ bool Urgent =
+ !VirtReg.isSpillable() &&
+ (Intf->isSpillable() ||
+ RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(VirtReg.reg())) <
+ RegClassInfo.getNumAllocatableRegs(
+ MRI->getRegClass(Intf->reg())));
+ // Only evict older cascades or live ranges without a cascade.
+ unsigned IntfCascade = RA.getExtraInfo().getCascade(Intf->reg());
+ if (Cascade <= IntfCascade) {
+ if (!Urgent)
+ return false;
+ ++NrUrgent;
+ }
+
+ LocalIntfs += (IsLocal && LIS->intervalIsInOneMBB(*Intf) &&
+ (!EnableLocalReassign || !canReassign(*Intf, PhysReg)));
+ }
+ }
+ // OK, so if we made it this far, this LR is an eviction candidate, load its
+ // features.
+ extractFeatures(InterferingIntervals, Largest, Pos, IsHint, LocalIntfs,
+ NrUrgent);
+ return true;
+}
+
+MCRegister MLEvictAdvisor::tryFindEvictionCandidate(
+ LiveInterval &VirtReg, const AllocationOrder &Order,
+ uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const {
+ auto MaybeOrderLimit = getOrderLimit(VirtReg, Order, CostPerUseLimit);
+ if (!MaybeOrderLimit)
+ return MCRegister::NoRegister;
+ unsigned OrderLimit = *MaybeOrderLimit;
+
+  // The heuristic sets initial costs such that, if CostPerUseLimit is
+  // max<uint8_t>, any of the costs of the legally-evictable intervals would
+  // be lower. When that happens, one of those will be selected. Therefore,
+  // we allow the candidate to be selected, unless the candidate is
+  // unspillable, in which case it would be incorrect not to find a register
+  // for it.
+ const bool MustFindEviction =
+ (!VirtReg.isSpillable() && CostPerUseLimit == static_cast<uint8_t>(~0u));
+ // Number of available candidates - if 0, no need to continue.
+ size_t Available = 0;
+ // Make sure we don't have leftover partial state from an attempt where we had
+ // no available candidates and bailed out early.
+ resetInputs(*Runner);
+
+ // Track the index->register mapping because AllocationOrder doesn't do that
+ // and we'd have to scan it.
+  // Also track their mask, for asserts and debugging.
+ CandidateRegList Regs;
+ Regs.fill({0, false});
+
+ // Track the largest value of features seen during this eviction session. We
+ // only normalize (some of) the float features, but it's just simpler to
+ // dimension 'Largest' to all the features, especially since we have the
+ // 'DoNotNormalize' list.
+ FeaturesListNormalizer Largest;
+ Largest.fill(0.0);
+
+  // Same overall idea as in the default eviction policy: we visit the values
+  // of AllocationOrder one at a time. If it's not legally available, we mask
+  // off the corresponding feature column (== do nothing, because we already
+  // reset all the features to 0).
+ // Use Pos to capture the column we load features at - in AllocationOrder
+ // order.
+ size_t Pos = 0;
+ for (auto I = Order.begin(), E = Order.getOrderLimitEnd(OrderLimit); I != E;
+ ++I, ++Pos) {
+ MCRegister PhysReg = *I;
+ Regs[Pos] = std::make_pair(PhysReg, true);
+ assert(PhysReg);
+ if (!canAllocatePhysReg(CostPerUseLimit, PhysReg)) {
+ Regs[Pos].second = false;
+ continue;
+ }
+ if (loadInterferenceFeatures(VirtReg, PhysReg, I.isHint(), FixedRegisters,
+ Largest, Pos)) {
+ ++Available;
+ Regs[Pos].second = true;
+ }
+ }
+ if (Available == 0) {
+ // Nothing to decide, nothing to learn.
+ assert(!MustFindEviction);
+ return MCRegister::NoRegister;
+ }
+ // If we must find eviction, the candidate should be masked out of the
+ // decision making process.
+ Regs[CandidateVirtRegPos].second = !MustFindEviction;
+ if (!MustFindEviction)
+ extractFeatures(SmallVector<LiveInterval *, 1>(1, &VirtReg), Largest,
+ CandidateVirtRegPos, /*IsHint*/ 0, /*LocalIntfsCount*/ 0,
+ /*NrUrgent*/ 0.0);
+ assert(InitialQSize > 0.0 && "We couldn't have gotten here if we had "
+ "nothing to allocate initially.");
+ // Normalize the features.
+ for (auto &V : Largest)
+ V = V ? V : 1.0;
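+  // E.g., if the largest weighed_reads_by_max observed across the columns was
+  // 4.0, every column of that feature is divided by 4.0 below; features whose
+  // observed maximum was 0 keep a divisor of 1 and are left unchanged.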
+ for (size_t FeatureIndex = 0; FeatureIndex < FeatureIDs::FeatureCount;
+ ++FeatureIndex) {
+ if (DoNotNormalize.test(FeatureIndex))
+ continue;
+ for (size_t Pos = 0; Pos < NumberOfInterferences; ++Pos) {
+ Runner->getTensor<float>(FeatureIndex)[Pos] /= Largest[FeatureIndex];
+ }
+ }
+ *Runner->getTensor<float>(FeatureIDs::progress) =
+ static_cast<float>(RA.getQueueSize()) / InitialQSize;
+
+ // Get a decision.
+ size_t CandidatePos = tryFindEvictionCandidatePosition(
+ VirtReg, Order, OrderLimit, CostPerUseLimit, FixedRegisters);
+ // The contract with the ML side is that CandidatePos is mask == 1 (i.e.
+ // Regs[CandidatePos].second)
+ assert(Regs[CandidatePos].second);
+ if (CandidatePos == CandidateVirtRegPos) {
+ assert(!MustFindEviction);
+ return MCRegister::NoRegister;
+ }
+ return Regs[CandidatePos].first;
+}
+
+// Overall, this currently mimics what we do for weight calculation, but instead
+// of accumulating the various features, we keep them separate.
+void MLEvictAdvisor::extractFeatures(
+ const SmallVectorImpl<LiveInterval *> &Intervals,
+ std::array<float, FeatureIDs::FeatureCount> &Largest, size_t Pos,
+ int64_t IsHint, int64_t LocalIntfsCount, float NrUrgent) const {
+ int64_t NrDefsAndUses = 0;
+ int64_t NrBrokenHints = 0;
+ float R = 0;
+ float W = 0;
+ float RW = 0;
+ float IndVarUpdates = 0;
+ float HintWeights = 0.0;
+ float StartBBFreq = 0.0;
+ float EndBBFreq = 0.0;
+ float HottestBlockFreq = 0.0;
+ int32_t NrRematerializable = 0;
+ float TotalWeight = 0.0;
+
+ SlotIndex EndSI = LIS->getSlotIndexes()->getZeroIndex();
+ SlotIndex StartSI = LIS->getSlotIndexes()->getLastIndex();
+ int64_t MaxStage = 0;
+ int64_t MinStage =
+ Intervals.empty() ? 0 : std::numeric_limits<int64_t>::max();
+
+ for (const auto *L : Intervals) {
+ const LiveInterval &LI = *L;
+ MaxStage = std::max<int64_t>(
+ MaxStage, static_cast<int64_t>(RA.getExtraInfo().getStage(LI)));
+ MinStage = std::min<int64_t>(
+ MinStage, static_cast<int64_t>(RA.getExtraInfo().getStage(LI)));
+
+ TotalWeight = std::max(TotalWeight, LI.weight());
+
+ if (LI.beginIndex() < StartSI)
+ StartSI = LI.beginIndex();
+
+ if (LI.endIndex() > EndSI)
+ EndSI = LI.endIndex();
+
+ SmallPtrSet<MachineInstr *, 8> Visited;
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ NrBrokenHints += VRM->hasPreferredPhys(LI.reg());
+
+ for (MachineRegisterInfo::reg_instr_nodbg_iterator
+ I = MRI->reg_instr_nodbg_begin(LI.reg()),
+ E = MRI->reg_instr_nodbg_end();
+ I != E;) {
+ MachineInstr *MI = &*(I++);
+
+ ++NrDefsAndUses;
+ if (!Visited.insert(MI).second)
+ continue;
+
+ if (MI->isIdentityCopy() || MI->isImplicitDef())
+ continue;
+
+ bool Reads, Writes;
+ std::tie(Reads, Writes) = MI->readsWritesVirtualRegister(LI.reg());
+
+ float Freq = MBFI.getBlockFreqRelativeToEntryBlock(MI->getParent());
+ if (Freq > HottestBlockFreq)
+ HottestBlockFreq = Freq;
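+      // Reads/Writes are booleans, so each visited instruction adds its block
+      // frequency to exactly one of R (read-only), W (write-only), or RW
+      // (read-and-write).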
+ R += (Reads && !Writes) * Freq;
+ W += (!Reads && Writes) * Freq;
+ RW += (Reads && Writes) * Freq;
+
+ auto *MBB = MI->getParent();
+ auto *Loop = Loops.getLoopFor(MBB);
+ bool IsExiting = Loop ? Loop->isLoopExiting(MBB) : false;
+
+ if (Writes && IsExiting && LIS->isLiveOutOfMBB(LI, MBB))
+ IndVarUpdates += Freq;
+
+ if (MI->isCopy() && VirtRegAuxInfo::copyHint(MI, LI.reg(), TRI, *MRI))
+ HintWeights += Freq;
+ }
+ NrRematerializable += VirtRegAuxInfo::isRematerializable(
+ LI, *LIS, *VRM, *MF.getSubtarget().getInstrInfo());
+ }
+ size_t Size = 0;
+ if (!Intervals.empty()) {
+ StartBBFreq =
+ MBFI.getBlockFreqRelativeToEntryBlock(LIS->getMBBFromIndex(StartSI));
+ if (EndSI >= LIS->getSlotIndexes()->getLastIndex())
+ EndSI = LIS->getSlotIndexes()->getLastIndex().getPrevIndex();
+ EndBBFreq =
+ MBFI.getBlockFreqRelativeToEntryBlock(LIS->getMBBFromIndex(EndSI));
+ Size = StartSI.distance(EndSI);
+ }
+ // Set the features at the column 'Pos'.
+#define SET(ID, TYPE, VAL) \
+ do { \
+ Runner->getTensor<TYPE>(FeatureIDs::ID)[Pos] = static_cast<TYPE>(VAL); \
+ if (!DoNotNormalize.test(FeatureIDs::ID)) \
+ Largest[FeatureIDs::ID] = \
+ std::max(Largest[FeatureIDs::ID], static_cast<float>(VAL)); \
+ } while (false)
+ SET(mask, int64_t, 1);
+ SET(is_free, int64_t, Intervals.empty());
+ SET(nr_urgent, float, NrUrgent);
+ SET(nr_broken_hints, float, NrBrokenHints);
+ SET(is_hint, int64_t, IsHint);
+ SET(is_local, int64_t, LocalIntfsCount);
+ SET(nr_rematerializable, float, NrRematerializable);
+ SET(nr_defs_and_uses, float, NrDefsAndUses);
+ SET(weighed_reads_by_max, float, R);
+ SET(weighed_writes_by_max, float, W);
+ SET(weighed_read_writes_by_max, float, RW);
+ SET(weighed_indvars_by_max, float, IndVarUpdates);
+ SET(hint_weights_by_max, float, HintWeights);
+ SET(start_bb_freq_by_max, float, StartBBFreq);
+ SET(end_bb_freq_by_max, float, EndBBFreq);
+ SET(hottest_bb_freq_by_max, float, HottestBlockFreq);
+ SET(liverange_size, float, Size);
+ SET(use_def_density, float, TotalWeight);
+ SET(max_stage, int64_t, MaxStage);
+ SET(min_stage, int64_t, MinStage);
+#undef SET
+}
+
+// Development mode-specific implementations
+#ifdef LLVM_HAVE_TF_API
+RegAllocEvictionAdvisorAnalysis *llvm::createDevelopmentModeAdvisor() {
+ return new DevelopmentModeEvictionAdvisorAnalysis();
+}
+
+int64_t DevelopmentModeEvictAdvisor::tryFindEvictionCandidatePosition(
+ LiveInterval &VirtReg, const AllocationOrder &Order, unsigned OrderLimit,
+ uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const {
+ int64_t Ret = 0;
+ if (isa<ModelUnderTrainingRunner>(getRunner())) {
+ Ret = MLEvictAdvisor::tryFindEvictionCandidatePosition(
+ VirtReg, Order, OrderLimit, CostPerUseLimit, FixedRegisters);
+ } else {
+ MCRegister PhysReg = getDefaultAdvisor().tryFindEvictionCandidate(
+ VirtReg, Order, CostPerUseLimit, FixedRegisters);
+    // Find the index of the selected PhysReg. We need it for logging;
+    // otherwise this is wasted cycles (but so would be starting development
+    // mode without a model or logging).
+ if (!PhysReg)
+ Ret = CandidateVirtRegPos;
+ else
+ for (auto I = Order.begin(), E = Order.getOrderLimitEnd(OrderLimit);
+ I != E; ++I, ++Ret)
+ if (*I == PhysReg)
+ break;
+ }
+ if (TrainingLog.empty())
+ return Ret;
+ size_t CurrentFeature = 0;
+ for (; CurrentFeature < FeatureIDs::FeatureCount; ++CurrentFeature) {
+ Log->logSpecifiedTensorValue(
+ CurrentFeature, reinterpret_cast<const char *>(
+ getRunner().getTensorUntyped(CurrentFeature)));
+ }
+ if (auto *MUTR = dyn_cast<ModelUnderTrainingRunner>(&getRunner()))
+ for (size_t I = 1; I < MUTR->outputLoggedFeatureSpecs().size();
+ ++I, ++CurrentFeature)
+ Log->logSpecifiedTensorValue(
+ CurrentFeature,
+ reinterpret_cast<const char *>(
+ MUTR->lastEvaluationResult()->getUntypedTensorValue(I)));
+ // The output is right after the features and the extra outputs
+ Log->logInt64Value(CurrentFeature, &Ret);
+ return Ret;
+}
+
+bool RegAllocScoring::runOnMachineFunction(MachineFunction &MF) {
+ if (auto *DevModeAnalysis = dyn_cast<DevelopmentModeEvictionAdvisorAnalysis>(
+ &getAnalysis<RegAllocEvictionAdvisorAnalysis>()))
+ if (auto *Log = DevModeAnalysis->getLogger(MF))
+ Log->logFloatFinalReward(static_cast<float>(
+ calculateRegAllocScore(
+ MF, getAnalysis<MachineBlockFrequencyInfo>(),
+ getAnalysis<AAResultsWrapperPass>().getAAResults())
+ .getScore()));
+
+ return false;
+}
+#endif // #ifdef LLVM_HAVE_TF_API
+
+#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL)
+RegAllocEvictionAdvisorAnalysis *llvm::createReleaseModeAdvisor() {
+ return new ReleaseModeEvictionAdvisorAnalysis();
+}
+#endif
+
+// In all cases except development mode, we don't need scoring.
+#if !defined(LLVM_HAVE_TF_API)
+bool RegAllocScoring::runOnMachineFunction(MachineFunction &) { return false; }
+#endif
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 692587cd58fa..c93ffaabf74c 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -96,6 +96,12 @@ static cl::opt<unsigned> AlignAllNonFallThruBlocks(
"format (e.g 4 means align on 16B boundaries)."),
cl::init(0), cl::Hidden);
+static cl::opt<unsigned> MaxBytesForAlignmentOverride(
+ "max-bytes-for-alignment",
+ cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
+ "alignment"),
+ cl::init(0), cl::Hidden);
+
// FIXME: Find a good default for this flag and remove the flag.
static cl::opt<unsigned> ExitBlockBias(
"block-placement-exit-block-bias",
@@ -2929,10 +2935,21 @@ void MachineBlockPlacement::alignBlocks() {
MachineBasicBlock *LayoutPred =
&*std::prev(MachineFunction::iterator(ChainBB));
+ auto DetermineMaxAlignmentPadding = [&]() {
+ // Set the maximum bytes allowed to be emitted for alignment.
+ unsigned MaxBytes;
+ if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
+ MaxBytes = MaxBytesForAlignmentOverride;
+ else
+ MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
+ ChainBB->setMaxBytesForAlignment(MaxBytes);
+ };
+
// Force alignment if all the predecessors are jumps. We already checked
// that the block isn't cold above.
if (!LayoutPred->isSuccessor(ChainBB)) {
ChainBB->setAlignment(Align);
+ DetermineMaxAlignmentPadding();
continue;
}
@@ -2943,8 +2960,10 @@ void MachineBlockPlacement::alignBlocks() {
BranchProbability LayoutProb =
MBPI->getEdgeProbability(LayoutPred, ChainBB);
BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
- if (LayoutEdgeFreq <= (Freq * ColdProb))
+ if (LayoutEdgeFreq <= (Freq * ColdProb)) {
ChainBB->setAlignment(Align);
+ DetermineMaxAlignmentPadding();
+ }
}
}
@@ -3418,17 +3437,30 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
ComputedEdges.clear();
ChainAllocator.DestroyAll();
+ bool HasMaxBytesOverride =
+ MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
+
if (AlignAllBlock)
// Align all of the blocks in the function to a specific alignment.
- for (MachineBasicBlock &MBB : MF)
- MBB.setAlignment(Align(1ULL << AlignAllBlock));
+ for (MachineBasicBlock &MBB : MF) {
+ if (HasMaxBytesOverride)
+ MBB.setAlignment(Align(1ULL << AlignAllBlock),
+ MaxBytesForAlignmentOverride);
+ else
+ MBB.setAlignment(Align(1ULL << AlignAllBlock));
+ }
else if (AlignAllNonFallThruBlocks) {
// Align all of the blocks that have no fall-through predecessors to a
// specific alignment.
for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
auto LayoutPred = std::prev(MBI);
- if (!LayoutPred->isSuccessor(&*MBI))
- MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
+ if (!LayoutPred->isSuccessor(&*MBI)) {
+ if (HasMaxBytesOverride)
+ MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
+ MaxBytesForAlignmentOverride);
+ else
+ MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
+ }
}
}
if (ViewBlockLayoutWithBFI != GVDT_None &&
diff --git a/llvm/lib/CodeGen/MachineDominanceFrontier.cpp b/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
index 6ddb1758719b..a39dc79baaa8 100644
--- a/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
+++ b/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
@@ -29,9 +29,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(MachineDominanceFrontier, "machine-domfrontier",
"Machine Dominance Frontier Construction", true, true)
-MachineDominanceFrontier::MachineDominanceFrontier()
- : MachineFunctionPass(ID),
- Base() {
+MachineDominanceFrontier::MachineDominanceFrontier() : MachineFunctionPass(ID) {
initializeMachineDominanceFrontierPass(*PassRegistry::getPassRegistry());
}
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 81ed3d0e93ff..fd5ea5cad072 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -76,6 +76,8 @@
#include <utility>
#include <vector>
+#include "LiveDebugValues/LiveDebugValues.h"
+
using namespace llvm;
#define DEBUG_TYPE "codegen"
@@ -1238,7 +1240,7 @@ bool MachineFunction::useDebugInstrRef() const {
if (F.hasFnAttribute(Attribute::OptimizeNone))
return false;
- if (getTarget().Options.ValueTrackingVariableLocations)
+ if (llvm::debuginfoShouldUseDebugInstrRef(getTarget().getTargetTriple()))
return true;
return false;
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index 6ca97031b92a..759cff179790 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -144,6 +144,10 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
SmallSet<Register, 8> UndefUseSet;
SmallVector<MachineOperand*, 4> Defs;
for (auto MII = FirstMI; MII != LastMI; ++MII) {
+ // Debug instructions have no effects to track.
+ if (MII->isDebugInstr())
+ continue;
+
for (unsigned i = 0, e = MII->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MII->getOperand(i);
if (!MO.isReg())
diff --git a/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
index 59fc23983d3d..5347a7b0d890 100644
--- a/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
+++ b/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
@@ -22,8 +22,7 @@
using namespace llvm;
DiagnosticInfoMIROptimization::MachineArgument::MachineArgument(
- StringRef MKey, const MachineInstr &MI)
- : Argument() {
+ StringRef MKey, const MachineInstr &MI) {
Key = std::string(MKey);
raw_string_ostream OS(Val);
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 54c478645dcf..0dbbc218e946 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -796,9 +796,14 @@ bool MachineSinking::isProfitableToSinkTo(Register Reg, MachineInstr &MI,
if (Reg == 0)
continue;
- // Don't handle physical register.
- if (Register::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg)) {
+ if (MO.isUse() &&
+ (MRI->isConstantPhysReg(Reg) || TII->isIgnorableUse(MO)))
+ continue;
+
+      // Don't handle non-constant and non-ignorable physical registers.
return false;
+ }
// Users for the defs are all dominated by SuccToSinkTo.
if (MO.isDef()) {
@@ -898,7 +903,7 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
// If the physreg has no defs anywhere, it's just an ambient register
// and we can freely move its uses. Alternatively, if it's allocatable,
// it could get allocated to something with a def during allocation.
- if (!MRI->isConstantPhysReg(Reg))
+ if (!MRI->isConstantPhysReg(Reg) && !TII->isIgnorableUse(MO))
return nullptr;
} else if (!MO.isDead()) {
// A def that isn't dead. We can't move it.
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index aaa6403cc978..f91a9d2c3a32 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -1704,7 +1704,7 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
// Peel out the prologs.
LS.reset();
for (int I = 0; I < Schedule.getNumStages() - 1; ++I) {
- LS[I] = 1;
+ LS[I] = true;
Prologs.push_back(peelKernel(LPD_Front));
LiveStages[Prologs.back()] = LS;
AvailableStages[Prologs.back()] = LS;
@@ -1752,7 +1752,7 @@ void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
// Move stage one block at a time so that Phi nodes are updated correctly.
for (size_t K = Iteration; K > I; K--)
moveStageBetweenBlocks(Epilogs[K - 1], Epilogs[K], Stage);
- LS[Stage] = 1;
+ LS[Stage] = true;
}
LiveStages[Epilogs[I]] = LS;
AvailableStages[Epilogs[I]] = AS;
diff --git a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
index 9ed3471c0fc9..db5217469fba 100644
--- a/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
+++ b/llvm/lib/CodeGen/NonRelocatableStringpool.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/NonRelocatableStringpool.h"
+#include "llvm/ADT/STLExtras.h"
namespace llvm {
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index e3eb3f825851..74b903f99284 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -97,7 +97,7 @@ static bool lowerObjCCall(Function &F, const char *NewFn,
objcarc::ARCInstKind Kind = objcarc::getAttachedARCFunctionKind(CB);
(void)Kind;
assert((Kind == objcarc::ARCInstKind::RetainRV ||
- Kind == objcarc::ARCInstKind::ClaimRV) &&
+ Kind == objcarc::ARCInstKind::UnsafeClaimRV) &&
"use expected to be the argument of operand bundle "
"\"clang.arc.attachedcall\"");
U.set(FCache.getCallee());
diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp
index 9f1012c95964..87df7bb4a689 100644
--- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp
+++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "RegAllocEvictionAdvisor.h"
+#include "RegAllocGreedy.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
@@ -42,6 +43,9 @@ static cl::opt<bool> EnableLocalReassignment(
cl::init(false));
#define DEBUG_TYPE "regalloc"
+#ifdef LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL
+#define LLVM_HAVE_TF_AOT
+#endif
char RegAllocEvictionAdvisorAnalysis::ID = 0;
INITIALIZE_PASS(RegAllocEvictionAdvisorAnalysis, "regalloc-evict",
@@ -62,12 +66,8 @@ public:
private:
std::unique_ptr<RegAllocEvictionAdvisor>
- getAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix,
- LiveIntervals *LIS, VirtRegMap *VRM,
- const RegisterClassInfo &RegClassInfo,
- ExtraRegInfo *ExtraInfo) override {
- return std::make_unique<DefaultEvictionAdvisor>(MF, Matrix, LIS, VRM,
- RegClassInfo, ExtraInfo);
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) override {
+ return std::make_unique<DefaultEvictionAdvisor>(MF, RA);
}
bool doInitialization(Module &M) override {
if (NotAsRequested)
@@ -86,10 +86,14 @@ template <> Pass *llvm::callDefaultCtor<RegAllocEvictionAdvisorAnalysis>() {
Ret = new DefaultEvictionAdvisorAnalysis(/*NotAsRequested*/ false);
break;
case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Development:
- // TODO(mtrofin): add implementation
+#if defined(LLVM_HAVE_TF_API)
+ Ret = createDevelopmentModeAdvisor();
+#endif
break;
case RegAllocEvictionAdvisorAnalysis::AdvisorMode::Release:
- // TODO(mtrofin): add implementation
+#if defined(LLVM_HAVE_TF_AOT)
+ Ret = createReleaseModeAdvisor();
+#endif
break;
}
if (Ret)
@@ -109,13 +113,12 @@ StringRef RegAllocEvictionAdvisorAnalysis::getPassName() const {
llvm_unreachable("Unknown advisor kind");
}
-RegAllocEvictionAdvisor::RegAllocEvictionAdvisor(
- const MachineFunction &MF, LiveRegMatrix *Matrix, LiveIntervals *LIS,
- VirtRegMap *VRM, const RegisterClassInfo &RegClassInfo,
- ExtraRegInfo *ExtraInfo)
- : MF(MF), Matrix(Matrix), LIS(LIS), VRM(VRM), MRI(&VRM->getRegInfo()),
- TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RegClassInfo),
- RegCosts(TRI->getRegisterCosts(MF)), ExtraInfo(ExtraInfo),
+RegAllocEvictionAdvisor::RegAllocEvictionAdvisor(const MachineFunction &MF,
+ const RAGreedy &RA)
+ : MF(MF), RA(RA), Matrix(RA.getInterferenceMatrix()),
+ LIS(RA.getLiveIntervals()), VRM(RA.getVirtRegMap()),
+ MRI(&VRM->getRegInfo()), TRI(MF.getSubtarget().getRegisterInfo()),
+ RegClassInfo(RA.getRegClassInfo()), RegCosts(TRI->getRegisterCosts(MF)),
EnableLocalReassign(EnableLocalReassignment ||
MF.getSubtarget().enableRALocalReassignment(
MF.getTarget().getOptLevel())) {}
diff --git a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h
index debb75ed5020..33e03aed81a7 100644
--- a/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h
+++ b/llvm/lib/CodeGen/RegAllocEvictionAdvisor.h
@@ -87,87 +87,9 @@ struct EvictionCost {
}
};
-/// Track allocation stage and eviction loop prevention during allocation.
-// TODO(mtrofin): Consider exposing RAGreedy in a header instead, and folding
-// this back into it.
-class ExtraRegInfo final {
- // RegInfo - Keep additional information about each live range.
- struct RegInfo {
- LiveRangeStage Stage = RS_New;
-
- // Cascade - Eviction loop prevention. See
- // canEvictInterferenceBasedOnCost().
- unsigned Cascade = 0;
-
- RegInfo() = default;
- };
-
- IndexedMap<RegInfo, VirtReg2IndexFunctor> Info;
- unsigned NextCascade = 1;
-
-public:
- ExtraRegInfo() = default;
- ExtraRegInfo(const ExtraRegInfo &) = delete;
-
- LiveRangeStage getStage(Register Reg) const { return Info[Reg].Stage; }
-
- LiveRangeStage getStage(const LiveInterval &VirtReg) const {
- return getStage(VirtReg.reg());
- }
-
- void setStage(Register Reg, LiveRangeStage Stage) {
- Info.grow(Reg.id());
- Info[Reg].Stage = Stage;
- }
-
- void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) {
- setStage(VirtReg.reg(), Stage);
- }
-
- /// Return the current stage of the register, if present, otherwise initialize
- /// it and return that.
- LiveRangeStage getOrInitStage(Register Reg) {
- Info.grow(Reg.id());
- return getStage(Reg);
- }
-
- unsigned getCascade(Register Reg) const { return Info[Reg].Cascade; }
-
- void setCascade(Register Reg, unsigned Cascade) {
- Info.grow(Reg.id());
- Info[Reg].Cascade = Cascade;
- }
-
- unsigned getOrAssignNewCascade(Register Reg) {
- unsigned Cascade = getCascade(Reg);
- if (!Cascade) {
- Cascade = NextCascade++;
- setCascade(Reg, Cascade);
- }
- return Cascade;
- }
-
- unsigned getCascadeOrCurrentNext(Register Reg) const {
- unsigned Cascade = getCascade(Reg);
- if (!Cascade)
- Cascade = NextCascade;
- return Cascade;
- }
-
- template <typename Iterator>
- void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) {
- for (; Begin != End; ++Begin) {
- Register Reg = *Begin;
- Info.grow(Reg.id());
- if (Info[Reg].Stage == RS_New)
- Info[Reg].Stage = NewStage;
- }
- }
- void LRE_DidCloneVirtReg(Register New, Register Old);
-};
-
/// Interface to the eviction advisor, which is responsible for making a
/// decision as to which live ranges should be evicted (if any).
+class RAGreedy;
class RegAllocEvictionAdvisor {
public:
RegAllocEvictionAdvisor(const RegAllocEvictionAdvisor &) = delete;
@@ -193,14 +115,23 @@ public:
bool isUnusedCalleeSavedReg(MCRegister PhysReg) const;
protected:
- RegAllocEvictionAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix,
- LiveIntervals *LIS, VirtRegMap *VRM,
- const RegisterClassInfo &RegClassInfo,
- ExtraRegInfo *ExtraInfo);
+ RegAllocEvictionAdvisor(const MachineFunction &MF, const RAGreedy &RA);
Register canReassign(LiveInterval &VirtReg, Register PrevReg) const;
+  // Get the upper limit of elements in the given Order that we need to
+  // analyze.
+  // TODO: this is a heuristic; we could consider learning it.
+ Optional<unsigned> getOrderLimit(const LiveInterval &VirtReg,
+ const AllocationOrder &Order,
+ unsigned CostPerUseLimit) const;
+
+ // Determine if it's worth trying to allocate this reg, given the
+  // CostPerUseLimit.
+ // TODO: this is a heuristic component we could consider learning, too.
+ bool canAllocatePhysReg(unsigned CostPerUseLimit, MCRegister PhysReg) const;
+
const MachineFunction &MF;
+ const RAGreedy &RA;
LiveRegMatrix *const Matrix;
LiveIntervals *const LIS;
VirtRegMap *const VRM;
@@ -208,7 +139,6 @@ protected:
const TargetRegisterInfo *const TRI;
const RegisterClassInfo &RegClassInfo;
const ArrayRef<uint8_t> RegCosts;
- ExtraRegInfo *const ExtraInfo;
/// Run or not the local reassignment heuristic. This information is
/// obtained from the TargetSubtargetInfo.
@@ -243,19 +173,17 @@ public:
/// Get an advisor for the given context (i.e. machine function, etc)
virtual std::unique_ptr<RegAllocEvictionAdvisor>
- getAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix,
- LiveIntervals *LIS, VirtRegMap *VRM,
- const RegisterClassInfo &RegClassInfo,
- ExtraRegInfo *ExtraInfo) = 0;
+ getAdvisor(const MachineFunction &MF, const RAGreedy &RA) = 0;
AdvisorMode getAdvisorMode() const { return Mode; }
-private:
+protected:
// This analysis preserves everything, and subclasses may have additional
// requirements.
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesAll();
}
+private:
StringRef getPassName() const override;
const AdvisorMode Mode;
};
@@ -264,25 +192,16 @@ private:
/// an instance of the eviction advisor.
template <> Pass *callDefaultCtor<RegAllocEvictionAdvisorAnalysis>();
-// TODO(mtrofin): implement these.
-#ifdef LLVM_HAVE_TF_AOT
RegAllocEvictionAdvisorAnalysis *createReleaseModeAdvisor();
-#endif
-#ifdef LLVM_HAVE_TF_API
RegAllocEvictionAdvisorAnalysis *createDevelopmentModeAdvisor();
-#endif
// TODO: move to RegAllocEvictionAdvisor.cpp when we move implementation
// out of RegAllocGreedy.cpp
class DefaultEvictionAdvisor : public RegAllocEvictionAdvisor {
public:
- DefaultEvictionAdvisor(const MachineFunction &MF, LiveRegMatrix *Matrix,
- LiveIntervals *LIS, VirtRegMap *VRM,
- const RegisterClassInfo &RegClassInfo,
- ExtraRegInfo *ExtraInfo)
- : RegAllocEvictionAdvisor(MF, Matrix, LIS, VRM, RegClassInfo, ExtraInfo) {
- }
+ DefaultEvictionAdvisor(const MachineFunction &MF, const RAGreedy &RA)
+ : RegAllocEvictionAdvisor(MF, RA) {}
private:
MCRegister tryFindEvictionCandidate(LiveInterval &, const AllocationOrder &,
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index ce3cf31dbd6b..6ea6dbcbbb74 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "RegAllocGreedy.h"
#include "AllocationOrder.h"
#include "InterferenceCache.h"
#include "LiveDebugVariables.h"
@@ -135,362 +136,6 @@ static cl::opt<bool> ConsiderLocalIntervalCost(
static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
createGreedyRegisterAllocator);
-namespace {
-
-class RAGreedy : public MachineFunctionPass,
- public RegAllocBase,
- private LiveRangeEdit::Delegate {
- // Convenient shortcuts.
- using PQueue = std::priority_queue<std::pair<unsigned, unsigned>>;
- using SmallLISet = SmallPtrSet<LiveInterval *, 4>;
-
- // context
- MachineFunction *MF;
-
- // Shortcuts to some useful interface.
- const TargetInstrInfo *TII;
- const TargetRegisterInfo *TRI;
- RegisterClassInfo RCI;
-
- // analyses
- SlotIndexes *Indexes;
- MachineBlockFrequencyInfo *MBFI;
- MachineDominatorTree *DomTree;
- MachineLoopInfo *Loops;
- MachineOptimizationRemarkEmitter *ORE;
- EdgeBundles *Bundles;
- SpillPlacement *SpillPlacer;
- LiveDebugVariables *DebugVars;
- AliasAnalysis *AA;
-
- // state
- std::unique_ptr<Spiller> SpillerInstance;
- PQueue Queue;
- std::unique_ptr<VirtRegAuxInfo> VRAI;
- Optional<ExtraRegInfo> ExtraInfo;
- std::unique_ptr<RegAllocEvictionAdvisor> EvictAdvisor;
-
- // Enum CutOffStage to keep a track whether the register allocation failed
- // because of the cutoffs encountered in last chance recoloring.
- // Note: This is used as bitmask. New value should be next power of 2.
- enum CutOffStage {
- // No cutoffs encountered
- CO_None = 0,
-
- // lcr-max-depth cutoff encountered
- CO_Depth = 1,
-
- // lcr-max-interf cutoff encountered
- CO_Interf = 2
- };
-
- uint8_t CutOffInfo;
-
-#ifndef NDEBUG
- static const char *const StageName[];
-#endif
-
- /// EvictionTrack - Keeps track of past evictions in order to optimize region
- /// split decision.
- class EvictionTrack {
-
- public:
- using EvictorInfo =
- std::pair<Register /* evictor */, MCRegister /* physreg */>;
- using EvicteeInfo = llvm::DenseMap<Register /* evictee */, EvictorInfo>;
-
- private:
- /// Each Vreg that has been evicted in the last stage of selectOrSplit will
- /// be mapped to the evictor Vreg and the PhysReg it was evicted from.
- EvicteeInfo Evictees;
-
- public:
- /// Clear all eviction information.
- void clear() { Evictees.clear(); }
-
- /// Clear eviction information for the given evictee Vreg.
- /// E.g. when Vreg get's a new allocation, the old eviction info is no
- /// longer relevant.
- /// \param Evictee The evictee Vreg for whom we want to clear collected
- /// eviction info.
- void clearEvicteeInfo(Register Evictee) { Evictees.erase(Evictee); }
-
- /// Track new eviction.
- /// The Evictor vreg has evicted the Evictee vreg from Physreg.
- /// \param PhysReg The physical register Evictee was evicted from.
- /// \param Evictor The evictor Vreg that evicted Evictee.
- /// \param Evictee The evictee Vreg.
- void addEviction(MCRegister PhysReg, Register Evictor, Register Evictee) {
- Evictees[Evictee].first = Evictor;
- Evictees[Evictee].second = PhysReg;
- }
-
- /// Return the Evictor Vreg which evicted Evictee Vreg from PhysReg.
- /// \param Evictee The evictee vreg.
- /// \return The Evictor vreg which evicted Evictee vreg from PhysReg. 0 if
- /// nobody has evicted Evictee from PhysReg.
- EvictorInfo getEvictor(Register Evictee) {
- if (Evictees.count(Evictee)) {
- return Evictees[Evictee];
- }
-
- return EvictorInfo(0, 0);
- }
- };
-
- // Keeps track of past evictions in order to optimize region split decision.
- EvictionTrack LastEvicted;
-
- // splitting state.
- std::unique_ptr<SplitAnalysis> SA;
- std::unique_ptr<SplitEditor> SE;
-
- /// Cached per-block interference maps
- InterferenceCache IntfCache;
-
- /// All basic blocks where the current register has uses.
- SmallVector<SpillPlacement::BlockConstraint, 8> SplitConstraints;
-
- /// Global live range splitting candidate info.
- struct GlobalSplitCandidate {
- // Register intended for assignment, or 0.
- MCRegister PhysReg;
-
- // SplitKit interval index for this candidate.
- unsigned IntvIdx;
-
- // Interference for PhysReg.
- InterferenceCache::Cursor Intf;
-
- // Bundles where this candidate should be live.
- BitVector LiveBundles;
- SmallVector<unsigned, 8> ActiveBlocks;
-
- void reset(InterferenceCache &Cache, MCRegister Reg) {
- PhysReg = Reg;
- IntvIdx = 0;
- Intf.setPhysReg(Cache, Reg);
- LiveBundles.clear();
- ActiveBlocks.clear();
- }
-
- // Set B[I] = C for every live bundle where B[I] was NoCand.
- unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) {
- unsigned Count = 0;
- for (unsigned I : LiveBundles.set_bits())
- if (B[I] == NoCand) {
- B[I] = C;
- Count++;
- }
- return Count;
- }
- };
-
- /// Candidate info for each PhysReg in AllocationOrder.
- /// This vector never shrinks, but grows to the size of the largest register
- /// class.
- SmallVector<GlobalSplitCandidate, 32> GlobalCand;
-
- enum : unsigned { NoCand = ~0u };
-
- /// Candidate map. Each edge bundle is assigned to a GlobalCand entry, or to
- /// NoCand which indicates the stack interval.
- SmallVector<unsigned, 32> BundleCand;
-
- /// Callee-save register cost, calculated once per machine function.
- BlockFrequency CSRCost;
-
- /// Enable or not the consideration of the cost of local intervals created
- /// by a split candidate when choosing the best split candidate.
- bool EnableAdvancedRASplitCost;
-
- /// Set of broken hints that may be reconciled later because of eviction.
- SmallSetVector<LiveInterval *, 8> SetOfBrokenHints;
-
- /// The register cost values. This list will be recreated for each Machine
- /// Function
- ArrayRef<uint8_t> RegCosts;
-
-public:
- RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses);
-
- /// Return the pass name.
- StringRef getPassName() const override { return "Greedy Register Allocator"; }
-
- /// RAGreedy analysis usage.
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- void releaseMemory() override;
- Spiller &spiller() override { return *SpillerInstance; }
- void enqueueImpl(LiveInterval *LI) override;
- LiveInterval *dequeue() override;
- MCRegister selectOrSplit(LiveInterval &,
- SmallVectorImpl<Register> &) override;
- void aboutToRemoveInterval(LiveInterval &) override;
-
- /// Perform register allocation.
- bool runOnMachineFunction(MachineFunction &mf) override;
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoPHIs);
- }
-
- MachineFunctionProperties getClearedProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::IsSSA);
- }
-
- static char ID;
-
-private:
- MCRegister selectOrSplitImpl(LiveInterval &, SmallVectorImpl<Register> &,
- SmallVirtRegSet &, unsigned = 0);
-
- bool LRE_CanEraseVirtReg(Register) override;
- void LRE_WillShrinkVirtReg(Register) override;
- void LRE_DidCloneVirtReg(Register, Register) override;
- void enqueue(PQueue &CurQueue, LiveInterval *LI);
- LiveInterval *dequeue(PQueue &CurQueue);
-
- BlockFrequency calcSpillCost();
- bool addSplitConstraints(InterferenceCache::Cursor, BlockFrequency&);
- bool addThroughConstraints(InterferenceCache::Cursor, ArrayRef<unsigned>);
- bool growRegion(GlobalSplitCandidate &Cand);
- bool splitCanCauseEvictionChain(Register Evictee, GlobalSplitCandidate &Cand,
- unsigned BBNumber,
- const AllocationOrder &Order);
- bool splitCanCauseLocalSpill(unsigned VirtRegToSplit,
- GlobalSplitCandidate &Cand, unsigned BBNumber,
- const AllocationOrder &Order);
- BlockFrequency calcGlobalSplitCost(GlobalSplitCandidate &,
- const AllocationOrder &Order,
- bool *CanCauseEvictionChain);
- bool calcCompactRegion(GlobalSplitCandidate&);
- void splitAroundRegion(LiveRangeEdit&, ArrayRef<unsigned>);
- void calcGapWeights(MCRegister, SmallVectorImpl<float> &);
- bool canEvictInterferenceInRange(const LiveInterval &VirtReg,
- MCRegister PhysReg, SlotIndex Start,
- SlotIndex End, EvictionCost &MaxCost) const;
- MCRegister getCheapestEvicteeWeight(const AllocationOrder &Order,
- const LiveInterval &VirtReg,
- SlotIndex Start, SlotIndex End,
- float *BestEvictWeight) const;
- void evictInterference(LiveInterval &, MCRegister,
- SmallVectorImpl<Register> &);
- bool mayRecolorAllInterferences(MCRegister PhysReg, LiveInterval &VirtReg,
- SmallLISet &RecoloringCandidates,
- const SmallVirtRegSet &FixedRegisters);
-
- MCRegister tryAssign(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<Register>&,
- const SmallVirtRegSet&);
- MCRegister tryFindEvictionCandidate(LiveInterval &, const AllocationOrder &,
- uint8_t, const SmallVirtRegSet &) const;
- MCRegister tryEvict(LiveInterval &, AllocationOrder &,
- SmallVectorImpl<Register> &, uint8_t,
- const SmallVirtRegSet &);
- MCRegister tryRegionSplit(LiveInterval &, AllocationOrder &,
- SmallVectorImpl<Register> &);
- /// Calculate cost of region splitting.
- unsigned calculateRegionSplitCost(LiveInterval &VirtReg,
- AllocationOrder &Order,
- BlockFrequency &BestCost,
- unsigned &NumCands, bool IgnoreCSR,
- bool *CanCauseEvictionChain = nullptr);
- /// Perform region splitting.
- unsigned doRegionSplit(LiveInterval &VirtReg, unsigned BestCand,
- bool HasCompact,
- SmallVectorImpl<Register> &NewVRegs);
- /// Check other options before using a callee-saved register for the first
- /// time.
- MCRegister tryAssignCSRFirstTime(LiveInterval &VirtReg,
- AllocationOrder &Order, MCRegister PhysReg,
- uint8_t &CostPerUseLimit,
- SmallVectorImpl<Register> &NewVRegs);
- void initializeCSRCost();
- unsigned tryBlockSplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<Register>&);
- unsigned tryInstructionSplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<Register>&);
- unsigned tryLocalSplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<Register>&);
- unsigned trySplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<Register>&,
- const SmallVirtRegSet&);
- unsigned tryLastChanceRecoloring(LiveInterval &, AllocationOrder &,
- SmallVectorImpl<Register> &,
- SmallVirtRegSet &, unsigned);
- bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<Register> &,
- SmallVirtRegSet &, unsigned);
- void tryHintRecoloring(LiveInterval &);
- void tryHintsRecoloring();
-
- /// Model the information carried by one end of a copy.
- struct HintInfo {
- /// The frequency of the copy.
- BlockFrequency Freq;
- /// The virtual register or physical register.
- Register Reg;
- /// Its currently assigned register.
- /// In case of a physical register Reg == PhysReg.
- MCRegister PhysReg;
-
- HintInfo(BlockFrequency Freq, Register Reg, MCRegister PhysReg)
- : Freq(Freq), Reg(Reg), PhysReg(PhysReg) {}
- };
- using HintsInfo = SmallVector<HintInfo, 4>;
-
- BlockFrequency getBrokenHintFreq(const HintsInfo &, MCRegister);
- void collectHintInfo(Register, HintsInfo &);
-
- /// Greedy RA statistic to remark.
- struct RAGreedyStats {
- unsigned Reloads = 0;
- unsigned FoldedReloads = 0;
- unsigned ZeroCostFoldedReloads = 0;
- unsigned Spills = 0;
- unsigned FoldedSpills = 0;
- unsigned Copies = 0;
- float ReloadsCost = 0.0f;
- float FoldedReloadsCost = 0.0f;
- float SpillsCost = 0.0f;
- float FoldedSpillsCost = 0.0f;
- float CopiesCost = 0.0f;
-
- bool isEmpty() {
- return !(Reloads || FoldedReloads || Spills || FoldedSpills ||
- ZeroCostFoldedReloads || Copies);
- }
-
- void add(RAGreedyStats other) {
- Reloads += other.Reloads;
- FoldedReloads += other.FoldedReloads;
- ZeroCostFoldedReloads += other.ZeroCostFoldedReloads;
- Spills += other.Spills;
- FoldedSpills += other.FoldedSpills;
- Copies += other.Copies;
- ReloadsCost += other.ReloadsCost;
- FoldedReloadsCost += other.FoldedReloadsCost;
- SpillsCost += other.SpillsCost;
- FoldedSpillsCost += other.FoldedSpillsCost;
- CopiesCost += other.CopiesCost;
- }
-
- void report(MachineOptimizationRemarkMissed &R);
- };
-
- /// Compute statistic for a basic block.
- RAGreedyStats computeStats(MachineBasicBlock &MBB);
-
- /// Compute and report statistic through a remark.
- RAGreedyStats reportStats(MachineLoop *L);
-
- /// Report the statistic for each loop.
- void reportStats();
-};
-
-} // end anonymous namespace
-
char RAGreedy::ID = 0;
char &llvm::RAGreedyID = RAGreedy::ID;
@@ -613,7 +258,7 @@ void RAGreedy::LRE_DidCloneVirtReg(Register New, Register Old) {
ExtraInfo->LRE_DidCloneVirtReg(New, Old);
}
-void ExtraRegInfo::LRE_DidCloneVirtReg(Register New, Register Old) {
+void RAGreedy::ExtraRegInfo::LRE_DidCloneVirtReg(Register New, Register Old) {
// Cloning a register we haven't even heard about yet? Just ignore it.
if (!Info.inBounds(Old))
return;
@@ -811,7 +456,7 @@ Register RegAllocEvictionAdvisor::canReassign(LiveInterval &VirtReg,
bool DefaultEvictionAdvisor::shouldEvict(LiveInterval &A, bool IsHint,
LiveInterval &B,
bool BreaksHint) const {
- bool CanSplit = ExtraInfo->getStage(B) < RS_Spill;
+ bool CanSplit = RA.getExtraInfo().getStage(B) < RS_Spill;
// Be fairly aggressive about following hints as long as the evictee can be
// split.
@@ -852,7 +497,7 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost(
if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg)
return false;
- bool IsLocal = LIS->intervalIsInOneMBB(VirtReg);
+ bool IsLocal = VirtReg.empty() || LIS->intervalIsInOneMBB(VirtReg);
// Find VirtReg's cascade number. This will be unassigned if VirtReg was never
// involved in an eviction before. If a cascade number was assigned, deny
@@ -861,7 +506,7 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost(
//
// This works out so a register without a cascade number is allowed to evict
// anything, and it can be evicted by anything.
- unsigned Cascade = ExtraInfo->getCascadeOrCurrentNext(VirtReg.reg());
+ unsigned Cascade = RA.getExtraInfo().getCascadeOrCurrentNext(VirtReg.reg());
EvictionCost Cost;
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
@@ -883,7 +528,7 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost(
return false;
// Never evict spill products. They cannot split or spill.
- if (ExtraInfo->getStage(*Intf) == RS_Done)
+ if (RA.getExtraInfo().getStage(*Intf) == RS_Done)
return false;
// Once a live range becomes small enough, it is urgent that we find a
// register for it. This is indicated by an infinite spill weight. These
@@ -898,7 +543,7 @@ bool DefaultEvictionAdvisor::canEvictInterferenceBasedOnCost(
RegClassInfo.getNumAllocatableRegs(
MRI->getRegClass(Intf->reg())));
// Only evict older cascades or live ranges without a cascade.
- unsigned IntfCascade = ExtraInfo->getCascade(Intf->reg());
+ unsigned IntfCascade = RA.getExtraInfo().getCascade(Intf->reg());
if (Cascade <= IntfCascade) {
if (!Urgent)
return false;
@@ -1069,28 +714,20 @@ bool RegAllocEvictionAdvisor::isUnusedCalleeSavedReg(MCRegister PhysReg) const {
return !Matrix->isPhysRegUsed(PhysReg);
}
-MCRegister DefaultEvictionAdvisor::tryFindEvictionCandidate(
- LiveInterval &VirtReg, const AllocationOrder &Order,
- uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const {
- // Keep track of the cheapest interference seen so far.
- EvictionCost BestCost;
- BestCost.setMax();
- MCRegister BestPhys;
+Optional<unsigned>
+RegAllocEvictionAdvisor::getOrderLimit(const LiveInterval &VirtReg,
+ const AllocationOrder &Order,
+ unsigned CostPerUseLimit) const {
unsigned OrderLimit = Order.getOrder().size();
- // When we are just looking for a reduced cost per use, don't break any
- // hints, and only evict smaller spill weights.
if (CostPerUseLimit < uint8_t(~0u)) {
- BestCost.BrokenHints = 0;
- BestCost.MaxWeight = VirtReg.weight();
-
// Check if any registers in RC are below CostPerUseLimit.
const TargetRegisterClass *RC = MRI->getRegClass(VirtReg.reg());
uint8_t MinCost = RegClassInfo.getMinCost(RC);
if (MinCost >= CostPerUseLimit) {
LLVM_DEBUG(dbgs() << TRI->getRegClassName(RC) << " minimum cost = "
<< MinCost << ", no cheaper registers to be found.\n");
- return 0;
+ return None;
}
// It is normal for register classes to have a long tail of registers with
@@ -1101,24 +738,50 @@ MCRegister DefaultEvictionAdvisor::tryFindEvictionCandidate(
<< " regs.\n");
}
}
+ return OrderLimit;
+}
+
+bool RegAllocEvictionAdvisor::canAllocatePhysReg(unsigned CostPerUseLimit,
+ MCRegister PhysReg) const {
+ if (RegCosts[PhysReg] >= CostPerUseLimit)
+ return false;
+ // The first use of a callee-saved register in a function has cost 1.
+ // Don't start using a CSR when the CostPerUseLimit is low.
+ if (CostPerUseLimit == 1 && isUnusedCalleeSavedReg(PhysReg)) {
+ LLVM_DEBUG(
+ dbgs() << printReg(PhysReg, TRI) << " would clobber CSR "
+ << printReg(RegClassInfo.getLastCalleeSavedAlias(PhysReg), TRI)
+ << '\n');
+ return false;
+ }
+ return true;
+}
+
+MCRegister DefaultEvictionAdvisor::tryFindEvictionCandidate(
+ LiveInterval &VirtReg, const AllocationOrder &Order,
+ uint8_t CostPerUseLimit, const SmallVirtRegSet &FixedRegisters) const {
+ // Keep track of the cheapest interference seen so far.
+ EvictionCost BestCost;
+ BestCost.setMax();
+ MCRegister BestPhys;
+ auto MaybeOrderLimit = getOrderLimit(VirtReg, Order, CostPerUseLimit);
+ if (!MaybeOrderLimit)
+ return MCRegister::NoRegister;
+ unsigned OrderLimit = *MaybeOrderLimit;
+
+ // When we are just looking for a reduced cost per use, don't break any
+ // hints, and only evict smaller spill weights.
+ if (CostPerUseLimit < uint8_t(~0u)) {
+ BestCost.BrokenHints = 0;
+ BestCost.MaxWeight = VirtReg.weight();
+ }
for (auto I = Order.begin(), E = Order.getOrderLimitEnd(OrderLimit); I != E;
++I) {
MCRegister PhysReg = *I;
assert(PhysReg);
- if (RegCosts[PhysReg] >= CostPerUseLimit)
- continue;
- // The first use of a callee-saved register in a function has cost 1.
- // Don't start using a CSR when the CostPerUseLimit is low.
- if (CostPerUseLimit == 1 && isUnusedCalleeSavedReg(PhysReg)) {
- LLVM_DEBUG(
- dbgs() << printReg(PhysReg, TRI) << " would clobber CSR "
- << printReg(RegClassInfo.getLastCalleeSavedAlias(PhysReg), TRI)
- << '\n');
- continue;
- }
-
- if (!canEvictInterferenceBasedOnCost(VirtReg, PhysReg, false, BestCost,
+ if (!canAllocatePhysReg(CostPerUseLimit, PhysReg) ||
+ !canEvictInterferenceBasedOnCost(VirtReg, PhysReg, false, BestCost,
FixedRegisters))
continue;
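
The hunk above splits the old monolithic candidate scan into two reusable helpers: getOrderLimit() decides how much of the allocation order is worth scanning, and canAllocatePhysReg() does the cheap per-register filtering before the expensive eviction-cost check. Below is a minimal standalone sketch of that filter-then-score structure; the Candidate struct and pickEvictionCandidate() are hypothetical stand-ins, not the LLVM types.

#include <cstdint>
#include <limits>
#include <optional>
#include <vector>

// Hypothetical stand-in for an AllocationOrder entry.
struct Candidate {
  unsigned Reg;      // models a physical register
  uint8_t Cost;      // models RegCosts[PhysReg]
  float EvictWeight; // models the per-register eviction cost
};

// Cheap filter, analogous to canAllocatePhysReg(): reject registers whose
// base cost already exceeds the limit before doing any interference work.
static bool canAllocate(const Candidate &C, uint8_t CostPerUseLimit) {
  return C.Cost < CostPerUseLimit;
}

// Scan the order and keep the cheapest candidate that passes the filter,
// analogous to the loop in tryFindEvictionCandidate().
static std::optional<unsigned>
pickEvictionCandidate(const std::vector<Candidate> &Order,
                      uint8_t CostPerUseLimit) {
  float BestWeight = std::numeric_limits<float>::max();
  std::optional<unsigned> Best;
  for (const Candidate &C : Order) {
    if (!canAllocate(C, CostPerUseLimit))
      continue;
    if (C.EvictWeight < BestWeight) {
      BestWeight = C.EvictWeight;
      Best = C.Reg;
    }
  }
  return Best;
}

int main() {
  std::vector<Candidate> Order = {{1, 2, 5.0f}, {2, 1, 3.0f}, {3, 9, 1.0f}};
  // Register 3 is filtered out by its cost; register 2 wins on weight.
  return pickEvictionCandidate(Order, 8).value_or(0) == 2 ? 0 : 1;
}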
@@ -3269,8 +2932,8 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI, *VRAI));
ExtraInfo.emplace();
- EvictAdvisor = getAnalysis<RegAllocEvictionAdvisorAnalysis>().getAdvisor(
- *MF, Matrix, LIS, VRM, RegClassInfo, &*ExtraInfo);
+ EvictAdvisor =
+ getAnalysis<RegAllocEvictionAdvisorAnalysis>().getAdvisor(*MF, *this);
IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI);
GlobalCand.resize(32); // This will grow as needed.
SetOfBrokenHints.clear();
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h
new file mode 100644
index 000000000000..e9a5fe635f26
--- /dev/null
+++ b/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -0,0 +1,507 @@
+//==- RegAllocGreedy.h ------- greedy register allocator ----------*-C++-*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file defines the RAGreedy function pass for register allocation in
+// optimized builds.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_REGALLOCGREEDY_H_
+#define LLVM_CODEGEN_REGALLOCGREEDY_H_
+
+#include "AllocationOrder.h"
+#include "InterferenceCache.h"
+#include "LiveDebugVariables.h"
+#include "RegAllocBase.h"
+#include "RegAllocEvictionAdvisor.h"
+#include "SpillPlacement.h"
+#include "SplitKit.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/Spiller.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <queue>
+#include <tuple>
+#include <utility>
+
+namespace llvm {
+class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass,
+ public RegAllocBase,
+ private LiveRangeEdit::Delegate {
+ // Interface to eviction advisers
+public:
+ /// Track allocation stage and eviction loop prevention during allocation.
+ class ExtraRegInfo final {
+ // RegInfo - Keep additional information about each live range.
+ struct RegInfo {
+ LiveRangeStage Stage = RS_New;
+
+ // Cascade - Eviction loop prevention. See
+ // canEvictInterferenceBasedOnCost().
+ unsigned Cascade = 0;
+
+ RegInfo() = default;
+ };
+
+ IndexedMap<RegInfo, VirtReg2IndexFunctor> Info;
+ unsigned NextCascade = 1;
+
+ public:
+ ExtraRegInfo() = default;
+ ExtraRegInfo(const ExtraRegInfo &) = delete;
+
+ LiveRangeStage getStage(Register Reg) const { return Info[Reg].Stage; }
+
+ LiveRangeStage getStage(const LiveInterval &VirtReg) const {
+ return getStage(VirtReg.reg());
+ }
+
+ void setStage(Register Reg, LiveRangeStage Stage) {
+ Info.grow(Reg.id());
+ Info[Reg].Stage = Stage;
+ }
+
+ void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) {
+ setStage(VirtReg.reg(), Stage);
+ }
+
+ /// Return the current stage of the register, if present, otherwise
+ /// initialize it and return that.
+ LiveRangeStage getOrInitStage(Register Reg) {
+ Info.grow(Reg.id());
+ return getStage(Reg);
+ }
+
+ unsigned getCascade(Register Reg) const { return Info[Reg].Cascade; }
+
+ void setCascade(Register Reg, unsigned Cascade) {
+ Info.grow(Reg.id());
+ Info[Reg].Cascade = Cascade;
+ }
+
+ unsigned getOrAssignNewCascade(Register Reg) {
+ unsigned Cascade = getCascade(Reg);
+ if (!Cascade) {
+ Cascade = NextCascade++;
+ setCascade(Reg, Cascade);
+ }
+ return Cascade;
+ }
+
+ unsigned getCascadeOrCurrentNext(Register Reg) const {
+ unsigned Cascade = getCascade(Reg);
+ if (!Cascade)
+ Cascade = NextCascade;
+ return Cascade;
+ }
+
+ template <typename Iterator>
+ void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) {
+ for (; Begin != End; ++Begin) {
+ Register Reg = *Begin;
+ Info.grow(Reg.id());
+ if (Info[Reg].Stage == RS_New)
+ Info[Reg].Stage = NewStage;
+ }
+ }
+ void LRE_DidCloneVirtReg(Register New, Register Old);
+ };
+
+ LiveRegMatrix *getInterferenceMatrix() const { return Matrix; }
+ LiveIntervals *getLiveIntervals() const { return LIS; }
+ VirtRegMap *getVirtRegMap() const { return VRM; }
+ const RegisterClassInfo &getRegClassInfo() const { return RegClassInfo; }
+ const ExtraRegInfo &getExtraInfo() const { return *ExtraInfo; }
+ size_t getQueueSize() const { return Queue.size(); }
+ // end (interface to eviction advisers)
+
+private:
+ // Convenient shortcuts.
+ using PQueue = std::priority_queue<std::pair<unsigned, unsigned>>;
+ using SmallLISet = SmallPtrSet<LiveInterval *, 4>;
+
+ // context
+ MachineFunction *MF;
+
+ // Shortcuts to some useful interface.
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ RegisterClassInfo RCI;
+
+ // analyses
+ SlotIndexes *Indexes;
+ MachineBlockFrequencyInfo *MBFI;
+ MachineDominatorTree *DomTree;
+ MachineLoopInfo *Loops;
+ MachineOptimizationRemarkEmitter *ORE;
+ EdgeBundles *Bundles;
+ SpillPlacement *SpillPlacer;
+ LiveDebugVariables *DebugVars;
+ AliasAnalysis *AA;
+
+ // state
+ std::unique_ptr<Spiller> SpillerInstance;
+ PQueue Queue;
+ std::unique_ptr<VirtRegAuxInfo> VRAI;
+ Optional<ExtraRegInfo> ExtraInfo;
+ std::unique_ptr<RegAllocEvictionAdvisor> EvictAdvisor;
+
+ // Enum CutOffStage to keep track of whether the register allocation failed
+ // because of the cutoffs encountered in last chance recoloring.
+ // Note: This is used as bitmask. New value should be next power of 2.
+ enum CutOffStage {
+ // No cutoffs encountered
+ CO_None = 0,
+
+ // lcr-max-depth cutoff encountered
+ CO_Depth = 1,
+
+ // lcr-max-interf cutoff encountered
+ CO_Interf = 2
+ };
+
+ uint8_t CutOffInfo;
+
+#ifndef NDEBUG
+ static const char *const StageName[];
+#endif
+
+ /// EvictionTrack - Keeps track of past evictions in order to optimize region
+ /// split decision.
+ class EvictionTrack {
+
+ public:
+ using EvictorInfo =
+ std::pair<Register /* evictor */, MCRegister /* physreg */>;
+ using EvicteeInfo = llvm::DenseMap<Register /* evictee */, EvictorInfo>;
+
+ private:
+ /// Each Vreg that has been evicted in the last stage of selectOrSplit will
+ /// be mapped to the evictor Vreg and the PhysReg it was evicted from.
+ EvicteeInfo Evictees;
+
+ public:
+ /// Clear all eviction information.
+ void clear() { Evictees.clear(); }
+
+ /// Clear eviction information for the given evictee Vreg.
+ /// E.g. when Vreg gets a new allocation, the old eviction info is no
+ /// longer relevant.
+ /// \param Evictee The evictee Vreg for whom we want to clear collected
+ /// eviction info.
+ void clearEvicteeInfo(Register Evictee) { Evictees.erase(Evictee); }
+
+ /// Track new eviction.
+ /// The Evictor vreg has evicted the Evictee vreg from Physreg.
+ /// \param PhysReg The physical register Evictee was evicted from.
+ /// \param Evictor The evictor Vreg that evicted Evictee.
+ /// \param Evictee The evictee Vreg.
+ void addEviction(MCRegister PhysReg, Register Evictor, Register Evictee) {
+ Evictees[Evictee].first = Evictor;
+ Evictees[Evictee].second = PhysReg;
+ }
+
+ /// Return the Evictor Vreg which evicted Evictee Vreg from PhysReg.
+ /// \param Evictee The evictee vreg.
+ /// \return The Evictor vreg which evicted Evictee vreg from PhysReg. 0 if
+ /// nobody has evicted Evictee from PhysReg.
+ EvictorInfo getEvictor(Register Evictee) {
+ if (Evictees.count(Evictee)) {
+ return Evictees[Evictee];
+ }
+
+ return EvictorInfo(0, 0);
+ }
+ };
+
+ // Keeps track of past evictions in order to optimize region split decision.
+ EvictionTrack LastEvicted;
+
+ // splitting state.
+ std::unique_ptr<SplitAnalysis> SA;
+ std::unique_ptr<SplitEditor> SE;
+
+ /// Cached per-block interference maps
+ InterferenceCache IntfCache;
+
+ /// All basic blocks where the current register has uses.
+ SmallVector<SpillPlacement::BlockConstraint, 8> SplitConstraints;
+
+ /// Global live range splitting candidate info.
+ struct GlobalSplitCandidate {
+ // Register intended for assignment, or 0.
+ MCRegister PhysReg;
+
+ // SplitKit interval index for this candidate.
+ unsigned IntvIdx;
+
+ // Interference for PhysReg.
+ InterferenceCache::Cursor Intf;
+
+ // Bundles where this candidate should be live.
+ BitVector LiveBundles;
+ SmallVector<unsigned, 8> ActiveBlocks;
+
+ void reset(InterferenceCache &Cache, MCRegister Reg) {
+ PhysReg = Reg;
+ IntvIdx = 0;
+ Intf.setPhysReg(Cache, Reg);
+ LiveBundles.clear();
+ ActiveBlocks.clear();
+ }
+
+ // Set B[I] = C for every live bundle where B[I] was NoCand.
+ unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) {
+ unsigned Count = 0;
+ for (unsigned I : LiveBundles.set_bits())
+ if (B[I] == NoCand) {
+ B[I] = C;
+ Count++;
+ }
+ return Count;
+ }
+ };
+
+ /// Candidate info for each PhysReg in AllocationOrder.
+ /// This vector never shrinks, but grows to the size of the largest register
+ /// class.
+ SmallVector<GlobalSplitCandidate, 32> GlobalCand;
+
+ enum : unsigned { NoCand = ~0u };
+
+ /// Candidate map. Each edge bundle is assigned to a GlobalCand entry, or to
+ /// NoCand which indicates the stack interval.
+ SmallVector<unsigned, 32> BundleCand;
+
+ /// Callee-save register cost, calculated once per machine function.
+ BlockFrequency CSRCost;
+
+ /// Whether to consider the cost of local intervals created by a split
+ /// candidate when choosing the best split candidate.
+ bool EnableAdvancedRASplitCost;
+
+ /// Set of broken hints that may be reconciled later because of eviction.
+ SmallSetVector<LiveInterval *, 8> SetOfBrokenHints;
+
+ /// The register cost values. This list will be recreated for each machine
+ /// function.
+ ArrayRef<uint8_t> RegCosts;
+
+public:
+ RAGreedy(const RegClassFilterFunc F = allocateAllRegClasses);
+
+ /// Return the pass name.
+ StringRef getPassName() const override { return "Greedy Register Allocator"; }
+
+ /// RAGreedy analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void releaseMemory() override;
+ Spiller &spiller() override { return *SpillerInstance; }
+ void enqueueImpl(LiveInterval *LI) override;
+ LiveInterval *dequeue() override;
+ MCRegister selectOrSplit(LiveInterval &,
+ SmallVectorImpl<Register> &) override;
+ void aboutToRemoveInterval(LiveInterval &) override;
+
+ /// Perform register allocation.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
+
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::IsSSA);
+ }
+
+ static char ID;
+
+private:
+ MCRegister selectOrSplitImpl(LiveInterval &, SmallVectorImpl<Register> &,
+ SmallVirtRegSet &, unsigned = 0);
+
+ bool LRE_CanEraseVirtReg(Register) override;
+ void LRE_WillShrinkVirtReg(Register) override;
+ void LRE_DidCloneVirtReg(Register, Register) override;
+ void enqueue(PQueue &CurQueue, LiveInterval *LI);
+ LiveInterval *dequeue(PQueue &CurQueue);
+
+ BlockFrequency calcSpillCost();
+ bool addSplitConstraints(InterferenceCache::Cursor, BlockFrequency &);
+ bool addThroughConstraints(InterferenceCache::Cursor, ArrayRef<unsigned>);
+ bool growRegion(GlobalSplitCandidate &Cand);
+ bool splitCanCauseEvictionChain(Register Evictee, GlobalSplitCandidate &Cand,
+ unsigned BBNumber,
+ const AllocationOrder &Order);
+ bool splitCanCauseLocalSpill(unsigned VirtRegToSplit,
+ GlobalSplitCandidate &Cand, unsigned BBNumber,
+ const AllocationOrder &Order);
+ BlockFrequency calcGlobalSplitCost(GlobalSplitCandidate &,
+ const AllocationOrder &Order,
+ bool *CanCauseEvictionChain);
+ bool calcCompactRegion(GlobalSplitCandidate &);
+ void splitAroundRegion(LiveRangeEdit &, ArrayRef<unsigned>);
+ void calcGapWeights(MCRegister, SmallVectorImpl<float> &);
+ bool canEvictInterferenceInRange(const LiveInterval &VirtReg,
+ MCRegister PhysReg, SlotIndex Start,
+ SlotIndex End, EvictionCost &MaxCost) const;
+ MCRegister getCheapestEvicteeWeight(const AllocationOrder &Order,
+ const LiveInterval &VirtReg,
+ SlotIndex Start, SlotIndex End,
+ float *BestEvictWeight) const;
+ void evictInterference(LiveInterval &, MCRegister,
+ SmallVectorImpl<Register> &);
+ bool mayRecolorAllInterferences(MCRegister PhysReg, LiveInterval &VirtReg,
+ SmallLISet &RecoloringCandidates,
+ const SmallVirtRegSet &FixedRegisters);
+
+ MCRegister tryAssign(LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &, const SmallVirtRegSet &);
+ MCRegister tryEvict(LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &, uint8_t,
+ const SmallVirtRegSet &);
+ MCRegister tryRegionSplit(LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &);
+ /// Calculate cost of region splitting.
+ unsigned calculateRegionSplitCost(LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ BlockFrequency &BestCost,
+ unsigned &NumCands, bool IgnoreCSR,
+ bool *CanCauseEvictionChain = nullptr);
+ /// Perform region splitting.
+ unsigned doRegionSplit(LiveInterval &VirtReg, unsigned BestCand,
+ bool HasCompact, SmallVectorImpl<Register> &NewVRegs);
+ /// Check other options before using a callee-saved register for the first
+ /// time.
+ MCRegister tryAssignCSRFirstTime(LiveInterval &VirtReg,
+ AllocationOrder &Order, MCRegister PhysReg,
+ uint8_t &CostPerUseLimit,
+ SmallVectorImpl<Register> &NewVRegs);
+ void initializeCSRCost();
+ unsigned tryBlockSplit(LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &);
+ unsigned tryInstructionSplit(LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &);
+ unsigned tryLocalSplit(LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &);
+ unsigned trySplit(LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &, const SmallVirtRegSet &);
+ unsigned tryLastChanceRecoloring(LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<Register> &,
+ SmallVirtRegSet &, unsigned);
+ bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<Register> &,
+ SmallVirtRegSet &, unsigned);
+ void tryHintRecoloring(LiveInterval &);
+ void tryHintsRecoloring();
+
+ /// Model the information carried by one end of a copy.
+ struct HintInfo {
+ /// The frequency of the copy.
+ BlockFrequency Freq;
+ /// The virtual register or physical register.
+ Register Reg;
+ /// Its currently assigned register.
+ /// In case of a physical register Reg == PhysReg.
+ MCRegister PhysReg;
+
+ HintInfo(BlockFrequency Freq, Register Reg, MCRegister PhysReg)
+ : Freq(Freq), Reg(Reg), PhysReg(PhysReg) {}
+ };
+ using HintsInfo = SmallVector<HintInfo, 4>;
+
+ BlockFrequency getBrokenHintFreq(const HintsInfo &, MCRegister);
+ void collectHintInfo(Register, HintsInfo &);
+
+ /// Greedy RA statistics to report via remarks.
+ struct RAGreedyStats {
+ unsigned Reloads = 0;
+ unsigned FoldedReloads = 0;
+ unsigned ZeroCostFoldedReloads = 0;
+ unsigned Spills = 0;
+ unsigned FoldedSpills = 0;
+ unsigned Copies = 0;
+ float ReloadsCost = 0.0f;
+ float FoldedReloadsCost = 0.0f;
+ float SpillsCost = 0.0f;
+ float FoldedSpillsCost = 0.0f;
+ float CopiesCost = 0.0f;
+
+ bool isEmpty() {
+ return !(Reloads || FoldedReloads || Spills || FoldedSpills ||
+ ZeroCostFoldedReloads || Copies);
+ }
+
+ void add(RAGreedyStats other) {
+ Reloads += other.Reloads;
+ FoldedReloads += other.FoldedReloads;
+ ZeroCostFoldedReloads += other.ZeroCostFoldedReloads;
+ Spills += other.Spills;
+ FoldedSpills += other.FoldedSpills;
+ Copies += other.Copies;
+ ReloadsCost += other.ReloadsCost;
+ FoldedReloadsCost += other.FoldedReloadsCost;
+ SpillsCost += other.SpillsCost;
+ FoldedSpillsCost += other.FoldedSpillsCost;
+ CopiesCost += other.CopiesCost;
+ }
+
+ void report(MachineOptimizationRemarkMissed &R);
+ };
+
+ /// Compute statistics for a basic block.
+ RAGreedyStats computeStats(MachineBasicBlock &MBB);
+
+ /// Compute and report statistics through a remark.
+ RAGreedyStats reportStats(MachineLoop *L);
+
+ /// Report the statistics for each loop.
+ void reportStats();
+};
+} // namespace llvm
+#endif // #ifndef LLVM_CODEGEN_REGALLOCGREEDY_H_
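
For readers skimming the new header, the cascade counter in ExtraRegInfo is the piece that prevents eviction loops: a register only receives a cascade number when it first evicts something, and later evictions are only permitted against strictly older (smaller) cascades. A simplified standalone model of just that numbering scheme follows; the CascadeTracker class is hypothetical and uses a plain std::unordered_map instead of IndexedMap.

#include <cassert>
#include <unordered_map>

class CascadeTracker {
  std::unordered_map<unsigned, unsigned> Cascade; // 0 means "none assigned yet"
  unsigned NextCascade = 1;

public:
  // Analogous to getOrAssignNewCascade(): assign on first eviction only.
  unsigned getOrAssign(unsigned Reg) {
    unsigned &C = Cascade[Reg];
    if (!C)
      C = NextCascade++;
    return C;
  }

  // Analogous to getCascadeOrCurrentNext(): unassigned registers behave as if
  // they carried the cascade number that would be handed out next.
  unsigned getOrCurrentNext(unsigned Reg) const {
    auto It = Cascade.find(Reg);
    return (It != Cascade.end() && It->second) ? It->second : NextCascade;
  }
};

int main() {
  CascadeTracker T;
  assert(T.getOrAssign(10) == 1); // first evictor gets cascade 1
  assert(T.getOrAssign(11) == 2); // next evictor gets cascade 2
  assert(T.getOrAssign(10) == 1); // stable on repeat queries
  assert(T.getOrCurrentNext(12) == 3);
  return 0;
}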
diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp
index c0a07ec4c91d..424ad7419165 100644
--- a/llvm/lib/CodeGen/RegisterScavenging.cpp
+++ b/llvm/lib/CodeGen/RegisterScavenging.cpp
@@ -533,6 +533,22 @@ Register RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
Candidates.reset(*AI);
}
+ // If we have already scavenged some registers, remove them from the
+ // candidates. If we end up recursively calling eliminateFrameIndex, we don't
+ // want to be clobbering previously scavenged registers or their associated
+ // stack slots.
+ for (ScavengedInfo &SI : Scavenged) {
+ if (SI.Reg) {
+ if (isRegUsed(SI.Reg)) {
+ LLVM_DEBUG(
+ dbgs() << "Removing " << printReg(SI.Reg, TRI) <<
+ " from scavenging candidates since it was already scavenged\n");
+ for (MCRegAliasIterator AI(SI.Reg, TRI, true); AI.isValid(); ++AI)
+ Candidates.reset(*AI);
+ }
+ }
+ }
+
// Try to find a register that's unused if there is one, as then we won't
// have to spill.
BitVector Available = getRegsAvailable(RC);
@@ -553,6 +569,12 @@ Register RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
if (!AllowSpill)
return 0;
+#ifndef NDEBUG
+ for (ScavengedInfo &SI : Scavenged) {
+ assert(SI.Reg != SReg && "scavenged a previously scavenged register");
+ }
+#endif
+
ScavengedInfo &Scavenged = spill(SReg, *RC, SPAdj, I, UseMI);
Scavenged.Restore = &*std::prev(UseMI);
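
The first RegisterScavenging hunk is essentially a mask subtraction: every register already holding a scavenged value (and its aliases) is cleared from the candidate set before a new scavenge is attempted, so a recursive eliminateFrameIndex cannot clobber it. A tiny standalone sketch of that idea, with a plain std::bitset standing in for the candidate BitVector and made-up register ids:

#include <bitset>
#include <cstdio>
#include <vector>

int main() {
  constexpr unsigned NumRegs = 8;
  std::bitset<NumRegs> Candidates("11111111");      // all registers available
  std::vector<unsigned> AlreadyScavenged = {2, 5};  // hypothetical register ids

  // Mirror of Candidates.reset(*AI) in the patch: drop every register that is
  // still holding a previously scavenged value.
  for (unsigned Reg : AlreadyScavenged)
    Candidates.reset(Reg);

  std::printf("remaining candidates: %s\n", Candidates.to_string().c_str());
  return 0;
}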
diff --git a/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index 6e05de888cc0..a61a2b2728fa 100644
--- a/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -30,8 +30,7 @@ using namespace llvm;
ScoreboardHazardRecognizer::ScoreboardHazardRecognizer(
const InstrItineraryData *II, const ScheduleDAG *SchedDAG,
const char *ParentDebugType)
- : ScheduleHazardRecognizer(), DebugType(ParentDebugType), ItinData(II),
- DAG(SchedDAG) {
+ : DebugType(ParentDebugType), ItinData(II), DAG(SchedDAG) {
(void)DebugType;
// Determine the maximum depth of any itinerary. This determines the depth of
// the scoreboard. We always make the scoreboard at least 1 cycle deep to
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 067ad819e0d2..932f263d2558 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -593,7 +593,7 @@ namespace {
SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
SDValue MatchLoadCombine(SDNode *N);
SDValue mergeTruncStores(StoreSDNode *N);
- SDValue ReduceLoadWidth(SDNode *N);
+ SDValue reduceLoadWidth(SDNode *N);
SDValue ReduceLoadOpStoreWidth(SDNode *N);
SDValue splitMergedValStore(StoreSDNode *ST);
SDValue TransformFPLoadStorePair(SDNode *N);
@@ -1070,7 +1070,7 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
return DAG.getNode(Opc, DL, VT, N00, OpNode);
return SDValue();
}
- if (N0.hasOneUse()) {
+ if (TLI.isReassocProfitable(DAG, N0, N1)) {
// Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
// iff (op x, c1) has one use
if (SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1))
@@ -3058,9 +3058,8 @@ static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
//
// Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with
// a single path for carry/borrow out propagation:
-static SDValue combineCarryDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
- const TargetLowering &TLI, SDValue Carry0,
- SDValue Carry1, SDNode *N) {
+static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI,
+ SDValue Carry0, SDValue Carry1, SDNode *N) {
if (Carry0.getResNo() != 1 || Carry1.getResNo() != 1)
return SDValue();
unsigned Opcode = Carry0.getOpcode();
@@ -3908,7 +3907,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
// Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
// use.
{
- SDValue Sh(nullptr, 0), Y(nullptr, 0);
+ SDValue Sh, Y;
// Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
if (N0.getOpcode() == ISD::SHL &&
@@ -4471,15 +4470,15 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
return FoldedVOp;
// fold (mulhs x, 0) -> 0
- // do not return N0/N1, because undef node may exist.
- if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) ||
- ISD::isConstantSplatVectorAllZeros(N1.getNode()))
+ // do not return N1, because undef node may exist.
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
return DAG.getConstant(0, DL, VT);
}
// fold (mulhs x, 0) -> 0
if (isNullConstant(N1))
return N1;
+
// fold (mulhs x, 1) -> (sra x, size(x)-1)
if (isOneConstant(N1))
return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
@@ -4531,18 +4530,19 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
return FoldedVOp;
// fold (mulhu x, 0) -> 0
- // do not return N0/N1, because undef node may exist.
- if (ISD::isConstantSplatVectorAllZeros(N0.getNode()) ||
- ISD::isConstantSplatVectorAllZeros(N1.getNode()))
+ // do not return N1, because undef node may exist.
+ if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
return DAG.getConstant(0, DL, VT);
}
// fold (mulhu x, 0) -> 0
if (isNullConstant(N1))
return N1;
+
// fold (mulhu x, 1) -> 0
if (isOneConstant(N1))
return DAG.getConstant(0, DL, N0.getValueType());
+
// fold (mulhu x, undef) -> 0
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, DL, VT);
@@ -4892,6 +4892,42 @@ static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2,
: DAG.getSExtOrTrunc(Sat, DL, N2->getValueType(0));
}
+static SDValue PerformUMinFpToSatCombine(SDValue N0, SDValue N1, SDValue N2,
+ SDValue N3, ISD::CondCode CC,
+ SelectionDAG &DAG) {
+ // We are looking for UMIN(FPTOUI(X), (2^n)-1), which may have come via a
+ // select/vselect/select_cc. The two operands pairs for the select (N2/N3) may
+ // be truncated versions of the the setcc (N0/N1).
+ if ((N0 != N2 &&
+ (N2.getOpcode() != ISD::TRUNCATE || N0 != N2.getOperand(0))) ||
+ N0.getOpcode() != ISD::FP_TO_UINT || CC != ISD::SETULT)
+ return SDValue();
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ ConstantSDNode *N3C = isConstOrConstSplat(N3);
+ if (!N1C || !N3C)
+ return SDValue();
+ const APInt &C1 = N1C->getAPIntValue();
+ const APInt &C3 = N3C->getAPIntValue();
+ if (!(C1 + 1).isPowerOf2() || C1.getBitWidth() < C3.getBitWidth() ||
+ C1 != C3.zextOrSelf(C1.getBitWidth()))
+ return SDValue();
+
+ unsigned BW = (C1 + 1).exactLogBase2();
+ EVT FPVT = N0.getOperand(0).getValueType();
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), BW);
+ if (FPVT.isVector())
+ NewVT = EVT::getVectorVT(*DAG.getContext(), NewVT,
+ FPVT.getVectorElementCount());
+ if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat(ISD::FP_TO_UINT_SAT,
+ FPVT, NewVT))
+ return SDValue();
+
+ SDValue Sat =
+ DAG.getNode(ISD::FP_TO_UINT_SAT, SDLoc(N0), NewVT, N0.getOperand(0),
+ DAG.getValueType(NewVT.getScalarType()));
+ return DAG.getZExtOrTrunc(Sat, SDLoc(N0), N3.getValueType());
+}
+
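
The new PerformUMinFpToSatCombine recognizes that clamping an unsigned fp-to-int result with UMIN against (2^n)-1 and then truncating is exactly an n-bit saturating conversion. Here is a standalone C++ model of the equivalence for n = 8; it only holds for inputs where the wide fp-to-uint is itself well defined (non-negative, in range), which matches the poison semantics the combine relies on.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Models (i8 (trunc (umin (fp_to_uint i32 x), 255))).
static uint8_t viaUMin(double X) {
  uint32_t Wide = static_cast<uint32_t>(X);                    // fp_to_uint i32
  return static_cast<uint8_t>(std::min<uint32_t>(Wide, 255u)); // umin + trunc
}

// Models (i8 (fp_to_uint_sat x)): clamp to [0, 255] before converting.
static uint8_t viaSat(double X) {
  if (X <= 0.0)
    return 0;
  if (X >= 255.0)
    return 255;
  return static_cast<uint8_t>(X);
}

int main() {
  for (double X : {0.0, 1.5, 200.0, 254.9, 255.0, 4000.0})
    std::printf("%.1f -> umin:%u sat:%u\n", X, (unsigned)viaUMin(X),
                (unsigned)viaSat(X));
  return 0;
}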
SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -4934,6 +4970,9 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
if (SDValue S = PerformMinMaxFpToSatCombine(
N0, N1, N0, N1, Opcode == ISD::SMIN ? ISD::SETLT : ISD::SETGT, DAG))
return S;
+ if (Opcode == ISD::UMIN)
+ if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N0, N1, ISD::SETULT, DAG))
+ return S;
// Simplify the operands using demanded-bits information.
if (SimplifyDemandedBits(SDValue(N, 0)))
@@ -5491,6 +5530,8 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
// Some constants may need fixing up later if they are too large.
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
+ if (Mask->getValueType(0) != C->getValueType(0))
+ return false;
if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) &&
(Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
NodesWithConsts.insert(N);
@@ -5524,9 +5565,9 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
case ISD::AssertZext: {
unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
- EVT VT = Op.getOpcode() == ISD::AssertZext ?
- cast<VTSDNode>(Op.getOperand(1))->getVT() :
- Op.getOperand(0).getValueType();
+ EVT VT = Op.getOpcode() == ISD::AssertZext
+ ? cast<VTSDNode>(Op.getOperand(1))->getVT()
+ : Op.getOperand(0).getValueType();
// We can accept extending nodes if the mask is wider or an equal
// width to the original type.
@@ -5534,6 +5575,15 @@ bool DAGCombiner::SearchForAndLoads(SDNode *N,
continue;
break;
}
+ case ISD::ANY_EXTEND: {
+ unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
+ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+ EVT VT = Op.getOperand(0).getValueType();
+ if (ExtVT.bitsGE(VT))
+ break;
+ // Fallthrough to searching for nodes from the operands of the extend.
+ LLVM_FALLTHROUGH;
+ }
case ISD::OR:
case ISD::XOR:
case ISD::AND:
@@ -5593,12 +5643,14 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
// masking.
if (FixupNode) {
LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
- SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
- FixupNode->getValueType(0),
- SDValue(FixupNode, 0), MaskOp);
+ SDValue MaskOpT = DAG.getZExtOrTrunc(MaskOp, SDLoc(FixupNode),
+ FixupNode->getValueType(0));
+ SDValue And =
+ DAG.getNode(ISD::AND, SDLoc(FixupNode), FixupNode->getValueType(0),
+ SDValue(FixupNode, 0), MaskOpT);
DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
if (And.getOpcode() == ISD ::AND)
- DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp);
+ DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOpT);
}
// Narrow any constants that need it.
@@ -5607,10 +5659,12 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
SDValue Op1 = LogicN->getOperand(1);
if (isa<ConstantSDNode>(Op0))
- std::swap(Op0, Op1);
+ std::swap(Op0, Op1);
- SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
- Op1, MaskOp);
+ SDValue MaskOpT =
+ DAG.getZExtOrTrunc(MaskOp, SDLoc(Op1), Op1.getValueType());
+ SDValue And =
+ DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(), Op1, MaskOpT);
DAG.UpdateNodeOperands(LogicN, Op0, And);
}
@@ -5618,13 +5672,15 @@ bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
// Create narrow loads.
for (auto *Load : Loads) {
LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
+ SDValue MaskOpT =
+ DAG.getZExtOrTrunc(MaskOp, SDLoc(Load), Load->getValueType(0));
SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
- SDValue(Load, 0), MaskOp);
+ SDValue(Load, 0), MaskOpT);
DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
if (And.getOpcode() == ISD ::AND)
And = SDValue(
- DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0);
- SDValue NewLoad = ReduceLoadWidth(And.getNode());
+ DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOpT), 0);
+ SDValue NewLoad = reduceLoadWidth(And.getNode());
assert(NewLoad &&
"Shouldn't be masking the load if it can't be narrowed");
CombineTo(Load, NewLoad, NewLoad.getValue(1));
@@ -5799,18 +5855,12 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
return FoldedVOp;
// fold (and x, 0) -> 0, vector edition
- if (ISD::isConstantSplatVectorAllZeros(N0.getNode()))
- // do not return N0, because undef node may exist in N0
- return DAG.getConstant(APInt::getZero(N0.getScalarValueSizeInBits()),
- SDLoc(N), N0.getValueType());
if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
// do not return N1, because undef node may exist in N1
return DAG.getConstant(APInt::getZero(N1.getScalarValueSizeInBits()),
SDLoc(N), N1.getValueType());
// fold (and x, -1) -> x, vector edition
- if (ISD::isConstantSplatVectorAllOnes(N0.getNode()))
- return N1;
if (ISD::isConstantSplatVectorAllOnes(N1.getNode()))
return N0;
@@ -5862,7 +5912,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (SDValue Shuffle = XformToShuffleWithZero(N))
return Shuffle;
- if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
+ if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
return Combined;
// fold (and (or x, C), D) -> D if (C & D) == D
@@ -6024,7 +6074,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD ||
(N0.getOpcode() == ISD::ANY_EXTEND &&
N0.getOperand(0).getOpcode() == ISD::LOAD))) {
- if (SDValue Res = ReduceLoadWidth(N)) {
+ if (SDValue Res = reduceLoadWidth(N)) {
LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND
? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0);
AddToWorklist(N);
@@ -6659,7 +6709,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
if (SDValue Combined = visitORLike(N0, N1, N))
return Combined;
- if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
+ if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
return Combined;
// Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
@@ -8156,7 +8206,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
- if (SDValue Combined = combineCarryDiamond(*this, DAG, TLI, N0, N1, N))
+ if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
return Combined;
return SDValue();
@@ -8948,6 +8998,10 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
return MULH;
+ // Attempt to convert a sra of a load into a narrower sign-extending load.
+ if (SDValue NarrowLoad = reduceLoadWidth(N))
+ return NarrowLoad;
+
return SDValue();
}
@@ -9140,7 +9194,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
return NewSRL;
// Attempt to convert a srl of a load into a narrower zero-extending load.
- if (SDValue NarrowLoad = ReduceLoadWidth(N))
+ if (SDValue NarrowLoad = reduceLoadWidth(N))
return NarrowLoad;
// Here is a common situation. We want to optimize:
@@ -9358,6 +9412,17 @@ SDValue DAGCombiner::visitBSWAP(SDNode *N) {
// fold (bswap (bswap x)) -> x
if (N0.getOpcode() == ISD::BSWAP)
return N0->getOperand(0);
+
+ // Canonicalize bswap(bitreverse(x)) -> bitreverse(bswap(x)). If bitreverse
+ // isn't supported, it will be expanded to bswap followed by a manual reversal
+ // of bits in each byte. By placing bswaps before bitreverse, we can remove
+ // the two bswaps if the bitreverse gets expanded.
+ if (N0.getOpcode() == ISD::BITREVERSE && N0.hasOneUse()) {
+ SDLoc DL(N);
+ SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, N0.getOperand(0));
+ return DAG.getNode(ISD::BITREVERSE, DL, VT, BSwap);
+ }
+
return SDValue();
}
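
The canonicalization added to visitBSWAP relies on bswap and bitreverse commuting: a full bit reversal is a byte swap composed with a per-byte bit reversal, and those two components commute. A small self-contained check of that identity on 32-bit values, using hand-rolled helpers rather than compiler builtins:

#include <cassert>
#include <cstdint>

static uint32_t bswap32(uint32_t V) {
  return (V >> 24) | ((V >> 8) & 0x0000ff00u) | ((V << 8) & 0x00ff0000u) |
         (V << 24);
}

static uint32_t bitreverse32(uint32_t V) {
  uint32_t R = 0;
  for (int I = 0; I < 32; ++I) {
    R = (R << 1) | (V & 1u);
    V >>= 1;
  }
  return R;
}

int main() {
  for (uint32_t V : {0x12345678u, 0xdeadbeefu, 0x00000001u, 0xffffffffu})
    assert(bswap32(bitreverse32(V)) == bitreverse32(bswap32(V)));
  return 0;
}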
@@ -10288,6 +10353,8 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
if (SDValue S = PerformMinMaxFpToSatCombine(LHS, RHS, N1, N2, CC, DAG))
return S;
+ if (SDValue S = PerformUMinFpToSatCombine(LHS, RHS, N1, N2, CC, DAG))
+ return S;
// If this select has a condition (setcc) with narrower operands than the
// select, try to widen the compare to match the select width.
@@ -11357,7 +11424,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
if (N0.getOpcode() == ISD::TRUNCATE) {
// fold (sext (truncate (load x))) -> (sext (smaller load x))
// fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
- if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
+ if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) {
SDNode *oye = N0.getOperand(0).getNode();
if (NarrowLoad.getNode() != N0.getNode()) {
CombineTo(N0.getNode(), NarrowLoad);
@@ -11621,7 +11688,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
if (N0.getOpcode() == ISD::TRUNCATE) {
// fold (zext (truncate (load x))) -> (zext (smaller load x))
// fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
- if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
+ if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) {
SDNode *oye = N0.getOperand(0).getNode();
if (NarrowLoad.getNode() != N0.getNode()) {
CombineTo(N0.getNode(), NarrowLoad);
@@ -11864,7 +11931,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
// fold (aext (truncate (load x))) -> (aext (smaller load x))
// fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
if (N0.getOpcode() == ISD::TRUNCATE) {
- if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
+ if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) {
SDNode *oye = N0.getOperand(0).getNode();
if (NarrowLoad.getNode() != N0.getNode()) {
CombineTo(N0.getNode(), NarrowLoad);
@@ -12095,13 +12162,10 @@ SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
return SDValue();
}
-/// If the result of a wider load is shifted to right of N bits and then
-/// truncated to a narrower type and where N is a multiple of number of bits of
-/// the narrower type, transform it to a narrower load from address + N / num of
-/// bits of new type. Also narrow the load if the result is masked with an AND
-/// to effectively produce a smaller type. If the result is to be extended, also
-/// fold the extension to form a extending load.
-SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
+/// If the result of a load is shifted/masked/truncated to an effectively
+/// narrower type, try to transform the load to a narrower type and/or
+/// use an extending load.
+SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
unsigned Opc = N->getOpcode();
ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
@@ -12113,32 +12177,48 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
if (VT.isVector())
return SDValue();
+ // The ShAmt variable is used to indicate that we've consumed a right
+ // shift. I.e. we want to narrow the width of the load by not loading the
+ // ShAmt least significant bits.
unsigned ShAmt = 0;
+ // A special case is when the least significant bits from the load are masked
+ // away, but using an AND rather than a right shift. HasShiftedOffset is used
+ // to indicate that the narrowed load should be left-shifted ShAmt bits to get
+ // the result.
bool HasShiftedOffset = false;
// Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
// extended to VT.
if (Opc == ISD::SIGN_EXTEND_INREG) {
ExtType = ISD::SEXTLOAD;
ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
- } else if (Opc == ISD::SRL) {
- // Another special-case: SRL is basically zero-extending a narrower value,
- // or it maybe shifting a higher subword, half or byte into the lowest
- // bits.
- ExtType = ISD::ZEXTLOAD;
- N0 = SDValue(N, 0);
+ } else if (Opc == ISD::SRL || Opc == ISD::SRA) {
+ // Another special-case: SRL/SRA is basically zero/sign-extending a narrower
+ // value, or it may be shifting a higher subword, half or byte into the
+ // lowest bits.
- auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0));
- auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
- if (!N01 || !LN0)
+ // Only handle shift with constant shift amount, and the shiftee must be a
+ // load.
+ auto *LN = dyn_cast<LoadSDNode>(N0);
+ auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!N1C || !LN)
+ return SDValue();
+ // If the shift amount is larger than the memory type then we're not
+ // accessing any of the loaded bytes.
+ ShAmt = N1C->getZExtValue();
+ uint64_t MemoryWidth = LN->getMemoryVT().getScalarSizeInBits();
+ if (MemoryWidth <= ShAmt)
+ return SDValue();
+ // Attempt to fold away the SRL by using ZEXTLOAD and SRA by using SEXTLOAD.
+ ExtType = Opc == ISD::SRL ? ISD::ZEXTLOAD : ISD::SEXTLOAD;
+ ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
+ // If original load is a SEXTLOAD then we can't simply replace it by a
+ // ZEXTLOAD (we could potentially replace it by a more narrow SEXTLOAD
+ // followed by a ZEXT, but that is not handled at the moment). Similarly if
+ // the original load is a ZEXTLOAD and we want to use a SEXTLOAD.
+ if ((LN->getExtensionType() == ISD::SEXTLOAD ||
+ LN->getExtensionType() == ISD::ZEXTLOAD) &&
+ LN->getExtensionType() != ExtType)
return SDValue();
-
- uint64_t ShiftAmt = N01->getZExtValue();
- uint64_t MemoryWidth = LN0->getMemoryVT().getScalarSizeInBits();
- if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt)
- ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt);
- else
- ExtVT = EVT::getIntegerVT(*DAG.getContext(),
- VT.getScalarSizeInBits() - ShiftAmt);
} else if (Opc == ISD::AND) {
// An AND with a constant mask is the same as a truncate + zero-extend.
auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
@@ -12161,55 +12241,80 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
}
- if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
- SDValue SRL = N0;
- if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) {
- ShAmt = ConstShift->getZExtValue();
- unsigned EVTBits = ExtVT.getScalarSizeInBits();
- // Is the shift amount a multiple of size of VT?
- if ((ShAmt & (EVTBits-1)) == 0) {
- N0 = N0.getOperand(0);
- // Is the load width a multiple of size of VT?
- if ((N0.getScalarValueSizeInBits() & (EVTBits - 1)) != 0)
- return SDValue();
- }
+ // In case Opc==SRL we've already prepared ExtVT/ExtType/ShAmt based on doing
+ // a right shift. Here we redo some of those checks, to possibly adjust the
+ // ExtVT even further based on "a masking AND". We could also end up here for
+ // other reasons (e.g. based on Opc==TRUNCATE) and that is why some checks
+ // need to be done here as well.
+ if (Opc == ISD::SRL || N0.getOpcode() == ISD::SRL) {
+ SDValue SRL = Opc == ISD::SRL ? SDValue(N, 0) : N0;
+ // Bail out when the SRL has more than one use. This is done for historical
+ // (undocumented) reasons. Maybe the intent was to guard the AND-masking
+ // check below? And maybe it could be non-profitable to do the transform in
+ // case the SRL has multiple uses and we get here with Opc!=ISD::SRL?
+ // FIXME: Can't we just skip this check for the Opc==ISD::SRL case?
+ if (!SRL.hasOneUse())
+ return SDValue();
+
+ // Only handle shift with constant shift amount, and the shiftee must be a
+ // load.
+ auto *LN = dyn_cast<LoadSDNode>(SRL.getOperand(0));
+ auto *SRL1C = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
+ if (!SRL1C || !LN)
+ return SDValue();
- // At this point, we must have a load or else we can't do the transform.
- auto *LN0 = dyn_cast<LoadSDNode>(N0);
- if (!LN0) return SDValue();
+ // If the shift amount is larger than the input type then we're not
+ // accessing any of the loaded bytes. If the load was a zextload/extload
+ // then the result of the shift+trunc is zero/undef (handled elsewhere).
+ ShAmt = SRL1C->getZExtValue();
+ uint64_t MemoryWidth = LN->getMemoryVT().getSizeInBits();
+ if (ShAmt >= MemoryWidth)
+ return SDValue();
- // Because a SRL must be assumed to *need* to zero-extend the high bits
- // (as opposed to anyext the high bits), we can't combine the zextload
- // lowering of SRL and an sextload.
- if (LN0->getExtensionType() == ISD::SEXTLOAD)
- return SDValue();
+ // Because a SRL must be assumed to *need* to zero-extend the high bits
+ // (as opposed to anyext the high bits), we can't combine the zextload
+ // lowering of SRL and an sextload.
+ if (LN->getExtensionType() == ISD::SEXTLOAD)
+ return SDValue();
- // If the shift amount is larger than the input type then we're not
- // accessing any of the loaded bytes. If the load was a zextload/extload
- // then the result of the shift+trunc is zero/undef (handled elsewhere).
- if (ShAmt >= LN0->getMemoryVT().getSizeInBits())
+ // Avoid reading outside the memory accessed by the original load (could
+ // happen if we only adjust the load base pointer by ShAmt). Instead we
+ // try to narrow the load even further. The typical scenario here is:
+ // (i64 (truncate (i96 (srl (load x), 64)))) ->
+ // (i64 (truncate (i96 (zextload (load i32 + offset) from i32))))
+ if (ExtVT.getScalarSizeInBits() > MemoryWidth - ShAmt) {
+ // Don't replace sextload by zextload.
+ if (ExtType == ISD::SEXTLOAD)
return SDValue();
-
- // If the SRL is only used by a masking AND, we may be able to adjust
- // the ExtVT to make the AND redundant.
- SDNode *Mask = *(SRL->use_begin());
- if (Mask->getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(Mask->getOperand(1))) {
- const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);
- if (ShiftMask.isMask()) {
- EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
- ShiftMask.countTrailingOnes());
- // If the mask is smaller, recompute the type.
- if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
- TLI.isLoadExtLegal(ExtType, N0.getValueType(), MaskedVT))
- ExtVT = MaskedVT;
- }
+ // Narrow the load.
+ ExtType = ISD::ZEXTLOAD;
+ ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
+ }
+
+ // If the SRL is only used by a masking AND, we may be able to adjust
+ // the ExtVT to make the AND redundant.
+ SDNode *Mask = *(SRL->use_begin());
+ if (SRL.hasOneUse() && Mask->getOpcode() == ISD::AND &&
+ isa<ConstantSDNode>(Mask->getOperand(1))) {
+ const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);
+ if (ShiftMask.isMask()) {
+ EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
+ ShiftMask.countTrailingOnes());
+ // If the mask is smaller, recompute the type.
+ if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
+ TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT))
+ ExtVT = MaskedVT;
}
}
+
+ N0 = SRL.getOperand(0);
}
- // If the load is shifted left (and the result isn't shifted back right),
- // we can fold the truncate through the shift.
+ // If the load is shifted left (and the result isn't shifted back right), we
+ // can fold a truncate through the shift. The typical scenario is that N
+ // points at a TRUNCATE here so the attempted fold is:
+ // (truncate (shl (load x), c)) -> (shl (narrow load x), c)
+ // ShLeftAmt will indicate how much a narrowed load should be shifted left.
unsigned ShLeftAmt = 0;
if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
@@ -12237,12 +12342,12 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
return LVTStoreBits - EVTStoreBits - ShAmt;
};
- // For big endian targets, we need to adjust the offset to the pointer to
- // load the correct bytes.
- if (DAG.getDataLayout().isBigEndian())
- ShAmt = AdjustBigEndianShift(ShAmt);
+ // We need to adjust the pointer to the load by ShAmt bits in order to load
+ // the correct bytes.
+ unsigned PtrAdjustmentInBits =
+ DAG.getDataLayout().isBigEndian() ? AdjustBigEndianShift(ShAmt) : ShAmt;
- uint64_t PtrOff = ShAmt / 8;
+ uint64_t PtrOff = PtrAdjustmentInBits / 8;
Align NewAlign = commonAlignment(LN0->getAlign(), PtrOff);
SDLoc DL(LN0);
// The original load itself didn't wrap, so an offset within it doesn't.
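
The pointer adjustment computed here (PtrOff = ShAmt / 8, with the big-endian variant going through AdjustBigEndianShift) is the usual "load the upper part directly" trick. Below is a standalone little-endian illustration of the scenario mentioned in the comments, (i32 (truncate (srl (i64 load p), 32))) becoming a 32-bit load at p + 4; the byte values are made up.

#include <cstdint>
#include <cstring>

int main() {
  // Eight bytes of memory backing a hypothetical i64 load.
  unsigned char Mem[8] = {0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88};

  uint64_t Wide;
  std::memcpy(&Wide, Mem, 8);                            // wide i64 load
  uint32_t ViaShift = static_cast<uint32_t>(Wide >> 32); // srl 32 + truncate

  uint32_t Narrow;
  std::memcpy(&Narrow, Mem + 32 / 8, 4);                 // narrowed load at p + ShAmt/8

  // On a little-endian host the two agree; big-endian needs the adjusted
  // offset produced by AdjustBigEndianShift in the patch.
  return ViaShift == Narrow ? 0 : 1;
}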
@@ -12285,11 +12390,6 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
}
if (HasShiftedOffset) {
- // Recalculate the shift amount after it has been altered to calculate
- // the offset.
- if (DAG.getDataLayout().isBigEndian())
- ShAmt = AdjustBigEndianShift(ShAmt);
-
// We're using a shifted mask, so the load now has an offset. This means
// that data has been loaded into the lower bytes than it would have been
// before, so we need to shl the loaded data into the correct position in the
@@ -12320,7 +12420,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);
// If the input is already sign extended, just drop the extension.
- if (ExtVTBits >= DAG.ComputeMinSignedBits(N0))
+ if (ExtVTBits >= DAG.ComputeMaxSignificantBits(N0))
return N0;
// fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
@@ -12336,7 +12436,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
SDValue N00 = N0.getOperand(0);
unsigned N00Bits = N00.getScalarValueSizeInBits();
- if ((N00Bits <= ExtVTBits || DAG.ComputeMinSignedBits(N00) <= ExtVTBits) &&
+ if ((N00Bits <= ExtVTBits ||
+ DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits) &&
(!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
}
@@ -12355,7 +12456,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
APInt DemandedSrcElts = APInt::getLowBitsSet(SrcElts, DstElts);
if ((N00Bits == ExtVTBits ||
(!IsZext && (N00Bits < ExtVTBits ||
- DAG.ComputeMinSignedBits(N00) <= ExtVTBits))) &&
+ DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits))) &&
(!LegalOperations ||
TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)))
return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, N00);
@@ -12381,7 +12482,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
// fold (sext_in_reg (load x)) -> (smaller sextload x)
// fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
- if (SDValue NarrowLoad = ReduceLoadWidth(N))
+ if (SDValue NarrowLoad = reduceLoadWidth(N))
return NarrowLoad;
// fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
@@ -12668,7 +12769,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// fold (truncate (load x)) -> (smaller load x)
// fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
- if (SDValue Reduced = ReduceLoadWidth(N))
+ if (SDValue Reduced = reduceLoadWidth(N))
return Reduced;
// Handle the case where the load remains an extending load even
@@ -17491,6 +17592,10 @@ void DAGCombiner::getStoreMergeCandidates(
for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
TryToAddCandidate(I2);
}
+ // Check stores that depend on the root (e.g. Store 3 in the chart above).
+ if (I.getOperandNo() == 0 && isa<StoreSDNode>(*I)) {
+ TryToAddCandidate(I);
+ }
}
} else {
for (auto I = RootNode->use_begin(), E = RootNode->use_end();
@@ -18351,6 +18456,15 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
Value.getValueType().isInteger() &&
(!isa<ConstantSDNode>(Value) ||
!cast<ConstantSDNode>(Value)->isOpaque())) {
+    // Convert a truncating store of an extension into a standard store.
+ if ((Value.getOpcode() == ISD::ZERO_EXTEND ||
+ Value.getOpcode() == ISD::SIGN_EXTEND ||
+ Value.getOpcode() == ISD::ANY_EXTEND) &&
+ Value.getOperand(0).getValueType() == ST->getMemoryVT() &&
+ TLI.isOperationLegalOrCustom(ISD::STORE, ST->getMemoryVT()))
+ return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+ ST->getMemOperand());
+
APInt TruncDemandedBits =
APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
ST->getMemoryVT().getScalarSizeInBits());
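
A quick standalone check of the bit-level identity behind the new fold above (plain C++ stand-in for the DAG nodes; the fold itself additionally requires the legality check shown): truncating a zero-, sign-, or any-extended value back to its original width reproduces the original bits, so the extension can be dropped and a plain store used.

#include <cassert>
#include <cstdint>

int main() {
  for (int V = -128; V < 128; ++V) {
    int8_t X = static_cast<int8_t>(V);
    int32_t SExt = X;                         // sign_extend i8 -> i32
    uint32_t ZExt = static_cast<uint8_t>(X);  // zero_extend i8 -> i32
    // A truncating i8 store of either extension writes exactly X's bits.
    assert(static_cast<int8_t>(SExt) == X);
    assert(static_cast<uint8_t>(ZExt) == static_cast<uint8_t>(X));
  }
}
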
@@ -23299,6 +23413,8 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
if (SDValue S = PerformMinMaxFpToSatCombine(N0, N1, N2, N3, CC, DAG))
return S;
+ if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N2, N3, CC, DAG))
+ return S;
return SDValue();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 4d1449bc2751..bfde35935c7b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1775,12 +1775,13 @@ bool FastISel::selectOperator(const User *I, unsigned Opcode) {
return false;
case Instruction::Call:
- // On AIX, call lowering uses the DAG-ISEL path currently so that the
+ // On AIX, normal call lowering uses the DAG-ISEL path currently so that the
// callee of the direct function call instruction will be mapped to the
// symbol for the function's entry point, which is distinct from the
// function descriptor symbol. The latter is the symbol whose XCOFF symbol
// name is the C-linkage name of the source level function.
- if (TM.getTargetTriple().isOSAIX())
+ // But fast isel still has the ability to do selection for intrinsics.
+ if (TM.getTargetTriple().isOSAIX() && !isa<IntrinsicInst>(I))
return false;
return selectCall(I);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 5dfb65ef131a..54481b94fdd8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3593,9 +3593,16 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
if (Legalized) {
// If we expanded the SETCC by swapping LHS and RHS, or by inverting the
// condition code, create a new SETCC node.
- if (Tmp3.getNode())
- Tmp1 = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
- Tmp1, Tmp2, Tmp3, Node->getFlags());
+ if (Tmp3.getNode()) {
+ if (IsStrict) {
+ Tmp1 = DAG.getNode(Node->getOpcode(), dl, Node->getVTList(),
+ {Chain, Tmp1, Tmp2, Tmp3}, Node->getFlags());
+ Chain = Tmp1.getValue(1);
+ } else {
+ Tmp1 = DAG.getNode(Node->getOpcode(), dl, Node->getValueType(0), Tmp1,
+ Tmp2, Tmp3, Node->getFlags());
+ }
+ }
// If we expanded the SETCC by inverting the condition code, then wrap
// the existing SETCC in a NOT to restore the intended condition.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 27f9cede1922..6bf38d7296a8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1193,7 +1193,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
llvm_unreachable("Do not know how to expand the result of this operator!");
case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
- case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
+ case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break;
case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
case ISD::MERGE_VALUES: ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 518e525e13d0..8c7b90b6cd33 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -75,30 +75,28 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
break;
case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N));
break;
- case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break;
- case ISD::VSELECT: Res = PromoteIntRes_VSELECT(N); break;
+ case ISD::SELECT:
+ case ISD::VSELECT:
+ case ISD::VP_SELECT:
+ Res = PromoteIntRes_Select(N);
+ break;
case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break;
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS:
case ISD::SETCC: Res = PromoteIntRes_SETCC(N); break;
case ISD::SMIN:
- case ISD::SMAX:
- Res = PromoteIntRes_SExtIntBinOp(N, /*IsVP*/ false);
- break;
+ case ISD::SMAX: Res = PromoteIntRes_SExtIntBinOp(N); break;
case ISD::UMIN:
case ISD::UMAX: Res = PromoteIntRes_UMINUMAX(N); break;
case ISD::SHL:
- Res = PromoteIntRes_SHL(N, /*IsVP*/ false);
- break;
+ case ISD::VP_SHL: Res = PromoteIntRes_SHL(N); break;
case ISD::SIGN_EXTEND_INREG:
Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break;
case ISD::SRA:
- Res = PromoteIntRes_SRA(N, /*IsVP*/ false);
- break;
+ case ISD::VP_ASHR: Res = PromoteIntRes_SRA(N); break;
case ISD::SRL:
- Res = PromoteIntRes_SRL(N, /*IsVP*/ false);
- break;
+ case ISD::VP_LSHR: Res = PromoteIntRes_SRL(N); break;
case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break;
case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break;
case ISD::VAARG: Res = PromoteIntRes_VAARG(N); break;
@@ -154,18 +152,22 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
- Res = PromoteIntRes_SimpleIntBinOp(N, /*IsVP*/ false);
- break;
+ case ISD::VP_AND:
+ case ISD::VP_OR:
+ case ISD::VP_XOR:
+ case ISD::VP_ADD:
+ case ISD::VP_SUB:
+ case ISD::VP_MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break;
case ISD::SDIV:
case ISD::SREM:
- Res = PromoteIntRes_SExtIntBinOp(N, /*IsVP*/ false);
- break;
+ case ISD::VP_SDIV:
+ case ISD::VP_SREM: Res = PromoteIntRes_SExtIntBinOp(N); break;
case ISD::UDIV:
case ISD::UREM:
- Res = PromoteIntRes_ZExtIntBinOp(N, /*IsVP*/ false);
- break;
+ case ISD::VP_UDIV:
+ case ISD::VP_UREM: Res = PromoteIntRes_ZExtIntBinOp(N); break;
case ISD::SADDO:
case ISD::SSUBO: Res = PromoteIntRes_SADDSUBO(N, ResNo); break;
@@ -260,32 +262,6 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::FSHR:
Res = PromoteIntRes_FunnelShift(N);
break;
-
- case ISD::VP_AND:
- case ISD::VP_OR:
- case ISD::VP_XOR:
- case ISD::VP_ADD:
- case ISD::VP_SUB:
- case ISD::VP_MUL:
- Res = PromoteIntRes_SimpleIntBinOp(N, /*IsVP*/ true);
- break;
- case ISD::VP_SDIV:
- case ISD::VP_SREM:
- Res = PromoteIntRes_SExtIntBinOp(N, /*IsVP*/ true);
- break;
- case ISD::VP_UDIV:
- case ISD::VP_UREM:
- Res = PromoteIntRes_ZExtIntBinOp(N, /*IsVP*/ true);
- break;
- case ISD::VP_SHL:
- Res = PromoteIntRes_SHL(N, /*IsVP*/ true);
- break;
- case ISD::VP_ASHR:
- Res = PromoteIntRes_SRA(N, /*IsVP*/ true);
- break;
- case ISD::VP_LSHR:
- Res = PromoteIntRes_SRL(N, /*IsVP*/ true);
- break;
}
// If the result is null then the sub-method took care of registering it.
@@ -1127,20 +1103,18 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
return Res;
}
-SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) {
- SDValue LHS = GetPromotedInteger(N->getOperand(1));
- SDValue RHS = GetPromotedInteger(N->getOperand(2));
- return DAG.getSelect(SDLoc(N),
- LHS.getValueType(), N->getOperand(0), LHS, RHS);
-}
-
-SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) {
+SDValue DAGTypeLegalizer::PromoteIntRes_Select(SDNode *N) {
SDValue Mask = N->getOperand(0);
SDValue LHS = GetPromotedInteger(N->getOperand(1));
SDValue RHS = GetPromotedInteger(N->getOperand(2));
- return DAG.getNode(ISD::VSELECT, SDLoc(N),
- LHS.getValueType(), Mask, LHS, RHS);
+
+ unsigned Opcode = N->getOpcode();
+ return Opcode == ISD::VP_SELECT
+ ? DAG.getNode(Opcode, SDLoc(N), LHS.getValueType(), Mask, LHS, RHS,
+ N->getOperand(3))
+ : DAG.getNode(Opcode, SDLoc(N), LHS.getValueType(), Mask, LHS,
+ RHS);
}
SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) {
@@ -1193,12 +1167,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
return DAG.getSExtOrTrunc(SetCC, dl, NVT);
}
-SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N, bool IsVP) {
+SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) {
SDValue LHS = GetPromotedInteger(N->getOperand(0));
SDValue RHS = N->getOperand(1);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
- if (!IsVP)
+ if (N->getOpcode() != ISD::VP_SHL)
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
N->getOperand(2), N->getOperand(3));
@@ -1210,34 +1184,40 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) {
Op.getValueType(), Op, N->getOperand(1));
}
-SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N, bool IsVP) {
+SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) {
// The input may have strange things in the top bits of the registers, but
// these operations don't care. They may have weird bits going out, but
// that too is okay if they are integer operations.
SDValue LHS = GetPromotedInteger(N->getOperand(0));
SDValue RHS = GetPromotedInteger(N->getOperand(1));
- if (!IsVP)
+ if (N->getNumOperands() == 2)
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
N->getOperand(2), N->getOperand(3));
}
-SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N, bool IsVP) {
+SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) {
// Sign extend the input.
SDValue LHS = SExtPromotedInteger(N->getOperand(0));
SDValue RHS = SExtPromotedInteger(N->getOperand(1));
- if (!IsVP)
+ if (N->getNumOperands() == 2)
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
N->getOperand(2), N->getOperand(3));
}
-SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N, bool IsVP) {
+SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) {
// Zero extend the input.
SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
- if (!IsVP)
+ if (N->getNumOperands() == 2)
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
+ assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
N->getOperand(2), N->getOperand(3));
}
@@ -1251,25 +1231,25 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UMINUMAX(SDNode *N) {
LHS.getValueType(), LHS, RHS);
}
-SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N, bool IsVP) {
+SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
// The input value must be properly sign extended.
SDValue LHS = SExtPromotedInteger(N->getOperand(0));
SDValue RHS = N->getOperand(1);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
- if (!IsVP)
+ if (N->getOpcode() != ISD::VP_ASHR)
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
N->getOperand(2), N->getOperand(3));
}
-SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N, bool IsVP) {
+SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
// The input value must be properly zero extended.
SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
SDValue RHS = N->getOperand(1);
if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
RHS = ZExtPromotedInteger(RHS);
- if (!IsVP)
+ if (N->getOpcode() != ISD::VP_LSHR)
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS);
return DAG.getNode(N->getOpcode(), SDLoc(N), LHS.getValueType(), LHS, RHS,
N->getOperand(2), N->getOperand(3));
@@ -1653,7 +1633,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::UDIVFIX:
case ISD::UDIVFIXSAT: Res = PromoteIntOp_FIX(N); break;
- case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break;
+ case ISD::FPOWI:
+ case ISD::STRICT_FPOWI: Res = PromoteIntOp_FPOWI(N); break;
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
@@ -1703,50 +1684,64 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
/// PromoteSetCCOperands - Promote the operands of a comparison. This code is
/// shared among BR_CC, SELECT_CC, and SETCC handlers.
-void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS,
+void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &LHS, SDValue &RHS,
ISD::CondCode CCCode) {
// We have to insert explicit sign or zero extends. Note that we could
// insert sign extends for ALL conditions. For those operations where either
- // zero or sign extension would be valid, use SExtOrZExtPromotedInteger
- // which will choose the cheapest for the target.
- switch (CCCode) {
- default: llvm_unreachable("Unknown integer comparison!");
- case ISD::SETEQ:
- case ISD::SETNE: {
- SDValue OpL = GetPromotedInteger(NewLHS);
- SDValue OpR = GetPromotedInteger(NewRHS);
-
- // We would prefer to promote the comparison operand with sign extension.
- // If the width of OpL/OpR excluding the duplicated sign bits is no greater
- // than the width of NewLHS/NewRH, we can avoid inserting real truncate
- // instruction, which is redundant eventually.
- unsigned OpLEffectiveBits = DAG.ComputeMinSignedBits(OpL);
- unsigned OpREffectiveBits = DAG.ComputeMinSignedBits(OpR);
- if (OpLEffectiveBits <= NewLHS.getScalarValueSizeInBits() &&
- OpREffectiveBits <= NewRHS.getScalarValueSizeInBits()) {
- NewLHS = OpL;
- NewRHS = OpR;
- } else {
- NewLHS = SExtOrZExtPromotedInteger(NewLHS);
- NewRHS = SExtOrZExtPromotedInteger(NewRHS);
+ // zero or sign extension would be valid, we ask the target which extension
+ // it would prefer.
+
+ // Signed comparisons always require sign extension.
+ if (ISD::isSignedIntSetCC(CCCode)) {
+ LHS = SExtPromotedInteger(LHS);
+ RHS = SExtPromotedInteger(RHS);
+ return;
+ }
+
+ assert((ISD::isUnsignedIntSetCC(CCCode) || ISD::isIntEqualitySetCC(CCCode)) &&
+ "Unknown integer comparison!");
+
+ SDValue OpL = GetPromotedInteger(LHS);
+ SDValue OpR = GetPromotedInteger(RHS);
+
+ if (TLI.isSExtCheaperThanZExt(LHS.getValueType(), OpL.getValueType())) {
+ // The target would prefer to promote the comparison operand with sign
+ // extension. Honor that unless the promoted values are already zero
+ // extended.
+ unsigned OpLEffectiveBits =
+ DAG.computeKnownBits(OpL).countMaxActiveBits();
+ unsigned OpREffectiveBits =
+ DAG.computeKnownBits(OpR).countMaxActiveBits();
+ if (OpLEffectiveBits <= LHS.getScalarValueSizeInBits() &&
+ OpREffectiveBits <= RHS.getScalarValueSizeInBits()) {
+ LHS = OpL;
+ RHS = OpR;
+ return;
}
- break;
+
+    // The promoted values aren't zero extended, so use a sext_inreg.
+ LHS = SExtPromotedInteger(LHS);
+ RHS = SExtPromotedInteger(RHS);
+ return;
}
- case ISD::SETUGE:
- case ISD::SETUGT:
- case ISD::SETULE:
- case ISD::SETULT:
- NewLHS = SExtOrZExtPromotedInteger(NewLHS);
- NewRHS = SExtOrZExtPromotedInteger(NewRHS);
- break;
- case ISD::SETGE:
- case ISD::SETGT:
- case ISD::SETLT:
- case ISD::SETLE:
- NewLHS = SExtPromotedInteger(NewLHS);
- NewRHS = SExtPromotedInteger(NewRHS);
- break;
+
+ // Prefer to promote the comparison operand with zero extension.
+
+ // If the width of OpL/OpR excluding the duplicated sign bits is no greater
+  // than the width of LHS/RHS, we can avoid inserting a zext_inreg operation
+ // that we might not be able to remove.
+ unsigned OpLEffectiveBits = DAG.ComputeMaxSignificantBits(OpL);
+ unsigned OpREffectiveBits = DAG.ComputeMaxSignificantBits(OpR);
+ if (OpLEffectiveBits <= LHS.getScalarValueSizeInBits() &&
+ OpREffectiveBits <= RHS.getScalarValueSizeInBits()) {
+ LHS = OpL;
+ RHS = OpR;
+ return;
}
+
+ // Otherwise, use zext_inreg.
+ LHS = ZExtPromotedInteger(LHS);
+ RHS = ZExtPromotedInteger(RHS);
}
SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) {
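
A small standalone example of why the promoted operands can be reused directly in the branches above (illustrative values only, plain C++): when both promoted registers have no bits set above the original width, an unsigned or equality compare on the wide registers agrees with the compare on the original narrow values, so no zext_inreg/sext_inreg is needed.

#include <cassert>
#include <cstdint>

int main() {
  // i8 operands that were promoted into i16 registers with clean upper bits
  // (countMaxActiveBits() <= 8 in the terms used above).
  uint16_t PromotedL = 0x00FE, PromotedR = 0x0017;
  bool Wide = PromotedL < PromotedR;               // setult on the promoted values
  bool Narrow = static_cast<uint8_t>(PromotedL) <  // setult on the original i8 values
                static_cast<uint8_t>(PromotedR);
  assert(Wide == Narrow);
}
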
@@ -2099,8 +2094,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_PREFETCH(SDNode *N, unsigned OpNo) {
}
SDValue DAGTypeLegalizer::PromoteIntOp_FPOWI(SDNode *N) {
- // FIXME: Support for promotion of STRICT_FPOWI is not implemented yet.
- assert(N->getOpcode() == ISD::FPOWI && "No STRICT_FPOWI support here yet.");
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
// The integer operand is the last operand in FPOWI (so the result and
// floating point operand is already type legalized).
@@ -2118,17 +2113,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_FPOWI(SDNode *N) {
DAG.getContext()->emitError("Don't know how to promote fpowi to fpow");
return DAG.getUNDEF(N->getValueType(0));
}
+ unsigned OpOffset = IsStrict ? 1 : 0;
// The exponent should fit in a sizeof(int) type for the libcall to be valid.
assert(DAG.getLibInfo().getIntSize() ==
- N->getOperand(1).getValueType().getSizeInBits() &&
+ N->getOperand(1 + OpOffset).getValueType().getSizeInBits() &&
"POWI exponent should match with sizeof(int) when doing the libcall.");
TargetLowering::MakeLibCallOptions CallOptions;
CallOptions.setSExt(true);
- SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
- std::pair<SDValue, SDValue> Tmp =
- TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops,
- CallOptions, SDLoc(N), SDValue());
+ SDValue Ops[2] = {N->getOperand(0 + OpOffset), N->getOperand(1 + OpOffset)};
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(
+ DAG, LC, N->getValueType(0), Ops, CallOptions, SDLoc(N), Chain);
ReplaceValueWith(SDValue(N, 0), Tmp.first);
+ if (IsStrict)
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
return SDValue();
}
@@ -2255,7 +2252,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ARITH_FENCE: SplitRes_ARITH_FENCE(N, Lo, Hi); break;
case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
- case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
+ case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break;
case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
case ISD::FREEZE: SplitRes_FREEZE(N, Lo, Hi); break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index da282ecad282..4d8daa82d8c0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -334,18 +334,17 @@ private:
SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N);
SDValue PromoteIntRes_Overflow(SDNode *N);
SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
- SDValue PromoteIntRes_SELECT(SDNode *N);
- SDValue PromoteIntRes_VSELECT(SDNode *N);
+ SDValue PromoteIntRes_Select(SDNode *N);
SDValue PromoteIntRes_SELECT_CC(SDNode *N);
SDValue PromoteIntRes_SETCC(SDNode *N);
- SDValue PromoteIntRes_SHL(SDNode *N, bool IsVP);
- SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N, bool IsVP);
- SDValue PromoteIntRes_ZExtIntBinOp(SDNode *N, bool IsVP);
- SDValue PromoteIntRes_SExtIntBinOp(SDNode *N, bool IsVP);
+ SDValue PromoteIntRes_SHL(SDNode *N);
+ SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N);
+ SDValue PromoteIntRes_ZExtIntBinOp(SDNode *N);
+ SDValue PromoteIntRes_SExtIntBinOp(SDNode *N);
SDValue PromoteIntRes_UMINUMAX(SDNode *N);
SDValue PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N);
- SDValue PromoteIntRes_SRA(SDNode *N, bool IsVP);
- SDValue PromoteIntRes_SRL(SDNode *N, bool IsVP);
+ SDValue PromoteIntRes_SRA(SDNode *N);
+ SDValue PromoteIntRes_SRL(SDNode *N);
SDValue PromoteIntRes_TRUNCATE(SDNode *N);
SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo);
SDValue PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo);
@@ -819,6 +818,12 @@ private:
void GetSplitVector(SDValue Op, SDValue &Lo, SDValue &Hi);
void SetSplitVector(SDValue Op, SDValue Lo, SDValue Hi);
+ /// Split mask operator of a VP intrinsic.
+ std::pair<SDValue, SDValue> SplitMask(SDValue Mask);
+
+ /// Split mask operator of a VP intrinsic in a given location.
+ std::pair<SDValue, SDValue> SplitMask(SDValue Mask, const SDLoc &DL);
+
// Helper function for incrementing the pointer when splitting
// memory operations
void IncrementPointer(MemSDNode *N, EVT MemVT, MachinePointerInfo &MPI,
@@ -826,7 +831,7 @@ private:
// Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>.
void SplitVectorResult(SDNode *N, unsigned ResNo);
- void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi, bool IsVP);
+ void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -847,8 +852,10 @@ private:
void SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo, SDValue &Hi);
void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi);
- void SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_Gather(MemSDNode *VPGT, SDValue &Lo, SDValue &Hi,
+ bool SplitSETCC = false);
void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -864,6 +871,7 @@ private:
SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_VECREDUCE_SEQ(SDNode *N);
+ SDValue SplitVecOp_VP_REDUCE(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_UnaryOp(SDNode *N);
SDValue SplitVecOp_TruncateHelper(SDNode *N);
@@ -873,9 +881,10 @@ private:
SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
SDValue SplitVecOp_ExtVecInRegOp(SDNode *N);
SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo);
SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
- SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
- SDValue SplitVecOp_MGATHER(MaskedGatherSDNode *MGT, unsigned OpNo);
+ SDValue SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_Gather(MemSDNode *MGT, unsigned OpNo);
SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
SDValue SplitVecOp_VSETCC(SDNode *N);
SDValue SplitVecOp_FP_ROUND(SDNode *N);
@@ -900,6 +909,23 @@ private:
}
void SetWidenedVector(SDValue Op, SDValue Result);
+ /// Given a mask Mask, returns the larger vector into which Mask was widened.
+ SDValue GetWidenedMask(SDValue Mask, ElementCount EC) {
+ // For VP operations, we must also widen the mask. Note that the mask type
+    // may not actually need widening, leading it to be split along with the VP
+ // operation.
+ // FIXME: This could lead to an infinite split/widen loop. We only handle
+ // the case where the mask needs widening to an identically-sized type as
+ // the vector inputs.
+ assert(getTypeAction(Mask.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unable to widen binary VP op");
+ Mask = GetWidenedVector(Mask);
+ assert(Mask.getValueType().getVectorElementCount() == EC &&
+ "Unable to widen binary VP op");
+ return Mask;
+ }
+
// Widen Vector Result Promotion.
void WidenVectorResult(SDNode *N, unsigned ResNo);
SDValue WidenVecRes_MERGE_VALUES(SDNode* N, unsigned ResNo);
@@ -911,10 +937,12 @@ private:
SDValue WidenVecRes_INSERT_SUBVECTOR(SDNode *N);
SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
SDValue WidenVecRes_LOAD(SDNode* N);
+ SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N);
SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);
+ SDValue WidenVecRes_VP_GATHER(VPGatherSDNode* N);
SDValue WidenVecRes_ScalarOp(SDNode* N);
- SDValue WidenVecRes_SELECT(SDNode* N);
+ SDValue WidenVecRes_Select(SDNode *N);
SDValue WidenVSELECTMask(SDNode *N);
SDValue WidenVecRes_SELECT_CC(SDNode* N);
SDValue WidenVecRes_SETCC(SDNode* N);
@@ -923,7 +951,7 @@ private:
SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N);
SDValue WidenVecRes_Ternary(SDNode *N);
- SDValue WidenVecRes_Binary(SDNode *N, bool IsVP);
+ SDValue WidenVecRes_Binary(SDNode *N);
SDValue WidenVecRes_BinaryCanTrap(SDNode *N);
SDValue WidenVecRes_BinaryWithExtraScalarOp(SDNode *N);
SDValue WidenVecRes_StrictFP(SDNode *N);
@@ -945,9 +973,11 @@ private:
SDValue WidenVecOp_INSERT_SUBVECTOR(SDNode *N);
SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
SDValue WidenVecOp_STORE(SDNode* N);
+ SDValue WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo);
SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
SDValue WidenVecOp_MGATHER(SDNode* N, unsigned OpNo);
SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo);
+ SDValue WidenVecOp_VP_SCATTER(SDNode* N, unsigned OpNo);
SDValue WidenVecOp_SETCC(SDNode* N);
SDValue WidenVecOp_STRICT_FSETCC(SDNode* N);
SDValue WidenVecOp_VSELECT(SDNode *N);
@@ -957,6 +987,7 @@ private:
SDValue WidenVecOp_FCOPYSIGN(SDNode *N);
SDValue WidenVecOp_VECREDUCE(SDNode *N);
SDValue WidenVecOp_VECREDUCE_SEQ(SDNode *N);
+ SDValue WidenVecOp_VP_REDUCE(SDNode *N);
/// Helper function to generate a set of operations to perform
/// a vector operation for a wider type.
@@ -1023,7 +1054,7 @@ private:
void SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
SDValue &Lo, SDValue &Hi);
void SplitRes_ARITH_FENCE (SDNode *N, SDValue &Lo, SDValue &Hi);
- void SplitRes_SELECT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_Select (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_FREEZE (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 3d3c9a2ad837..c6885677d644 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -506,9 +506,10 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
GetSplitOp(Op, Lo, Hi);
}
-void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
+void DAGTypeLegalizer::SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue LL, LH, RL, RH, CL, CH;
SDLoc dl(N);
+ unsigned Opcode = N->getOpcode();
GetSplitOp(N->getOperand(1), LL, LH);
GetSplitOp(N->getOperand(2), RL, RH);
@@ -539,8 +540,18 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
std::tie(CL, CH) = DAG.SplitVector(Cond, dl);
}
- Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL);
- Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH);
+ if (Opcode != ISD::VP_SELECT && Opcode != ISD::VP_MERGE) {
+ Lo = DAG.getNode(Opcode, dl, LL.getValueType(), CL, LL, RL);
+ Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH);
+ return;
+ }
+
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(N->getOperand(3), N->getValueType(0), dl);
+
+ Lo = DAG.getNode(Opcode, dl, LL.getValueType(), CL, LL, RL, EVLLo);
+ Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH, EVLHi);
}
void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 1493f36fcd3e..abf6a3ac6916 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -133,6 +133,8 @@ class VectorLegalizer {
/// Implement vselect in terms of XOR, AND, OR when blend is not
/// supported by the target.
SDValue ExpandVSELECT(SDNode *Node);
+ SDValue ExpandVP_SELECT(SDNode *Node);
+ SDValue ExpandVP_MERGE(SDNode *Node);
SDValue ExpandSELECT(SDNode *Node);
std::pair<SDValue, SDValue> ExpandLoad(SDNode *N);
SDValue ExpandStore(SDNode *N);
@@ -457,6 +459,14 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
break;
}
+
+#define BEGIN_REGISTER_VP_SDNODE(VPID, LEGALPOS, ...) \
+ case ISD::VPID: { \
+ EVT LegalizeVT = LEGALPOS < 0 ? Node->getValueType(-(1 + LEGALPOS)) \
+ : Node->getOperand(LEGALPOS).getValueType(); \
+ Action = TLI.getOperationAction(Node->getOpcode(), LegalizeVT); \
+ } break;
+#include "llvm/IR/VPIntrinsics.def"
}
LLVM_DEBUG(dbgs() << "\nLegalizing vector op: "; Node->dump(&DAG));
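
For reference, a tiny standalone illustration of the LEGALPOS convention the macro above decodes (the lambda name is made up): negative values address a result number via -(1 + LEGALPOS), while non-negative values address an operand number directly.

#include <cassert>

int main() {
  auto ResultNumber = [](int LegalPos) { return -(1 + LegalPos); };
  assert(ResultNumber(-1) == 0);  // LEGALPOS == -1 -> legalize on result 0
  assert(ResultNumber(-2) == 1);  // LEGALPOS == -2 -> legalize on result 1
  // A non-negative LEGALPOS would instead pick Node->getOperand(LEGALPOS)'s type.
}
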
@@ -718,6 +728,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::VSELECT:
Results.push_back(ExpandVSELECT(Node));
return;
+ case ISD::VP_SELECT:
+ Results.push_back(ExpandVP_SELECT(Node));
+ return;
case ISD::SELECT:
Results.push_back(ExpandSELECT(Node));
return;
@@ -865,6 +878,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::UREM:
ExpandREM(Node, Results);
return;
+ case ISD::VP_MERGE:
+ Results.push_back(ExpandVP_MERGE(Node));
+ return;
}
Results.push_back(DAG.UnrollVectorOp(Node));
@@ -1195,6 +1211,79 @@ SDValue VectorLegalizer::ExpandVSELECT(SDNode *Node) {
return DAG.getNode(ISD::BITCAST, DL, Node->getValueType(0), Val);
}
+SDValue VectorLegalizer::ExpandVP_SELECT(SDNode *Node) {
+ // Implement VP_SELECT in terms of VP_XOR, VP_AND and VP_OR on platforms which
+ // do not support it natively.
+ SDLoc DL(Node);
+
+ SDValue Mask = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ SDValue Op2 = Node->getOperand(2);
+ SDValue EVL = Node->getOperand(3);
+
+ EVT VT = Mask.getValueType();
+
+ // If we can't even use the basic vector operations of
+ // VP_AND,VP_OR,VP_XOR, we will have to scalarize the op.
+ if (TLI.getOperationAction(ISD::VP_AND, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::VP_XOR, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::VP_OR, VT) == TargetLowering::Expand)
+ return DAG.UnrollVectorOp(Node);
+
+  // This operation also isn't safe when the operands aren't booleans.
+ if (Op1.getValueType().getVectorElementType() != MVT::i1)
+ return DAG.UnrollVectorOp(Node);
+
+ SDValue Ones = DAG.getAllOnesConstant(DL, VT);
+ SDValue NotMask = DAG.getNode(ISD::VP_XOR, DL, VT, Mask, Ones, Mask, EVL);
+
+ Op1 = DAG.getNode(ISD::VP_AND, DL, VT, Op1, Mask, Mask, EVL);
+ Op2 = DAG.getNode(ISD::VP_AND, DL, VT, Op2, NotMask, Mask, EVL);
+ return DAG.getNode(ISD::VP_OR, DL, VT, Op1, Op2, Mask, EVL);
+}
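
The expansion above leans on a plain boolean identity; a minimal standalone check (lanes modeled as bits of a uint8_t, with the EVL and mask-passthrough operands of the VP nodes ignored): for i1 elements, select(m, a, b) == (a & m) | (b & ~m).

#include <cassert>
#include <cstdint>

int main() {
  uint8_t Mask = 0xAC, A = 0xF0, B = 0x33;
  uint8_t Expanded = (A & Mask) | (B & static_cast<uint8_t>(~Mask));
  for (int I = 0; I < 8; ++I) {
    bool Lane = (Mask >> I) & 1 ? (A >> I) & 1 : (B >> I) & 1;  // naive per-lane select
    assert(((Expanded >> I) & 1) == Lane);
  }
}
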
+
+SDValue VectorLegalizer::ExpandVP_MERGE(SDNode *Node) {
+ // Implement VP_MERGE in terms of VSELECT. Construct a mask where vector
+ // indices less than the EVL/pivot are true. Combine that with the original
+ // mask for a full-length mask. Use a full-length VSELECT to select between
+ // the true and false values.
+ SDLoc DL(Node);
+
+ SDValue Mask = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ SDValue Op2 = Node->getOperand(2);
+ SDValue EVL = Node->getOperand(3);
+
+ EVT MaskVT = Mask.getValueType();
+ bool IsFixedLen = MaskVT.isFixedLengthVector();
+
+ EVT EVLVecVT = EVT::getVectorVT(*DAG.getContext(), EVL.getValueType(),
+ MaskVT.getVectorElementCount());
+
+ // If we can't construct the EVL mask efficiently, it's better to unroll.
+ if ((IsFixedLen &&
+ !TLI.isOperationLegalOrCustom(ISD::BUILD_VECTOR, EVLVecVT)) ||
+ (!IsFixedLen &&
+ (!TLI.isOperationLegalOrCustom(ISD::STEP_VECTOR, EVLVecVT) ||
+ !TLI.isOperationLegalOrCustom(ISD::SPLAT_VECTOR, EVLVecVT))))
+ return DAG.UnrollVectorOp(Node);
+
+ // If using a SETCC would result in a different type than the mask type,
+ // unroll.
+ if (TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ EVLVecVT) != MaskVT)
+ return DAG.UnrollVectorOp(Node);
+
+ SDValue StepVec = DAG.getStepVector(DL, EVLVecVT);
+ SDValue SplatEVL = IsFixedLen ? DAG.getSplatBuildVector(EVLVecVT, DL, EVL)
+ : DAG.getSplatVector(EVLVecVT, DL, EVL);
+ SDValue EVLMask =
+ DAG.getSetCC(DL, MaskVT, StepVec, SplatEVL, ISD::CondCode::SETULT);
+
+ SDValue FullMask = DAG.getNode(ISD::AND, DL, MaskVT, Mask, EVLMask);
+ return DAG.getSelect(DL, Node->getValueType(0), FullMask, Op1, Op2);
+}
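
A standalone sketch of the semantics the lowering above implements, with fixed 8-lane vectors modeled as arrays (illustrative values; step_vector, splat, and setcc are folded into the scalar loop): lanes below EVL take op1 where the mask is set, every other lane falls through to op2.

#include <array>
#include <cassert>

int main() {
  constexpr int N = 8;
  std::array<int, N> Op1{1, 2, 3, 4, 5, 6, 7, 8};
  std::array<int, N> Op2{10, 20, 30, 40, 50, 60, 70, 80};
  std::array<bool, N> Mask{true, false, true, true, false, true, false, true};
  unsigned EVL = 5;

  std::array<int, N> Result{};
  for (int I = 0; I < N; ++I) {
    bool EVLMask = static_cast<unsigned>(I) < EVL;  // step_vector < splat(EVL)
    bool Full = Mask[I] && EVLMask;                 // AND with the original mask
    Result[I] = Full ? Op1[I] : Op2[I];             // full-length select
  }

  std::array<int, N> Expected{1, 20, 3, 4, 50, 60, 70, 80};
  assert(Result == Expected);
}
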
+
void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
// Attempt to expand using TargetLowering.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 7ec2638b1e71..0bd44ce4c872 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -914,7 +914,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
case ISD::VSELECT:
- case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
+ case ISD::SELECT:
+ case ISD::VP_MERGE:
+ case ISD::VP_SELECT: SplitRes_Select(N, Lo, Hi); break;
case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
case ISD::BITCAST: SplitVecRes_BITCAST(N, Lo, Hi); break;
@@ -936,11 +938,15 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::LOAD:
SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
break;
+ case ISD::VP_LOAD:
+ SplitVecRes_VP_LOAD(cast<VPLoadSDNode>(N), Lo, Hi);
+ break;
case ISD::MLOAD:
SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi);
break;
case ISD::MGATHER:
- SplitVecRes_MGATHER(cast<MaskedGatherSDNode>(N), Lo, Hi);
+ case ISD::VP_GATHER:
+ SplitVecRes_Gather(cast<MemSDNode>(N), Lo, Hi, /*SplitSETCC*/ true);
break;
case ISD::SETCC:
SplitVecRes_SETCC(N, Lo, Hi);
@@ -1008,31 +1014,31 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
SplitVecRes_ExtendOp(N, Lo, Hi);
break;
- case ISD::ADD:
- case ISD::SUB:
- case ISD::MUL:
+ case ISD::ADD: case ISD::VP_ADD:
+ case ISD::SUB: case ISD::VP_SUB:
+ case ISD::MUL: case ISD::VP_MUL:
case ISD::MULHS:
case ISD::MULHU:
- case ISD::FADD:
- case ISD::FSUB:
- case ISD::FMUL:
+ case ISD::FADD: case ISD::VP_FADD:
+ case ISD::FSUB: case ISD::VP_FSUB:
+ case ISD::FMUL: case ISD::VP_FMUL:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINIMUM:
case ISD::FMAXIMUM:
- case ISD::SDIV:
- case ISD::UDIV:
- case ISD::FDIV:
+ case ISD::SDIV: case ISD::VP_SDIV:
+ case ISD::UDIV: case ISD::VP_UDIV:
+ case ISD::FDIV: case ISD::VP_FDIV:
case ISD::FPOW:
- case ISD::AND:
- case ISD::OR:
- case ISD::XOR:
- case ISD::SHL:
- case ISD::SRA:
- case ISD::SRL:
- case ISD::UREM:
- case ISD::SREM:
- case ISD::FREM:
+ case ISD::AND: case ISD::VP_AND:
+ case ISD::OR: case ISD::VP_OR:
+ case ISD::XOR: case ISD::VP_XOR:
+ case ISD::SHL: case ISD::VP_SHL:
+ case ISD::SRA: case ISD::VP_ASHR:
+ case ISD::SRL: case ISD::VP_LSHR:
+ case ISD::UREM: case ISD::VP_UREM:
+ case ISD::SREM: case ISD::VP_SREM:
+ case ISD::FREM: case ISD::VP_FREM:
case ISD::SMIN:
case ISD::SMAX:
case ISD::UMIN:
@@ -1045,7 +1051,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::USHLSAT:
case ISD::ROTL:
case ISD::ROTR:
- SplitVecRes_BinOp(N, Lo, Hi, /*IsVP*/ false);
+ SplitVecRes_BinOp(N, Lo, Hi);
break;
case ISD::FMA:
case ISD::FSHL:
@@ -1082,26 +1088,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::UDIVFIXSAT:
SplitVecRes_FIX(N, Lo, Hi);
break;
- case ISD::VP_ADD:
- case ISD::VP_AND:
- case ISD::VP_MUL:
- case ISD::VP_OR:
- case ISD::VP_SUB:
- case ISD::VP_XOR:
- case ISD::VP_SHL:
- case ISD::VP_LSHR:
- case ISD::VP_ASHR:
- case ISD::VP_SDIV:
- case ISD::VP_UDIV:
- case ISD::VP_SREM:
- case ISD::VP_UREM:
- case ISD::VP_FADD:
- case ISD::VP_FSUB:
- case ISD::VP_FMUL:
- case ISD::VP_FDIV:
- case ISD::VP_FREM:
- SplitVecRes_BinOp(N, Lo, Hi, /*IsVP*/ true);
- break;
}
// If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -1133,8 +1119,22 @@ void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT,
}
}
-void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi,
- bool IsVP) {
+std::pair<SDValue, SDValue> DAGTypeLegalizer::SplitMask(SDValue Mask) {
+ return SplitMask(Mask, SDLoc(Mask));
+}
+
+std::pair<SDValue, SDValue> DAGTypeLegalizer::SplitMask(SDValue Mask,
+ const SDLoc &DL) {
+ SDValue MaskLo, MaskHi;
+ EVT MaskVT = Mask.getValueType();
+ if (getTypeAction(MaskVT) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ return std::make_pair(MaskLo, MaskHi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue LHSLo, LHSHi;
GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
SDValue RHSLo, RHSHi;
@@ -1143,36 +1143,21 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi,
const SDNodeFlags Flags = N->getFlags();
unsigned Opcode = N->getOpcode();
- if (!IsVP) {
+ if (N->getNumOperands() == 2) {
Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags);
Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags);
return;
}
- // Split the mask.
+ assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
+
SDValue MaskLo, MaskHi;
- SDValue Mask = N->getOperand(2);
- EVT MaskVT = Mask.getValueType();
- if (getTypeAction(MaskVT) == TargetLowering::TypeSplitVector)
- GetSplitVector(Mask, MaskLo, MaskHi);
- else
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, SDLoc(Mask));
-
- // Split the vector length parameter.
- // %evl -> umin(%evl, %halfnumelts) and usubsat(%evl - %halfnumelts).
- SDValue EVL = N->getOperand(3);
- EVT VecVT = N->getValueType(0);
- EVT EVLVT = EVL.getValueType();
- assert(VecVT.getVectorElementCount().isKnownEven() &&
- "Expecting the mask to be an evenly-sized vector");
- unsigned HalfMinNumElts = VecVT.getVectorMinNumElements() / 2;
- SDValue HalfNumElts =
- VecVT.isFixedLengthVector()
- ? DAG.getConstant(HalfMinNumElts, dl, EVLVT)
- : DAG.getVScale(dl, EVLVT,
- APInt(EVLVT.getScalarSizeInBits(), HalfMinNumElts));
- SDValue EVLLo = DAG.getNode(ISD::UMIN, dl, EVLVT, EVL, HalfNumElts);
- SDValue EVLHi = DAG.getNode(ISD::USUBSAT, dl, EVLVT, EVL, HalfNumElts);
+ std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(2));
+
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(N->getOperand(3), N->getValueType(0), dl);
Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(),
{LHSLo, RHSLo, MaskLo, EVLLo}, Flags);
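
A minimal standalone sketch of what DAG.SplitEVL computes for the two halves (fixed-width case only; the vscale-based half count for scalable vectors is left out): the low half keeps at most half the lanes, the high half gets whatever remains, saturating at zero.

#include <algorithm>
#include <cassert>
#include <utility>

int main() {
  auto SplitEVL = [](unsigned EVL, unsigned HalfNumElts) {
    unsigned EVLLo = std::min(EVL, HalfNumElts);                 // umin(%evl, %halfnumelts)
    unsigned EVLHi = EVL > HalfNumElts ? EVL - HalfNumElts : 0;  // usubsat(%evl, %halfnumelts)
    return std::make_pair(EVLLo, EVLHi);
  };
  assert(SplitEVL(3, 4) == std::make_pair(3u, 0u));  // everything fits in the low half
  assert(SplitEVL(6, 4) == std::make_pair(4u, 2u));  // two lanes spill into the high half
}
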
@@ -1781,6 +1766,86 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
ReplaceValueWith(SDValue(LD, 1), Ch);
}
+void DAGTypeLegalizer::SplitVecRes_VP_LOAD(VPLoadSDNode *LD, SDValue &Lo,
+ SDValue &Hi) {
+ assert(LD->isUnindexed() && "Indexed VP load during type legalization!");
+ EVT LoVT, HiVT;
+ SDLoc dl(LD);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0));
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ SDValue Ch = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ SDValue Offset = LD->getOffset();
+ assert(Offset.isUndef() && "Unexpected indexed variable-length load offset");
+ Align Alignment = LD->getOriginalAlign();
+ SDValue Mask = LD->getMask();
+ SDValue EVL = LD->getVectorLength();
+ EVT MemoryVT = LD->getMemoryVT();
+
+ EVT LoMemVT, HiMemVT;
+ bool HiIsEmpty = false;
+ std::tie(LoMemVT, HiMemVT) =
+ DAG.GetDependentSplitDestVTs(MemoryVT, LoVT, &HiIsEmpty);
+
+ // Split Mask operand
+ SDValue MaskLo, MaskHi;
+ if (Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ }
+
+ // Split EVL operand
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) = DAG.SplitEVL(EVL, LD->getValueType(0), dl);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ LD->getPointerInfo(), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, Alignment, LD->getAAInfo(), LD->getRanges());
+
+ Lo =
+ DAG.getLoadVP(LD->getAddressingMode(), ExtType, LoVT, dl, Ch, Ptr, Offset,
+ MaskLo, EVLLo, LoMemVT, MMO, LD->isExpandingLoad());
+
+ if (HiIsEmpty) {
+ // The hi vp_load has zero storage size. We therefore simply set it to
+ // the low vp_load and rely on subsequent removal from the chain.
+ Hi = Lo;
+ } else {
+ // Generate hi vp_load.
+ Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG,
+ LD->isExpandingLoad());
+
+ MachinePointerInfo MPI;
+ if (LoMemVT.isScalableVector())
+ MPI = MachinePointerInfo(LD->getPointerInfo().getAddrSpace());
+ else
+ MPI = LD->getPointerInfo().getWithOffset(
+ LoMemVT.getStoreSize().getFixedSize());
+
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MPI, MachineMemOperand::MOLoad, MemoryLocation::UnknownSize, Alignment,
+ LD->getAAInfo(), LD->getRanges());
+
+ Hi = DAG.getLoadVP(LD->getAddressingMode(), ExtType, HiVT, dl, Ch, Ptr,
+ Offset, MaskHi, EVLHi, HiMemVT, MMO,
+ LD->isExpandingLoad());
+ }
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(LD, 1), Ch);
+}
+
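
For the high half of the VP load split above, the pointer-info offset is simply the store size of the low half's memory type; a standalone arithmetic sketch for a fixed-width case (illustrative numbers, scalable vectors excluded):

#include <cassert>

int main() {
  // Splitting a v8i32 vp.load into two v4i32 halves:
  unsigned NumElts = 8, EltBytes = 4;
  unsigned HiOffsetBytes = (NumElts / 2) * EltBytes;        // LoMemVT store size
  unsigned HiBytes = (NumElts - NumElts / 2) * EltBytes;
  assert(HiOffsetBytes == 16);                              // high half starts at base + 16
  assert(HiOffsetBytes + HiBytes == NumElts * EltBytes);    // and covers the rest of the load
}
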
void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
SDValue &Lo, SDValue &Hi) {
assert(MLD->isUnindexed() && "Indexed masked load during type legalization!");
@@ -1865,61 +1930,85 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
}
-void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
- SDValue &Lo, SDValue &Hi) {
+void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo,
+ SDValue &Hi, bool SplitSETCC) {
EVT LoVT, HiVT;
- SDLoc dl(MGT);
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));
-
- SDValue Ch = MGT->getChain();
- SDValue Ptr = MGT->getBasePtr();
- SDValue Mask = MGT->getMask();
- SDValue PassThru = MGT->getPassThru();
- SDValue Index = MGT->getIndex();
- SDValue Scale = MGT->getScale();
- EVT MemoryVT = MGT->getMemoryVT();
- Align Alignment = MGT->getOriginalAlign();
- ISD::LoadExtType ExtType = MGT->getExtensionType();
+ SDLoc dl(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ struct Operands {
+ SDValue Mask;
+ SDValue Index;
+ SDValue Scale;
+ } Ops = [&]() -> Operands {
+ if (auto *MSC = dyn_cast<MaskedGatherSDNode>(N)) {
+ return {MSC->getMask(), MSC->getIndex(), MSC->getScale()};
+ }
+ auto *VPSC = cast<VPGatherSDNode>(N);
+ return {VPSC->getMask(), VPSC->getIndex(), VPSC->getScale()};
+ }();
+
+ EVT MemoryVT = N->getMemoryVT();
+ Align Alignment = N->getOriginalAlign();
// Split Mask operand
SDValue MaskLo, MaskHi;
- if (Mask.getOpcode() == ISD::SETCC) {
- SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ if (SplitSETCC && Ops.Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Ops.Mask.getNode(), MaskLo, MaskHi);
} else {
- if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
- GetSplitVector(Mask, MaskLo, MaskHi);
- else
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ std::tie(MaskLo, MaskHi) = SplitMask(Ops.Mask, dl);
}
EVT LoMemVT, HiMemVT;
// Split MemoryVT
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
- SDValue PassThruLo, PassThruHi;
- if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
- GetSplitVector(PassThru, PassThruLo, PassThruHi);
- else
- std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
-
SDValue IndexHi, IndexLo;
- if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
- GetSplitVector(Index, IndexLo, IndexHi);
+ if (getTypeAction(Ops.Index.getValueType()) ==
+ TargetLowering::TypeSplitVector)
+ GetSplitVector(Ops.Index, IndexLo, IndexHi);
else
- std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, dl);
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MGT->getPointerInfo(), MachineMemOperand::MOLoad,
- MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),
- MGT->getRanges());
+ N->getPointerInfo(), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
+
+ if (auto *MGT = dyn_cast<MaskedGatherSDNode>(N)) {
+ SDValue PassThru = MGT->getPassThru();
+ SDValue PassThruLo, PassThruHi;
+ if (getTypeAction(PassThru.getValueType()) ==
+ TargetLowering::TypeSplitVector)
+ GetSplitVector(PassThru, PassThruLo, PassThruHi);
+ else
+ std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
- SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
- Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo,
- MMO, MGT->getIndexType(), ExtType);
+ ISD::LoadExtType ExtType = MGT->getExtensionType();
+ ISD::MemIndexType IndexTy = MGT->getIndexType();
- SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
- Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi,
- MMO, MGT->getIndexType(), ExtType);
+ SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Ops.Scale};
+ Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl,
+ OpsLo, MMO, IndexTy, ExtType);
+
+ SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Ops.Scale};
+ Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl,
+ OpsHi, MMO, IndexTy, ExtType);
+ } else {
+ auto *VPGT = cast<VPGatherSDNode>(N);
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(VPGT->getVectorLength(), MemoryVT, dl);
+
+ SDValue OpsLo[] = {Ch, Ptr, IndexLo, Ops.Scale, MaskLo, EVLLo};
+ Lo = DAG.getGatherVP(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo,
+ MMO, VPGT->getIndexType());
+
+ SDValue OpsHi[] = {Ch, Ptr, IndexHi, Ops.Scale, MaskHi, EVLHi};
+ Hi = DAG.getGatherVP(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi,
+ MMO, VPGT->getIndexType());
+ }
// Build a factor node to remember that this load is independent of the
// other one.
@@ -1928,10 +2017,9 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
- ReplaceValueWith(SDValue(MGT, 1), Ch);
+ ReplaceValueWith(SDValue(N, 1), Ch);
}
-
void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
assert(N->getValueType(0).isVector() &&
N->getOperand(0).getValueType().isVector() &&
@@ -2221,14 +2309,19 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::STORE:
Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
break;
+ case ISD::VP_STORE:
+ Res = SplitVecOp_VP_STORE(cast<VPStoreSDNode>(N), OpNo);
+ break;
case ISD::MSTORE:
Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
break;
case ISD::MSCATTER:
- Res = SplitVecOp_MSCATTER(cast<MaskedScatterSDNode>(N), OpNo);
+ case ISD::VP_SCATTER:
+ Res = SplitVecOp_Scatter(cast<MemSDNode>(N), OpNo);
break;
case ISD::MGATHER:
- Res = SplitVecOp_MGATHER(cast<MaskedGatherSDNode>(N), OpNo);
+ case ISD::VP_GATHER:
+ Res = SplitVecOp_Gather(cast<MemSDNode>(N), OpNo);
break;
case ISD::VSELECT:
Res = SplitVecOp_VSELECT(N, OpNo);
@@ -2285,6 +2378,23 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VECREDUCE_SEQ_FMUL:
Res = SplitVecOp_VECREDUCE_SEQ(N);
break;
+ case ISD::VP_REDUCE_FADD:
+ case ISD::VP_REDUCE_SEQ_FADD:
+ case ISD::VP_REDUCE_FMUL:
+ case ISD::VP_REDUCE_SEQ_FMUL:
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ case ISD::VP_REDUCE_FMAX:
+ case ISD::VP_REDUCE_FMIN:
+ Res = SplitVecOp_VP_REDUCE(N, OpNo);
+ break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -2381,6 +2491,33 @@ SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE_SEQ(SDNode *N) {
return DAG.getNode(N->getOpcode(), dl, ResVT, Partial, Hi, Flags);
}
+SDValue DAGTypeLegalizer::SplitVecOp_VP_REDUCE(SDNode *N, unsigned OpNo) {
+ assert(N->isVPOpcode() && "Expected VP opcode");
+ assert(OpNo == 1 && "Can only split reduce vector operand");
+
+ unsigned Opc = N->getOpcode();
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+
+ SDValue VecOp = N->getOperand(OpNo);
+ EVT VecVT = VecOp.getValueType();
+ assert(VecVT.isVector() && "Can only split reduce vector operand");
+ GetSplitVector(VecOp, Lo, Hi);
+
+ SDValue MaskLo, MaskHi;
+ std::tie(MaskLo, MaskHi) = SplitMask(N->getOperand(2));
+
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) = DAG.SplitEVL(N->getOperand(3), VecVT, dl);
+
+ const SDNodeFlags Flags = N->getFlags();
+
+ SDValue ResLo =
+ DAG.getNode(Opc, dl, ResVT, {N->getOperand(0), Lo, MaskLo, EVLLo}, Flags);
+ return DAG.getNode(Opc, dl, ResVT, {ResLo, Hi, MaskHi, EVLHi}, Flags);
+}
+
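
A standalone sketch of the chaining the split above produces for a vp.reduce.add (scalar stand-ins, illustrative values; ordered floating-point reductions rely on the same low-then-high ordering): reducing the low half first and feeding its partial result in as the start value of the high half's reduction matches a single full-length reduction.

#include <array>
#include <cassert>

// Scalar stand-in for vp.reduce.add(Start, V) with a mask and an EVL.
int ReduceAdd(int Start, const int *V, const bool *M, unsigned EVL) {
  int Acc = Start;
  for (unsigned I = 0; I < EVL; ++I)
    if (M[I])
      Acc += V[I];
  return Acc;
}

int main() {
  std::array<int, 8> V{1, 2, 3, 4, 5, 6, 7, 8};
  std::array<bool, 8> M{true, true, false, true, true, true, true, false};
  int Start = 100;
  unsigned EVL = 6, Half = 4;

  int Full = ReduceAdd(Start, V.data(), M.data(), EVL);
  // Split: EVLLo = umin(6, 4) = 4, EVLHi = usubsat(6, 4) = 2.
  int ResLo = ReduceAdd(Start, V.data(), M.data(), /*EVLLo=*/4);
  int Res = ReduceAdd(ResLo, V.data() + Half, M.data() + Half, /*EVLHi=*/2);
  assert(Res == Full);
}
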
SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
@@ -2558,70 +2695,92 @@ SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), N->getValueType(0), Lo, Hi);
}
-SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
- unsigned OpNo) {
- EVT LoVT, HiVT;
- SDLoc dl(MGT);
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));
-
- SDValue Ch = MGT->getChain();
- SDValue Ptr = MGT->getBasePtr();
- SDValue Index = MGT->getIndex();
- SDValue Scale = MGT->getScale();
- SDValue Mask = MGT->getMask();
- SDValue PassThru = MGT->getPassThru();
- Align Alignment = MGT->getOriginalAlign();
- ISD::LoadExtType ExtType = MGT->getExtensionType();
+SDValue DAGTypeLegalizer::SplitVecOp_Gather(MemSDNode *N, unsigned OpNo) {
+ (void)OpNo;
+ SDValue Lo, Hi;
+ SplitVecRes_Gather(N, Lo, Hi);
- SDValue MaskLo, MaskHi;
- if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
- // Split Mask operand
- GetSplitVector(Mask, MaskLo, MaskHi);
- else
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, N, N->getValueType(0), Lo, Hi);
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+}
- EVT MemoryVT = MGT->getMemoryVT();
- EVT LoMemVT, HiMemVT;
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+SDValue DAGTypeLegalizer::SplitVecOp_VP_STORE(VPStoreSDNode *N, unsigned OpNo) {
+ assert(N->isUnindexed() && "Indexed vp_store of vector?");
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ SDValue Offset = N->getOffset();
+ assert(Offset.isUndef() && "Unexpected VP store offset");
+ SDValue Mask = N->getMask();
+ SDValue EVL = N->getVectorLength();
+ SDValue Data = N->getValue();
+ Align Alignment = N->getOriginalAlign();
+ SDLoc DL(N);
- SDValue PassThruLo, PassThruHi;
- if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
- GetSplitVector(PassThru, PassThruLo, PassThruHi);
+ SDValue DataLo, DataHi;
+ if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Data operand
+ GetSplitVector(Data, DataLo, DataHi);
else
- std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
+ std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
- SDValue IndexHi, IndexLo;
- if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
- GetSplitVector(Index, IndexLo, IndexHi);
- else
- std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
+ // Split Mask operand
+ SDValue MaskLo, MaskHi;
+ if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ }
+ EVT MemoryVT = N->getMemoryVT();
+ EVT LoMemVT, HiMemVT;
+ bool HiIsEmpty = false;
+ std::tie(LoMemVT, HiMemVT) =
+ DAG.GetDependentSplitDestVTs(MemoryVT, DataLo.getValueType(), &HiIsEmpty);
+
+ // Split EVL
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) = DAG.SplitEVL(EVL, Data.getValueType(), DL);
+
+ SDValue Lo, Hi;
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MGT->getPointerInfo(), MachineMemOperand::MOLoad,
- MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),
- MGT->getRanges());
+ N->getPointerInfo(), MachineMemOperand::MOStore,
+ MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
- SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
- SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl,
- OpsLo, MMO, MGT->getIndexType(), ExtType);
+ Lo = DAG.getStoreVP(Ch, DL, DataLo, Ptr, Offset, MaskLo, EVLLo, LoMemVT, MMO,
+ N->getAddressingMode(), N->isTruncatingStore(),
+ N->isCompressingStore());
- SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
- SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl,
- OpsHi, MMO, MGT->getIndexType(), ExtType);
+ // If the hi vp_store has zero storage size, only the lo vp_store is needed.
+ if (HiIsEmpty)
+ return Lo;
- // Build a factor node to remember that this load is independent of the
- // other one.
- Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
- Hi.getValue(1));
+ Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
+ N->isCompressingStore());
- // Legalize the chain result - switch anything that used the old chain to
- // use the new one.
- ReplaceValueWith(SDValue(MGT, 1), Ch);
+ MachinePointerInfo MPI;
+ if (LoMemVT.isScalableVector()) {
+ Alignment = commonAlignment(Alignment,
+ LoMemVT.getSizeInBits().getKnownMinSize() / 8);
+ MPI = MachinePointerInfo(N->getPointerInfo().getAddrSpace());
+ } else
+ MPI = N->getPointerInfo().getWithOffset(
+ LoMemVT.getStoreSize().getFixedSize());
- SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MGT->getValueType(0), Lo,
- Hi);
- ReplaceValueWith(SDValue(MGT, 0), Res);
- return SDValue();
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MPI, MachineMemOperand::MOStore, MemoryLocation::UnknownSize, Alignment,
+ N->getAAInfo(), N->getRanges());
+
+ Hi = DAG.getStoreVP(Ch, DL, DataHi, Ptr, Offset, MaskHi, EVLHi, HiMemVT, MMO,
+ N->getAddressingMode(), N->isTruncatingStore(),
+ N->isCompressingStore());
+
+ // Build a factor node to remember that this store is independent of the
+ // other one.
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
}
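
To recap the path above: the data, mask and EVL are each split, a lo vp_store is emitted, and if the hi half has zero storage size that lo store is the whole result; otherwise the base pointer is advanced past the lo half's storage (via IncrementMemoryAddress, with the pointer info reduced to just the address space in the scalable case) and a hi vp_store is emitted, chained to the lo one by a TokenFactor. A trivial standalone sketch of the fixed-length address arithmetic; the v8i32 split and the base address are assumptions for illustration, not values from the patch:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Assume a fixed-length v8i32 vp.store split into two v4i32 halves.
      const uint64_t EltBytes = 4;                        // i32
      const uint64_t LoNumElts = 4;                       // lanes in the lo store
      const uint64_t LoStoreBytes = LoNumElts * EltBytes;
      const uint64_t Base = 0x1000;                       // hypothetical base address
      std::printf("lo vp_store at %#llx, hi vp_store at %#llx\n",
                  (unsigned long long)Base,
                  (unsigned long long)(Base + LoStoreBytes));
      return 0;
    }
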
SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
@@ -2703,64 +2862,87 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
return Res;
}
-SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
- unsigned OpNo) {
- SDValue Ch = N->getChain();
+SDValue DAGTypeLegalizer::SplitVecOp_Scatter(MemSDNode *N, unsigned OpNo) {
+ SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
- SDValue Mask = N->getMask();
- SDValue Index = N->getIndex();
- SDValue Scale = N->getScale();
- SDValue Data = N->getValue();
EVT MemoryVT = N->getMemoryVT();
Align Alignment = N->getOriginalAlign();
SDLoc DL(N);
-
+ struct Operands {
+ SDValue Mask;
+ SDValue Index;
+ SDValue Scale;
+ SDValue Data;
+ } Ops = [&]() -> Operands {
+ if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) {
+ return {MSC->getMask(), MSC->getIndex(), MSC->getScale(),
+ MSC->getValue()};
+ }
+ auto *VPSC = cast<VPScatterSDNode>(N);
+ return {VPSC->getMask(), VPSC->getIndex(), VPSC->getScale(),
+ VPSC->getValue()};
+ }();
// Split all operands
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue DataLo, DataHi;
- if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
+ if (getTypeAction(Ops.Data.getValueType()) == TargetLowering::TypeSplitVector)
// Split Data operand
- GetSplitVector(Data, DataLo, DataHi);
+ GetSplitVector(Ops.Data, DataLo, DataHi);
else
- std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+ std::tie(DataLo, DataHi) = DAG.SplitVector(Ops.Data, DL);
// Split Mask operand
SDValue MaskLo, MaskHi;
- if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
- SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ if (OpNo == 1 && Ops.Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Ops.Mask.getNode(), MaskLo, MaskHi);
} else {
- if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
- GetSplitVector(Mask, MaskLo, MaskHi);
- else
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ std::tie(MaskLo, MaskHi) = SplitMask(Ops.Mask, DL);
}
SDValue IndexHi, IndexLo;
- if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
- GetSplitVector(Index, IndexLo, IndexHi);
+ if (getTypeAction(Ops.Index.getValueType()) ==
+ TargetLowering::TypeSplitVector)
+ GetSplitVector(Ops.Index, IndexLo, IndexHi);
else
- std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(Ops.Index, DL);
SDValue Lo;
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
N->getPointerInfo(), MachineMemOperand::MOStore,
MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
- SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale};
- Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), LoMemVT,
- DL, OpsLo, MMO, N->getIndexType(),
- N->isTruncatingStore());
+ if (auto *MSC = dyn_cast<MaskedScatterSDNode>(N)) {
+ SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Ops.Scale};
+ Lo =
+ DAG.getMaskedScatter(DAG.getVTList(MVT::Other), LoMemVT, DL, OpsLo, MMO,
+ MSC->getIndexType(), MSC->isTruncatingStore());
+
+ // The order of the Scatter operation after split is well defined. The "Hi"
+ // part comes after the "Lo". So these two operations should be chained one
+ // after another.
+ SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Ops.Scale};
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), HiMemVT, DL, OpsHi,
+ MMO, MSC->getIndexType(),
+ MSC->isTruncatingStore());
+ }
+ auto *VPSC = cast<VPScatterSDNode>(N);
+ SDValue EVLLo, EVLHi;
+ std::tie(EVLLo, EVLHi) =
+ DAG.SplitEVL(VPSC->getVectorLength(), Ops.Data.getValueType(), DL);
+
+ SDValue OpsLo[] = {Ch, DataLo, Ptr, IndexLo, Ops.Scale, MaskLo, EVLLo};
+ Lo = DAG.getScatterVP(DAG.getVTList(MVT::Other), LoMemVT, DL, OpsLo, MMO,
+ VPSC->getIndexType());
// The order of the Scatter operation after split is well defined. The "Hi"
// part comes after the "Lo". So these two operations should be chained one
// after another.
- SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Scale};
- return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), HiMemVT,
- DL, OpsHi, MMO, N->getIndexType(),
- N->isTruncatingStore());
+ SDValue OpsHi[] = {Lo, DataHi, Ptr, IndexHi, Ops.Scale, MaskHi, EVLHi};
+ return DAG.getScatterVP(DAG.getVTList(MVT::Other), HiMemVT, DL, OpsHi, MMO,
+ VPSC->getIndexType());
}
SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
@@ -3047,31 +3229,41 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
break;
case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break;
case ISD::VSELECT:
- case ISD::SELECT: Res = WidenVecRes_SELECT(N); break;
+ case ISD::SELECT:
+ case ISD::VP_SELECT:
+ case ISD::VP_MERGE:
+ Res = WidenVecRes_Select(N);
+ break;
case ISD::SELECT_CC: Res = WidenVecRes_SELECT_CC(N); break;
case ISD::SETCC: Res = WidenVecRes_SETCC(N); break;
case ISD::UNDEF: Res = WidenVecRes_UNDEF(N); break;
case ISD::VECTOR_SHUFFLE:
Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
break;
+ case ISD::VP_LOAD:
+ Res = WidenVecRes_VP_LOAD(cast<VPLoadSDNode>(N));
+ break;
case ISD::MLOAD:
Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
break;
case ISD::MGATHER:
Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N));
break;
+ case ISD::VP_GATHER:
+ Res = WidenVecRes_VP_GATHER(cast<VPGatherSDNode>(N));
+ break;
- case ISD::ADD:
- case ISD::AND:
- case ISD::MUL:
+ case ISD::ADD: case ISD::VP_ADD:
+ case ISD::AND: case ISD::VP_AND:
+ case ISD::MUL: case ISD::VP_MUL:
case ISD::MULHS:
case ISD::MULHU:
- case ISD::OR:
- case ISD::SUB:
- case ISD::XOR:
- case ISD::SHL:
- case ISD::SRA:
- case ISD::SRL:
+ case ISD::OR: case ISD::VP_OR:
+ case ISD::SUB: case ISD::VP_SUB:
+ case ISD::XOR: case ISD::VP_XOR:
+ case ISD::SHL: case ISD::VP_SHL:
+ case ISD::SRA: case ISD::VP_ASHR:
+ case ISD::SRL: case ISD::VP_LSHR:
case ISD::FMINNUM:
case ISD::FMAXNUM:
case ISD::FMINIMUM:
@@ -3088,7 +3280,21 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::USHLSAT:
case ISD::ROTL:
case ISD::ROTR:
- Res = WidenVecRes_Binary(N, /*IsVP*/ false);
+ // Vector-predicated binary op widening. Note that -- unlike the
+ // unpredicated versions -- we don't have to worry about trapping on
+ // operations like UDIV, FADD, etc., as we pass on the original vector
+ // length parameter. This means the widened elements containing garbage
+ // aren't active.
+ case ISD::VP_SDIV:
+ case ISD::VP_UDIV:
+ case ISD::VP_SREM:
+ case ISD::VP_UREM:
+ case ISD::VP_FADD:
+ case ISD::VP_FSUB:
+ case ISD::VP_FMUL:
+ case ISD::VP_FDIV:
+ case ISD::VP_FREM:
+ Res = WidenVecRes_Binary(N);
break;
case ISD::FADD:
@@ -3212,31 +3418,6 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FSHR:
Res = WidenVecRes_Ternary(N);
break;
- case ISD::VP_ADD:
- case ISD::VP_AND:
- case ISD::VP_MUL:
- case ISD::VP_OR:
- case ISD::VP_SUB:
- case ISD::VP_XOR:
- case ISD::VP_SHL:
- case ISD::VP_LSHR:
- case ISD::VP_ASHR:
- case ISD::VP_SDIV:
- case ISD::VP_UDIV:
- case ISD::VP_SREM:
- case ISD::VP_UREM:
- case ISD::VP_FADD:
- case ISD::VP_FSUB:
- case ISD::VP_FMUL:
- case ISD::VP_FDIV:
- case ISD::VP_FREM:
- // Vector-predicated binary op widening. Note that -- unlike the
- // unpredicated versions -- we don't have to worry about trapping on
- // operations like UDIV, FADD, etc., as we pass on the original vector
- // length parameter. This means the widened elements containing garbage
- // aren't active.
- Res = WidenVecRes_Binary(N, /*IsVP*/ true);
- break;
}
// If Res is null, the sub-method took care of registering the result.
@@ -3254,29 +3435,21 @@ SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3);
}
-SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N, bool IsVP) {
+SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
// Binary op widening.
SDLoc dl(N);
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(0));
SDValue InOp2 = GetWidenedVector(N->getOperand(1));
- if (!IsVP)
+ if (N->getNumOperands() == 2)
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2,
N->getFlags());
- // For VP operations, we must also widen the mask. Note that the mask type
- // may not actually need widening, leading it be split along with the VP
- // operation.
- // FIXME: This could lead to an infinite split/widen loop. We only handle the
- // case where the mask needs widening to an identically-sized type as the
- // vector inputs.
- SDValue Mask = N->getOperand(2);
- assert(getTypeAction(Mask.getValueType()) ==
- TargetLowering::TypeWidenVector &&
- "Unable to widen binary VP op");
- Mask = GetWidenedVector(Mask);
- assert(Mask.getValueType().getVectorElementCount() ==
- WidenVT.getVectorElementCount() &&
- "Unable to widen binary VP op");
+
+ assert(N->getNumOperands() == 4 && "Unexpected number of operands!");
+ assert(N->isVPOpcode() && "Expected VP opcode");
+
+ SDValue Mask =
+ GetWidenedMask(N->getOperand(2), WidenVT.getVectorElementCount());
return DAG.getNode(N->getOpcode(), dl, WidenVT,
{InOp1, InOp2, Mask, N->getOperand(3)}, N->getFlags());
}
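
As the comment above the VP cases notes, widening a VP binary op is safe because the original vector-length operand is passed through unchanged, so the lanes introduced by widening are never active. A scalar model of that discipline; the 4-lane shapes, values and the all-true mask are made up for illustration:

    #include <array>
    #include <cstdio>

    int main() {
      std::array<int, 4> A{1, 2, 3, /*widened garbage*/ 999};
      std::array<int, 4> B{10, 20, 30, /*widened garbage*/ -7};
      std::array<bool, 4> Mask{true, true, true, /*widened lane*/ true};
      unsigned EVL = 3;                 // original vector length, kept as-is
      std::array<int, 4> R{};           // result; inactive lanes left untouched
      for (unsigned I = 0; I < R.size(); ++I)
        if (I < EVL && Mask[I])
          R[I] = A[I] + B[I];
      for (int V : R)
        std::printf("%d ", V);          // 11 22 33 0 -- garbage never read
      std::printf("\n");
      return 0;
    }
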
@@ -4226,6 +4399,33 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
report_fatal_error("Unable to widen vector load");
}
+SDValue DAGTypeLegalizer::WidenVecRes_VP_LOAD(VPLoadSDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Mask = N->getMask();
+ SDValue EVL = N->getVectorLength();
+ ISD::LoadExtType ExtType = N->getExtensionType();
+ SDLoc dl(N);
+
+ // The mask should be widened as well
+ assert(getTypeAction(Mask.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+         "Unable to widen VP load");
+ Mask = GetWidenedVector(Mask);
+ assert(Mask.getValueType().getVectorElementCount() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Mask.getValueType())
+ .getVectorElementCount() &&
+ "Unable to widen vector load");
+
+ SDValue Res =
+ DAG.getLoadVP(N->getAddressingMode(), ExtType, WidenVT, dl, N->getChain(),
+ N->getBasePtr(), N->getOffset(), Mask, EVL,
+ N->getMemoryVT(), N->getMemOperand(), N->isExpandingLoad());
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0));
@@ -4289,6 +4489,29 @@ SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
return Res;
}
+SDValue DAGTypeLegalizer::WidenVecRes_VP_GATHER(VPGatherSDNode *N) {
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Mask = N->getMask();
+ SDValue Scale = N->getScale();
+ ElementCount WideEC = WideVT.getVectorElementCount();
+ SDLoc dl(N);
+
+ SDValue Index = GetWidenedVector(N->getIndex());
+ EVT WideMemVT = EVT::getVectorVT(*DAG.getContext(),
+ N->getMemoryVT().getScalarType(), WideEC);
+ Mask = GetWidenedMask(Mask, WideEC);
+
+ SDValue Ops[] = {N->getChain(), N->getBasePtr(), Index, Scale,
+ Mask, N->getVectorLength()};
+ SDValue Res = DAG.getGatherVP(DAG.getVTList(WideVT, MVT::Other), WideMemVT,
+ dl, Ops, N->getMemOperand(), N->getIndexType());
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
SDValue DAGTypeLegalizer::WidenVecRes_ScalarOp(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, N->getOperand(0));
@@ -4522,19 +4745,19 @@ SDValue DAGTypeLegalizer::WidenVSELECTMask(SDNode *N) {
return Mask;
}
-SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
+SDValue DAGTypeLegalizer::WidenVecRes_Select(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
ElementCount WidenEC = WidenVT.getVectorElementCount();
SDValue Cond1 = N->getOperand(0);
EVT CondVT = Cond1.getValueType();
+ unsigned Opcode = N->getOpcode();
if (CondVT.isVector()) {
if (SDValue WideCond = WidenVSELECTMask(N)) {
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
SDValue InOp2 = GetWidenedVector(N->getOperand(2));
assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
- return DAG.getNode(N->getOpcode(), SDLoc(N),
- WidenVT, WideCond, InOp1, InOp2);
+ return DAG.getNode(Opcode, SDLoc(N), WidenVT, WideCond, InOp1, InOp2);
}
EVT CondEltVT = CondVT.getVectorElementType();
@@ -4560,8 +4783,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
SDValue InOp2 = GetWidenedVector(N->getOperand(2));
assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
- return DAG.getNode(N->getOpcode(), SDLoc(N),
- WidenVT, Cond1, InOp1, InOp2);
+ return Opcode == ISD::VP_SELECT || Opcode == ISD::VP_MERGE
+ ? DAG.getNode(Opcode, SDLoc(N), WidenVT, Cond1, InOp1, InOp2,
+ N->getOperand(3))
+ : DAG.getNode(Opcode, SDLoc(N), WidenVT, Cond1, InOp1, InOp2);
}
SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
@@ -4711,9 +4936,11 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
case ISD::STORE: Res = WidenVecOp_STORE(N); break;
+ case ISD::VP_STORE: Res = WidenVecOp_VP_STORE(N, OpNo); break;
case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break;
case ISD::MGATHER: Res = WidenVecOp_MGATHER(N, OpNo); break;
case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break;
+ case ISD::VP_SCATTER: Res = WidenVecOp_VP_SCATTER(N, OpNo); break;
case ISD::SETCC: Res = WidenVecOp_SETCC(N); break;
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: Res = WidenVecOp_STRICT_FSETCC(N); break;
@@ -4766,6 +4993,23 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::VECREDUCE_SEQ_FMUL:
Res = WidenVecOp_VECREDUCE_SEQ(N);
break;
+ case ISD::VP_REDUCE_FADD:
+ case ISD::VP_REDUCE_SEQ_FADD:
+ case ISD::VP_REDUCE_FMUL:
+ case ISD::VP_REDUCE_SEQ_FMUL:
+ case ISD::VP_REDUCE_ADD:
+ case ISD::VP_REDUCE_MUL:
+ case ISD::VP_REDUCE_AND:
+ case ISD::VP_REDUCE_OR:
+ case ISD::VP_REDUCE_XOR:
+ case ISD::VP_REDUCE_SMAX:
+ case ISD::VP_REDUCE_SMIN:
+ case ISD::VP_REDUCE_UMAX:
+ case ISD::VP_REDUCE_UMIN:
+ case ISD::VP_REDUCE_FMAX:
+ case ISD::VP_REDUCE_FMIN:
+ Res = WidenVecOp_VP_REDUCE(N);
+ break;
}
// If Res is null, the sub-method took care of registering the result.
@@ -5092,15 +5336,54 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
unsigned NumVTElts = StVT.getVectorMinNumElements();
SDValue EVL =
DAG.getVScale(DL, EVLVT, APInt(EVLVT.getScalarSizeInBits(), NumVTElts));
- const auto *MMO = ST->getMemOperand();
- return DAG.getStoreVP(ST->getChain(), DL, StVal, ST->getBasePtr(), Mask,
- EVL, MMO->getPointerInfo(), MMO->getAlign(),
- MMO->getFlags(), MMO->getAAInfo());
+ return DAG.getStoreVP(ST->getChain(), DL, StVal, ST->getBasePtr(),
+ DAG.getUNDEF(ST->getBasePtr().getValueType()), Mask,
+ EVL, StVal.getValueType(), ST->getMemOperand(),
+ ST->getAddressingMode());
}
report_fatal_error("Unable to widen vector store");
}
+SDValue DAGTypeLegalizer::WidenVecOp_VP_STORE(SDNode *N, unsigned OpNo) {
+ assert((OpNo == 1 || OpNo == 3) &&
+ "Can widen only data or mask operand of vp_store");
+ VPStoreSDNode *ST = cast<VPStoreSDNode>(N);
+ SDValue Mask = ST->getMask();
+ SDValue StVal = ST->getValue();
+ SDLoc dl(N);
+
+ if (OpNo == 1) {
+ // Widen the value.
+ StVal = GetWidenedVector(StVal);
+
+ // We only handle the case where the mask needs widening to an
+ // identically-sized type as the vector inputs.
+ assert(getTypeAction(Mask.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unable to widen VP store");
+ Mask = GetWidenedVector(Mask);
+ } else {
+ Mask = GetWidenedVector(Mask);
+
+ // We only handle the case where the stored value needs widening to an
+ // identically-sized type as the mask.
+ assert(getTypeAction(StVal.getValueType()) ==
+ TargetLowering::TypeWidenVector &&
+ "Unable to widen VP store");
+ StVal = GetWidenedVector(StVal);
+ }
+
+ assert(Mask.getValueType().getVectorElementCount() ==
+ StVal.getValueType().getVectorElementCount() &&
+ "Mask and data vectors should have the same number of elements");
+ return DAG.getStoreVP(ST->getChain(), dl, StVal, ST->getBasePtr(),
+ ST->getOffset(), Mask, ST->getVectorLength(),
+ ST->getMemoryVT(), ST->getMemOperand(),
+ ST->getAddressingMode(), ST->isTruncatingStore(),
+ ST->isCompressingStore());
+}
+
SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
assert((OpNo == 1 || OpNo == 3) &&
"Can widen only data or mask operand of mstore");
@@ -5202,6 +5485,34 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
MSC->isTruncatingStore());
}
+SDValue DAGTypeLegalizer::WidenVecOp_VP_SCATTER(SDNode *N, unsigned OpNo) {
+ VPScatterSDNode *VPSC = cast<VPScatterSDNode>(N);
+ SDValue DataOp = VPSC->getValue();
+ SDValue Mask = VPSC->getMask();
+ SDValue Index = VPSC->getIndex();
+ SDValue Scale = VPSC->getScale();
+ EVT WideMemVT = VPSC->getMemoryVT();
+
+ if (OpNo == 1) {
+ DataOp = GetWidenedVector(DataOp);
+ Index = GetWidenedVector(Index);
+ const auto WideEC = DataOp.getValueType().getVectorElementCount();
+ Mask = GetWidenedMask(Mask, WideEC);
+ WideMemVT = EVT::getVectorVT(*DAG.getContext(),
+ VPSC->getMemoryVT().getScalarType(), WideEC);
+ } else if (OpNo == 4) {
+ // Just widen the index. It's allowed to have extra elements.
+ Index = GetWidenedVector(Index);
+ } else
+    llvm_unreachable("Can't widen this operand of vp_scatter");
+
+ SDValue Ops[] = {
+ VPSC->getChain(), DataOp, VPSC->getBasePtr(), Index, Scale, Mask,
+ VPSC->getVectorLength()};
+ return DAG.getScatterVP(DAG.getVTList(MVT::Other), WideMemVT, SDLoc(N), Ops,
+ VPSC->getMemOperand(), VPSC->getIndexType());
+}
+
SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
SDValue InOp0 = GetWidenedVector(N->getOperand(0));
SDValue InOp1 = GetWidenedVector(N->getOperand(1));
@@ -5320,6 +5631,19 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE_SEQ(SDNode *N) {
return DAG.getNode(Opc, dl, N->getValueType(0), AccOp, Op, Flags);
}
+SDValue DAGTypeLegalizer::WidenVecOp_VP_REDUCE(SDNode *N) {
+ assert(N->isVPOpcode() && "Expected VP opcode");
+
+ SDLoc dl(N);
+ SDValue Op = GetWidenedVector(N->getOperand(1));
+ SDValue Mask = GetWidenedMask(N->getOperand(2),
+ Op.getValueType().getVectorElementCount());
+
+ return DAG.getNode(N->getOpcode(), dl, N->getValueType(0),
+ {N->getOperand(0), Op, Mask, N->getOperand(3)},
+ N->getFlags());
+}
+
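
WidenVecOp_VP_REDUCE relies on the same property: only lanes below EVL with a true mask bit are folded into the accumulator, so widening the vector operand and mask while keeping the original EVL cannot change the result. A minimal scalar model; the start value, lane contents and EVL are arbitrary picks:

    #include <array>
    #include <cstdio>

    int main() {
      std::array<int, 4> Widened{5, 7, 9, /*lane added by widening*/ 12345};
      std::array<bool, 4> Mask{true, true, true, true};
      unsigned EVL = 3;   // original vector length is preserved by the widening
      int Acc = 100;      // start value (operand 0 of the VP reduction)
      for (unsigned I = 0; I < Widened.size(); ++I)
        if (I < EVL && Mask[I])
          Acc += Widened[I];
      std::printf("vp.reduce.add result = %d\n", Acc); // 121; extra lane ignored
      return 0;
    }
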
SDValue DAGTypeLegalizer::WidenVecOp_VSELECT(SDNode *N) {
// This only gets called in the case that the left and right inputs and
// result are of a legal odd vector type, and the condition is illegal i1 of
@@ -5779,6 +6103,8 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
EVT InVT = InOp.getValueType();
assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
"input and widen element type must match");
+ assert(!InVT.isScalableVector() && !NVT.isScalableVector() &&
+ "cannot modify scalable vectors in this way");
SDLoc dl(InOp);
// Check if InOp already has the right width.
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index aec2cf38b400..403f34573899 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -286,7 +286,7 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
// Cluster loads by adding MVT::Glue outputs and inputs. This also
  // ensures they are scheduled in order of increasing addresses.
SDNode *Lead = Loads[0];
- SDValue InGlue = SDValue(nullptr, 0);
+ SDValue InGlue;
if (AddGlue(Lead, InGlue, true, DAG))
InGlue = SDValue(Lead, Lead->getNumValues() - 1);
for (unsigned I = 1, E = Loads.size(); I != E; ++I) {
@@ -1057,12 +1057,13 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
"first terminator cannot be a debug value");
for (MachineInstr &MI : make_early_inc_range(
make_range(std::next(FirstTerm), InsertBB->end()))) {
+      // Only scan up to the insertion point.
+ if (&MI == InsertPos)
+ break;
+
if (!MI.isDebugValue())
continue;
- if (&MI == InsertPos)
- InsertPos = std::prev(InsertPos->getIterator());
-
// The DBG_VALUE was referencing a value produced by a terminator. By
// moving the DBG_VALUE, the referenced value also needs invalidating.
MI.getOperand(0).ChangeToRegister(0, false);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 2ae0d4df7b77..45f3005e8f57 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -373,31 +373,46 @@ ISD::NodeType ISD::getVecReduceBaseOpcode(unsigned VecReduceOpcode) {
llvm_unreachable("Expected VECREDUCE opcode");
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_SEQ_FADD:
+ case ISD::VP_REDUCE_FADD:
+ case ISD::VP_REDUCE_SEQ_FADD:
return ISD::FADD;
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_SEQ_FMUL:
+ case ISD::VP_REDUCE_FMUL:
+ case ISD::VP_REDUCE_SEQ_FMUL:
return ISD::FMUL;
case ISD::VECREDUCE_ADD:
+ case ISD::VP_REDUCE_ADD:
return ISD::ADD;
case ISD::VECREDUCE_MUL:
+ case ISD::VP_REDUCE_MUL:
return ISD::MUL;
case ISD::VECREDUCE_AND:
+ case ISD::VP_REDUCE_AND:
return ISD::AND;
case ISD::VECREDUCE_OR:
+ case ISD::VP_REDUCE_OR:
return ISD::OR;
case ISD::VECREDUCE_XOR:
+ case ISD::VP_REDUCE_XOR:
return ISD::XOR;
case ISD::VECREDUCE_SMAX:
+ case ISD::VP_REDUCE_SMAX:
return ISD::SMAX;
case ISD::VECREDUCE_SMIN:
+ case ISD::VP_REDUCE_SMIN:
return ISD::SMIN;
case ISD::VECREDUCE_UMAX:
+ case ISD::VP_REDUCE_UMAX:
return ISD::UMAX;
case ISD::VECREDUCE_UMIN:
+ case ISD::VP_REDUCE_UMIN:
return ISD::UMIN;
case ISD::VECREDUCE_FMAX:
+ case ISD::VP_REDUCE_FMAX:
return ISD::FMAXNUM;
case ISD::VECREDUCE_FMIN:
+ case ISD::VP_REDUCE_FMIN:
return ISD::FMINNUM;
}
}
@@ -3066,7 +3081,8 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
case ISD::MUL: {
Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- Known = KnownBits::mul(Known, Known2);
+ bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1);
+ Known = KnownBits::mul(Known, Known2, SelfMultiply);
break;
}
case ISD::MULHU: {
@@ -3085,8 +3101,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
assert((Op.getResNo() == 0 || Op.getResNo() == 1) && "Unknown result");
Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1);
if (Op.getResNo() == 0)
- Known = KnownBits::mul(Known, Known2);
+ Known = KnownBits::mul(Known, Known2, SelfMultiply);
else
Known = KnownBits::mulhu(Known, Known2);
break;
@@ -3095,8 +3112,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
assert((Op.getResNo() == 0 || Op.getResNo() == 1) && "Unknown result");
Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1);
if (Op.getResNo() == 0)
- Known = KnownBits::mul(Known, Known2);
+ Known = KnownBits::mul(Known, Known2, SelfMultiply);
else
Known = KnownBits::mulhs(Known, Known2);
break;
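
Passing the SelfMultiply flag tells KnownBits::mul that both factors are the same value, which admits extra facts about squares; for instance, x*x is always 0 or 1 modulo 4, so bit 1 of a square is known zero. Whether KnownBits::mul exploits exactly that property is an assumption here, but the arithmetic itself is easy to verify:

    #include <cstdint>
    #include <cstdio>

    int main() {
      bool Bit1AlwaysZero = true;
      for (uint32_t X = 0; X < (1u << 16); ++X)
        Bit1AlwaysZero &= (((X * X) >> 1) & 1u) == 0;  // x*x mod 4 is 0 or 1
      std::printf("bit 1 of X*X is always zero: %s\n",
                  Bit1AlwaysZero ? "yes" : "no");
      return 0;
    }
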
@@ -3363,6 +3381,8 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
case ISD::AssertAlign: {
unsigned LogOfAlign = Log2(cast<AssertAlignSDNode>(Op)->getAlign());
assert(LogOfAlign != 0);
+
+    // TODO: Should take the maximum with the alignment known for the source.
// If a node is guaranteed to be aligned, set low zero bits accordingly as
// well as clearing one bits.
Known.Zero.setLowBits(LogOfAlign);
@@ -3584,6 +3604,12 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known = KnownBits::smin(Known, Known2);
break;
}
+ case ISD::FP_TO_UINT_SAT: {
+ // FP_TO_UINT_SAT produces an unsigned value that fits in the saturating VT.
+ EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ Known.Zero |= APInt::getBitsSetFrom(BitWidth, VT.getScalarSizeInBits());
+ break;
+ }
case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
if (Op.getResNo() == 1) {
// The boolean result conforms to getBooleanContents.
@@ -3860,6 +3886,10 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
break;
}
+ case ISD::FP_TO_SINT_SAT:
+ // FP_TO_SINT_SAT produces a signed value that fits in the saturating VT.
+ Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getScalarSizeInBits();
+ return VTBits - Tmp + 1;
case ISD::SIGN_EXTEND:
Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
return ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1) + Tmp;
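
The two saturating-conversion cases simply encode the range guaranteed by the saturation type: FP_TO_UINT_SAT clears every bit at or above the saturation width, and FP_TO_SINT_SAT yields at least VTBits - SatBits + 1 sign bits. Worked numbers for an i32 result saturated to i8; the widths are an illustrative choice:

    #include <cstdio>

    int main() {
      const unsigned VTBits = 32, SatBits = 8;  // i32 result, saturating to i8
      // The value is confined to [-128, 127], so the top 24 bits repeat the
      // sign bit, and the sign bit itself counts: 32 - 8 + 1 = 25 sign bits.
      std::printf("min sign bits = %u\n", VTBits - SatBits + 1);
      return 0;
    }
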
@@ -4252,7 +4282,8 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
// scalar cases.
Type *CstTy = Cst->getType();
if (CstTy->isVectorTy() &&
- (NumElts * VTBits) == CstTy->getPrimitiveSizeInBits()) {
+ (NumElts * VTBits) == CstTy->getPrimitiveSizeInBits() &&
+ VTBits == CstTy->getScalarSizeInBits()) {
Tmp = VTBits;
for (unsigned i = 0; i != NumElts; ++i) {
if (!DemandedElts[i])
@@ -4294,31 +4325,18 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
// Finally, if we can prove that the top bits of the result are 0's or 1's,
// use this information.
KnownBits Known = computeKnownBits(Op, DemandedElts, Depth);
-
- APInt Mask;
- if (Known.isNonNegative()) { // sign bit is 0
- Mask = Known.Zero;
- } else if (Known.isNegative()) { // sign bit is 1;
- Mask = Known.One;
- } else {
- // Nothing known.
- return FirstAnswer;
- }
-
- // Okay, we know that the sign bit in Mask is set. Use CLO to determine
- // the number of identical bits in the top of the input value.
- Mask <<= Mask.getBitWidth()-VTBits;
- return std::max(FirstAnswer, Mask.countLeadingOnes());
+ return std::max(FirstAnswer, Known.countMinSignBits());
}
-unsigned SelectionDAG::ComputeMinSignedBits(SDValue Op, unsigned Depth) const {
+unsigned SelectionDAG::ComputeMaxSignificantBits(SDValue Op,
+ unsigned Depth) const {
unsigned SignBits = ComputeNumSignBits(Op, Depth);
return Op.getScalarValueSizeInBits() - SignBits + 1;
}
-unsigned SelectionDAG::ComputeMinSignedBits(SDValue Op,
- const APInt &DemandedElts,
- unsigned Depth) const {
+unsigned SelectionDAG::ComputeMaxSignificantBits(SDValue Op,
+ const APInt &DemandedElts,
+ unsigned Depth) const {
unsigned SignBits = ComputeNumSignBits(Op, DemandedElts, Depth);
return Op.getScalarValueSizeInBits() - SignBits + 1;
}
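
The renamed helper returns the number of bits needed to hold the value as a signed integer, i.e. scalar width minus the known sign bits plus one. A standalone check of that formula on a few 32-bit constants; numSignBits below is a naive stand-in, not the DAG routine:

    #include <cstdint>
    #include <cstdio>

    // Naive stand-in for ComputeNumSignBits on a 32-bit constant.
    static unsigned numSignBits(int32_t V) {
      uint32_t U = static_cast<uint32_t>(V);
      unsigned Sign = U >> 31;
      unsigned Bits = 1;
      while (Bits < 32 && ((U >> (31 - Bits)) & 1u) == Sign)
        ++Bits;
      return Bits;
    }

    int main() {
      for (int32_t V : {-128, -1, 0, 127, 70000}) {
        unsigned SB = numSignBits(V);
        std::printf("%7d: sign bits = %2u, max significant bits = %2u\n",
                    static_cast<int>(V), SB, 32 - SB + 1);
      }
      return 0;
    }
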
@@ -5102,6 +5120,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
"BSWAP types must be a multiple of 16 bits!");
if (OpOpcode == ISD::UNDEF)
return getUNDEF(VT);
+ // bswap(bswap(X)) -> X.
+ if (OpOpcode == ISD::BSWAP)
+ return Operand.getOperand(0);
break;
case ISD::BITREVERSE:
assert(VT.isInteger() && VT == Operand.getValueType() &&
@@ -5398,6 +5419,19 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
}
}
+ // Fold (mul step_vector(C0), C1) to (step_vector(C0 * C1)).
+ // (shl step_vector(C0), C1) -> (step_vector(C0 << C1))
+ if ((Opcode == ISD::MUL || Opcode == ISD::SHL) &&
+ Ops[0].getOpcode() == ISD::STEP_VECTOR) {
+ APInt RHSVal;
+ if (ISD::isConstantSplatVector(Ops[1].getNode(), RHSVal)) {
+ APInt NewStep = Opcode == ISD::MUL
+ ? Ops[0].getConstantOperandAPInt(0) * RHSVal
+ : Ops[0].getConstantOperandAPInt(0) << RHSVal;
+ return getStepVector(DL, VT, NewStep);
+ }
+ }
+
auto IsScalarOrSameVectorSize = [NumElts](const SDValue &Op) {
return !Op.getValueType().isVector() ||
Op.getValueType().getVectorElementCount() == NumElts;
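
The new fold just distributes the splat constant into the step: every lane of step_vector(C0) is Lane*C0, so multiplying by a splat C1 (or shifting by it) gives Lane*(C0*C1), which is again a step_vector. A quick numeric check; C0, C1 and the lane count are arbitrary picks:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t C0 = 2, C1 = 3;  // step of the step_vector and the splat
      for (int64_t Lane = 0; Lane < 4; ++Lane) {
        int64_t Unfolded = (Lane * C0) * C1;  // mul(step_vector(C0), splat(C1))
        int64_t Folded = Lane * (C0 * C1);    // step_vector(C0 * C1)
        std::printf("lane %lld: %lld == %lld\n", (long long)Lane,
                    (long long)Unfolded, (long long)Folded);
      }
      return 0;
    }
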
@@ -5595,22 +5629,24 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(N1.getOpcode() != ISD::DELETED_NODE &&
N2.getOpcode() != ISD::DELETED_NODE &&
"Operand is DELETED_NODE!");
- ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
- ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
- ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
- ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
-
// Canonicalize constant to RHS if commutative.
if (TLI->isCommutativeBinOp(Opcode)) {
- if (N1C && !N2C) {
- std::swap(N1C, N2C);
+ bool IsN1C = isConstantIntBuildVectorOrConstantInt(N1);
+ bool IsN2C = isConstantIntBuildVectorOrConstantInt(N2);
+ bool IsN1CFP = isConstantFPBuildVectorOrConstantFP(N1);
+ bool IsN2CFP = isConstantFPBuildVectorOrConstantFP(N2);
+ if ((IsN1C && !IsN2C) || (IsN1CFP && !IsN2CFP))
std::swap(N1, N2);
- } else if (N1CFP && !N2CFP) {
- std::swap(N1CFP, N2CFP);
- std::swap(N1, N2);
- }
}
+ auto *N1C = dyn_cast<ConstantSDNode>(N1);
+ auto *N2C = dyn_cast<ConstantSDNode>(N2);
+
+ // Don't allow undefs in vector splats - we might be returning N2 when folding
+ // to zero etc.
+ ConstantSDNode *N2CV =
+ isConstOrConstSplat(N2, /*AllowUndefs*/ false, /*AllowTruncation*/ true);
+
switch (Opcode) {
default: break;
case ISD::TokenFactor:
@@ -5640,9 +5676,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
N1.getValueType() == VT && "Binary operator types must match!");
// (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's
// worth handling here.
- if (N2C && N2C->isZero())
+ if (N2CV && N2CV->isZero())
return N2;
- if (N2C && N2C->isAllOnes()) // X & -1 -> X
+ if (N2CV && N2CV->isAllOnes()) // X & -1 -> X
return N1;
break;
case ISD::OR:
@@ -5654,7 +5690,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
N1.getValueType() == VT && "Binary operator types must match!");
// (X ^|+- 0) -> X. This commonly occurs when legalizing i64 values, so
// it's worth handling here.
- if (N2C && N2C->isZero())
+ if (N2CV && N2CV->isZero())
return N1;
if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && VT.isVector() &&
VT.getVectorElementType() == MVT::i1)
@@ -5760,7 +5796,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// size of the value, the shift/rotate count is guaranteed to be zero.
if (VT == MVT::i1)
return N1;
- if (N2C && N2C->isZero())
+ if (N2CV && N2CV->isZero())
return N1;
break;
case ISD::FP_ROUND:
@@ -6358,7 +6394,7 @@ static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
if (TLI.shouldConvertConstantLoadToIntImm(Val, Ty))
return DAG.getConstant(Val, dl, VT);
- return SDValue(nullptr, 0);
+ return SDValue();
}
SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, TypeSize Offset,
@@ -7697,23 +7733,6 @@ SDValue SelectionDAG::getLoadVP(ISD::MemIndexedMode AM,
SDValue Offset, SDValue Mask, SDValue EVL,
EVT MemVT, MachineMemOperand *MMO,
bool IsExpanding) {
- if (VT == MemVT) {
- ExtType = ISD::NON_EXTLOAD;
- } else if (ExtType == ISD::NON_EXTLOAD) {
- assert(VT == MemVT && "Non-extending load from different memory type!");
- } else {
- // Extending load.
- assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
- "Should only be an extending load, not truncating!");
- assert(VT.isInteger() == MemVT.isInteger() &&
- "Cannot convert from FP to Int or Int -> FP!");
- assert(VT.isVector() == MemVT.isVector() &&
- "Cannot use an ext load to convert to or from a vector!");
- assert((!VT.isVector() ||
- VT.getVectorElementCount() == MemVT.getVectorElementCount()) &&
- "Cannot use an ext load to change the number of vector elements!");
- }
-
bool Indexed = AM != ISD::UNINDEXED;
assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");
@@ -7802,48 +7821,29 @@ SDValue SelectionDAG::getIndexedLoadVP(SDValue OrigLoad, const SDLoc &dl,
}
SDValue SelectionDAG::getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val,
- SDValue Ptr, SDValue Mask, SDValue EVL,
- MachinePointerInfo PtrInfo, Align Alignment,
- MachineMemOperand::Flags MMOFlags,
- const AAMDNodes &AAInfo, bool IsCompressing) {
+ SDValue Ptr, SDValue Offset, SDValue Mask,
+ SDValue EVL, EVT MemVT, MachineMemOperand *MMO,
+ ISD::MemIndexedMode AM, bool IsTruncating,
+ bool IsCompressing) {
assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
-
- MMOFlags |= MachineMemOperand::MOStore;
- assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
-
- if (PtrInfo.V.isNull())
- PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
-
- MachineFunction &MF = getMachineFunction();
- uint64_t Size =
- MemoryLocation::getSizeOrUnknown(Val.getValueType().getStoreSize());
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(PtrInfo, MMOFlags, Size, Alignment, AAInfo);
- return getStoreVP(Chain, dl, Val, Ptr, Mask, EVL, MMO, IsCompressing);
-}
-
-SDValue SelectionDAG::getStoreVP(SDValue Chain, const SDLoc &dl, SDValue Val,
- SDValue Ptr, SDValue Mask, SDValue EVL,
- MachineMemOperand *MMO, bool IsCompressing) {
- assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
- EVT VT = Val.getValueType();
- SDVTList VTs = getVTList(MVT::Other);
- SDValue Undef = getUNDEF(Ptr.getValueType());
- SDValue Ops[] = {Chain, Val, Ptr, Undef, Mask, EVL};
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.isUndef()) && "Unindexed vp_store with an offset!");
+ SDVTList VTs = Indexed ? getVTList(Ptr.getValueType(), MVT::Other)
+ : getVTList(MVT::Other);
+ SDValue Ops[] = {Chain, Val, Ptr, Offset, Mask, EVL};
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::VP_STORE, VTs, Ops);
- ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(MemVT.getRawBits());
ID.AddInteger(getSyntheticNodeSubclassData<VPStoreSDNode>(
- dl.getIROrder(), VTs, ISD::UNINDEXED, false, IsCompressing, VT, MMO));
+ dl.getIROrder(), VTs, AM, IsTruncating, IsCompressing, MemVT, MMO));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<VPStoreSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);
}
- auto *N =
- newSDNode<VPStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
- ISD::UNINDEXED, false, IsCompressing, VT, MMO);
+ auto *N = newSDNode<VPStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ IsTruncating, IsCompressing, MemVT, MMO);
createOperands(N, Ops);
CSEMap.InsertNode(N, IP);
@@ -7885,7 +7885,9 @@ SDValue SelectionDAG::getTruncStoreVP(SDValue Chain, const SDLoc &dl,
assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
if (VT == SVT)
- return getStoreVP(Chain, dl, Val, Ptr, Mask, EVL, MMO, IsCompressing);
+ return getStoreVP(Chain, dl, Val, Ptr, getUNDEF(Ptr.getValueType()), Mask,
+ EVL, VT, MMO, ISD::UNINDEXED,
+ /*IsTruncating*/ false, IsCompressing);
assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
"Should only be a truncating store, not extending!");
@@ -10661,6 +10663,23 @@ SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
return std::make_pair(Lo, Hi);
}
+std::pair<SDValue, SDValue> SelectionDAG::SplitEVL(SDValue N, EVT VecVT,
+ const SDLoc &DL) {
+ // Split the vector length parameter.
+  // %evl -> umin(%evl, %halfnumelts) and usubsat(%evl, %halfnumelts).
+ EVT VT = N.getValueType();
+ assert(VecVT.getVectorElementCount().isKnownEven() &&
+         "Expecting the vector to have an even number of elements");
+ unsigned HalfMinNumElts = VecVT.getVectorMinNumElements() / 2;
+ SDValue HalfNumElts =
+ VecVT.isFixedLengthVector()
+ ? getConstant(HalfMinNumElts, DL, VT)
+ : getVScale(DL, VT, APInt(VT.getScalarSizeInBits(), HalfMinNumElts));
+ SDValue Lo = getNode(ISD::UMIN, DL, VT, N, HalfNumElts);
+ SDValue Hi = getNode(ISD::USUBSAT, DL, VT, N, HalfNumElts);
+ return std::make_pair(Lo, Hi);
+}
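
SplitEVL gives the lo half min(EVL, HalfNumElts) active lanes and the hi half whatever is left, clamped at zero, via the UMIN/USUBSAT pair. A plain-integer model of that split for a vector broken into two 4-lane halves; the half size and sample EVLs are illustrative:

    #include <algorithm>
    #include <cstdio>

    int main() {
      const unsigned HalfNumElts = 4;  // e.g. v8i32 split into two v4i32 halves
      for (unsigned EVL : {0u, 3u, 4u, 6u, 8u}) {
        unsigned Lo = std::min(EVL, HalfNumElts);                  // UMIN
        unsigned Hi = EVL > HalfNumElts ? EVL - HalfNumElts : 0u;  // USUBSAT
        std::printf("EVL=%u -> lo half handles %u lanes, hi half %u\n",
                    EVL, Lo, Hi);
      }
      return 0;
    }
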
+
/// Widen the vector up to the next power of two using INSERT_SUBVECTOR.
SDValue SelectionDAG::WidenVector(const SDValue &N, const SDLoc &DL) {
EVT VT = N.getValueType();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 63cd723cf6da..41460f78e1c2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1683,6 +1683,8 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
if (const MetadataAsValue *MD = dyn_cast<MetadataAsValue>(V)) {
return DAG.getMDNode(cast<MDNode>(MD->getMetadata()));
}
+ if (const auto *BB = dyn_cast<BasicBlock>(V))
+ return DAG.getBasicBlock(FuncInfo.MBBMap[BB]);
llvm_unreachable("Can't get register for value!");
}
@@ -4846,10 +4848,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
}
if (!I.getType()->isVoidTy()) {
- if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
- EVT VT = TLI.getValueType(DAG.getDataLayout(), PTy);
- Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result);
- } else
+ if (!isa<VectorType>(I.getType()))
Result = lowerRangeToAssertZExt(DAG, I, Result);
MaybeAlign Alignment = I.getRetAlign();
@@ -7327,8 +7326,6 @@ void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
Value *PtrOperand = VPIntrin.getArgOperand(0);
MaybeAlign Alignment = VPIntrin.getPointerAlignment();
- if (!Alignment)
- Alignment = DAG.getEVTAlign(VT);
AAMDNodes AAInfo = VPIntrin.getAAMetadata();
const MDNode *Ranges = VPIntrin.getMetadata(LLVMContext::MD_range);
SDValue LD;
@@ -7336,6 +7333,8 @@ void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT,
if (!IsGather) {
// Do not serialize variable-length loads of constant memory with
// anything.
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT);
MemoryLocation ML = MemoryLocation::getAfter(PtrOperand, AAInfo);
AddToChain = !AA || !AA->pointsToConstantMemory(ML);
SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
@@ -7345,6 +7344,8 @@ void SelectionDAGBuilder::visitVPLoadGather(const VPIntrinsic &VPIntrin, EVT VT,
LD = DAG.getLoadVP(VT, DL, InChain, OpValues[0], OpValues[1], OpValues[2],
MMO, false /*IsExpanding */);
} else {
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT.getScalarType());
unsigned AS =
PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
@@ -7385,18 +7386,22 @@ void SelectionDAGBuilder::visitVPStoreScatter(const VPIntrinsic &VPIntrin,
Value *PtrOperand = VPIntrin.getArgOperand(1);
EVT VT = OpValues[0].getValueType();
MaybeAlign Alignment = VPIntrin.getPointerAlignment();
- if (!Alignment)
- Alignment = DAG.getEVTAlign(VT);
AAMDNodes AAInfo = VPIntrin.getAAMetadata();
SDValue ST;
if (!IsScatter) {
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT);
+ SDValue Ptr = OpValues[1];
+ SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
MemoryLocation::UnknownSize, *Alignment, AAInfo);
- ST =
- DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], OpValues[1],
- OpValues[2], OpValues[3], MMO, false /* IsTruncating */);
+ ST = DAG.getStoreVP(getMemoryRoot(), DL, OpValues[0], Ptr, Offset,
+ OpValues[2], OpValues[3], VT, MMO, ISD::UNINDEXED,
+ /* IsTruncating */ false, /*IsCompressing*/ false);
} else {
+ if (!Alignment)
+ Alignment = DAG.getEVTAlign(VT.getScalarType());
unsigned AS =
PtrOperand->getType()->getScalarType()->getPointerAddressSpace();
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
@@ -8250,7 +8255,8 @@ public:
/// corresponds to. If there is no Value* for this operand, it returns
/// MVT::Other.
EVT getCallOperandValEVT(LLVMContext &Context, const TargetLowering &TLI,
- const DataLayout &DL) const {
+ const DataLayout &DL,
+ llvm::Type *ParamElemType) const {
if (!CallOperandVal) return MVT::Other;
if (isa<BasicBlock>(CallOperandVal))
@@ -8262,10 +8268,8 @@ public:
// If this is an indirect operand, the operand is a pointer to the
// accessed type.
if (isIndirect) {
- PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
- if (!PtrTy)
- report_fatal_error("Indirect operand for inline asm not a pointer!");
- OpTy = PtrTy->getElementType();
+ OpTy = ParamElemType;
+      assert(OpTy && "Indirect operand must have elementtype attribute");
}
// Look for vector wrapped in a struct. e.g. { <16 x i8> }.
@@ -8559,37 +8563,19 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
unsigned ResNo = 0; // ResNo - The result number of the next output.
- unsigned NumMatchingOps = 0;
for (auto &T : TargetConstraints) {
ConstraintOperands.push_back(SDISelAsmOperandInfo(T));
SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back();
// Compute the value type for each operand.
- if (OpInfo.Type == InlineAsm::isInput ||
- (OpInfo.Type == InlineAsm::isOutput && OpInfo.isIndirect)) {
- OpInfo.CallOperandVal = Call.getArgOperand(ArgNo++);
-
- // Process the call argument. BasicBlocks are labels, currently appearing
- // only in asm's.
- if (isa<CallBrInst>(Call) &&
- ArgNo - 1 >= (cast<CallBrInst>(&Call)->arg_size() -
- cast<CallBrInst>(&Call)->getNumIndirectDests() -
- NumMatchingOps) &&
- (NumMatchingOps == 0 ||
- ArgNo - 1 <
- (cast<CallBrInst>(&Call)->arg_size() - NumMatchingOps))) {
- const auto *BA = cast<BlockAddress>(OpInfo.CallOperandVal);
- EVT VT = TLI.getValueType(DAG.getDataLayout(), BA->getType(), true);
- OpInfo.CallOperand = DAG.getTargetBlockAddress(BA, VT);
- } else if (const auto *BB = dyn_cast<BasicBlock>(OpInfo.CallOperandVal)) {
- OpInfo.CallOperand = DAG.getBasicBlock(FuncInfo.MBBMap[BB]);
- } else {
- OpInfo.CallOperand = getValue(OpInfo.CallOperandVal);
- }
-
+ if (OpInfo.hasArg()) {
+ OpInfo.CallOperandVal = Call.getArgOperand(ArgNo);
+ OpInfo.CallOperand = getValue(OpInfo.CallOperandVal);
+ Type *ParamElemTy = Call.getAttributes().getParamElementType(ArgNo);
EVT VT = OpInfo.getCallOperandValEVT(*DAG.getContext(), TLI,
- DAG.getDataLayout());
+ DAG.getDataLayout(), ParamElemTy);
OpInfo.ConstraintVT = VT.isSimple() ? VT.getSimpleVT() : MVT::Other;
+ ArgNo++;
} else if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) {
// The return value of the call is this value. As such, there is no
// corresponding argument.
@@ -8607,9 +8593,6 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
OpInfo.ConstraintVT = MVT::Other;
}
- if (OpInfo.hasMatchingInput())
- ++NumMatchingOps;
-
if (!HasSideEffect)
HasSideEffect = OpInfo.hasMemory(TLI);
@@ -11246,12 +11229,6 @@ void SelectionDAGBuilder::visitVectorSplice(const CallInst &I) {
unsigned NumElts = VT.getVectorNumElements();
- if ((-Imm > NumElts) || (Imm >= NumElts)) {
- // Result is undefined if immediate is out-of-bounds.
- setValue(&I, DAG.getUNDEF(VT));
- return;
- }
-
uint64_t Idx = (NumElts + Imm) % NumElts;
// Use VECTOR_SHUFFLE to maintain original behaviour for fixed-length vectors.
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index e6b06ab93d6b..a98c21f16c71 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -60,7 +60,7 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
// Conservatively require the attributes of the call to match those of
// the return. Ignore following attributes because they don't affect the
// call sequence.
- AttrBuilder CallerAttrs(F.getAttributes(), AttributeList::ReturnIndex);
+ AttrBuilder CallerAttrs(F.getContext(), F.getAttributes().getRetAttrs());
for (const auto &Attr : {Attribute::Alignment, Attribute::Dereferenceable,
Attribute::DereferenceableOrNull, Attribute::NoAlias,
Attribute::NonNull})
@@ -1806,6 +1806,31 @@ bool TargetLowering::SimplifyDemandedBits(
}
case ISD::BSWAP: {
SDValue Src = Op.getOperand(0);
+
+ // If the only bits demanded come from one byte of the bswap result,
+ // just shift the input byte into position to eliminate the bswap.
+ unsigned NLZ = DemandedBits.countLeadingZeros();
+ unsigned NTZ = DemandedBits.countTrailingZeros();
+
+ // Round NTZ down to the next byte. If we have 11 trailing zeros, then
+ // we need all the bits down to bit 8. Likewise, round NLZ. If we
+ // have 14 leading zeros, round to 8.
+ NLZ = alignDown(NLZ, 8);
+ NTZ = alignDown(NTZ, 8);
+ // If we need exactly one byte, we can do this transformation.
+ if (BitWidth - NLZ - NTZ == 8) {
+ // Replace this with either a left or right shift to get the byte into
+ // the right place.
+ unsigned ShiftOpcode = NLZ > NTZ ? ISD::SRL : ISD::SHL;
+ if (!TLO.LegalOperations() || isOperationLegal(ShiftOpcode, VT)) {
+ EVT ShiftAmtTy = getShiftAmountTy(VT, DL);
+ unsigned ShiftAmount = NLZ > NTZ ? NLZ - NTZ : NTZ - NLZ;
+ SDValue ShAmt = TLO.DAG.getConstant(ShiftAmount, dl, ShiftAmtTy);
+ SDValue NewOp = TLO.DAG.getNode(ShiftOpcode, dl, VT, Src, ShAmt);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+
APInt DemandedSrcBits = DemandedBits.byteSwap();
if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, Known2, TLO,
Depth + 1))
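
The new early-out fires when the demanded bits of the bswap cover exactly one byte: rounding the leading/trailing zero counts down to byte boundaries and shifting the source by their difference reproduces that byte without the byte swap. A check of the arithmetic for one 32-bit case; the demanded mask and test value are arbitrary:

    #include <cstdint>
    #include <cstdio>

    static uint32_t bswap32(uint32_t X) {
      return (X >> 24) | ((X >> 8) & 0xff00u) | ((X << 8) & 0xff0000u) |
             (X << 24);
    }

    int main() {
      const uint32_t Demanded = 0x00ff0000u;  // NLZ = 8, NTZ = 16 -> one byte
      const uint32_t X = 0x12345678u;
      uint32_t ViaBswap = bswap32(X) & Demanded;
      uint32_t ViaShift = (X << (16 - 8)) & Demanded;  // SHL by NTZ - NLZ
      std::printf("%#010x vs %#010x\n", ViaBswap, ViaShift); // both 0x00560000
      return 0;
    }
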
@@ -1833,19 +1858,15 @@ bool TargetLowering::SimplifyDemandedBits(
// If we only care about the highest bit, don't bother shifting right.
if (DemandedBits.isSignMask()) {
unsigned MinSignedBits =
- TLO.DAG.ComputeMinSignedBits(Op0, DemandedElts, Depth + 1);
+ TLO.DAG.ComputeMaxSignificantBits(Op0, DemandedElts, Depth + 1);
bool AlreadySignExtended = ExVTBits >= MinSignedBits;
// However if the input is already sign extended we expect the sign
// extension to be dropped altogether later and do not simplify.
if (!AlreadySignExtended) {
// Compute the correct shift amount type, which must be getShiftAmountTy
// for scalar types after legalization.
- EVT ShiftAmtTy = VT;
- if (TLO.LegalTypes() && !ShiftAmtTy.isVector())
- ShiftAmtTy = getShiftAmountTy(ShiftAmtTy, DL);
-
- SDValue ShiftAmt =
- TLO.DAG.getConstant(BitWidth - ExVTBits, dl, ShiftAmtTy);
+ SDValue ShiftAmt = TLO.DAG.getConstant(BitWidth - ExVTBits, dl,
+ getShiftAmountTy(VT, DL));
return TLO.CombineTo(Op,
TLO.DAG.getNode(ISD::SHL, dl, VT, Op0, ShiftAmt));
}
@@ -3233,17 +3254,29 @@ bool TargetLowering::isExtendedTrueVal(const ConstantSDNode *N, EVT VT,
SDValue TargetLowering::foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
ISD::CondCode Cond, const SDLoc &DL,
DAGCombinerInfo &DCI) const {
- // Match these patterns in any of their permutations:
- // (X & Y) == Y
- // (X & Y) != Y
if (N1.getOpcode() == ISD::AND && N0.getOpcode() != ISD::AND)
std::swap(N0, N1);
+ SelectionDAG &DAG = DCI.DAG;
EVT OpVT = N0.getValueType();
if (N0.getOpcode() != ISD::AND || !OpVT.isInteger() ||
(Cond != ISD::SETEQ && Cond != ISD::SETNE))
return SDValue();
+ // (X & Y) != 0 --> zextOrTrunc(X & Y)
+ // iff everything but LSB is known zero:
+ if (Cond == ISD::SETNE && isNullConstant(N1) &&
+ (getBooleanContents(OpVT) == TargetLowering::UndefinedBooleanContent ||
+ getBooleanContents(OpVT) == TargetLowering::ZeroOrOneBooleanContent)) {
+ unsigned NumEltBits = OpVT.getScalarSizeInBits();
+ APInt UpperBits = APInt::getHighBitsSet(NumEltBits, NumEltBits - 1);
+ if (DAG.MaskedValueIsZero(N0, UpperBits))
+ return DAG.getBoolExtOrTrunc(N0, DL, VT, OpVT);
+ }
+
+ // Match these patterns in any of their permutations:
+ // (X & Y) == Y
+ // (X & Y) != Y
SDValue X, Y;
if (N0.getOperand(0) == N1) {
X = N0.getOperand(1);
@@ -3255,7 +3288,6 @@ SDValue TargetLowering::foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
return SDValue();
}
- SelectionDAG &DAG = DCI.DAG;
SDValue Zero = DAG.getConstant(0, DL, OpVT);
if (DAG.isKnownToBeAPowerOfTwo(Y)) {
// Simplify X & Y == Y to X & Y != 0 if Y has exactly one bit set.
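
The added (X & Y) != 0 fold only fires for zero-or-one (or undefined) boolean contents and when every bit above the LSB of the AND is known zero; in that situation the compare result is just the AND value itself, resized to the setcc type. A scalar illustration, where masking down to one bit stands in for the MaskedValueIsZero check:

    #include <cstdint>
    #include <cstdio>

    int main() {
      for (uint32_t X : {0u, 1u, 2u, 7u}) {
        uint32_t And = X & 1u;       // all bits above the LSB are known zero
        bool ViaSetCC = And != 0;    // (X & 1) != 0
        bool ViaTrunc = static_cast<bool>(And);  // zextOrTrunc of the AND
        std::printf("X=%u: setcc=%d trunc=%d\n", X, ViaSetCC, ViaTrunc);
      }
      return 0;
    }
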
@@ -3678,9 +3710,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
// Figure out how many bits we need to preserve this constant.
- unsigned ReqdBits = Signed ?
- C1.getBitWidth() - C1.getNumSignBits() + 1 :
- C1.getActiveBits();
+ unsigned ReqdBits = Signed ? C1.getMinSignedBits() : C1.getActiveBits();
// Make sure we're not losing bits from the constant.
if (MinBits > 0 &&
@@ -4594,20 +4624,12 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
default: break;
- case 'X': // Allows any operand; labels (basic block) use this.
- if (Op.getOpcode() == ISD::BasicBlock ||
- Op.getOpcode() == ISD::TargetBlockAddress) {
- Ops.push_back(Op);
- return;
- }
- LLVM_FALLTHROUGH;
+ case 'X': // Allows any operand
case 'i': // Simple Integer or Relocatable Constant
case 'n': // Simple Integer
case 's': { // Relocatable Constant
- GlobalAddressSDNode *GA;
ConstantSDNode *C;
- BlockAddressSDNode *BA;
uint64_t Offset = 0;
// Match (GA) or (C) or (GA+C) or (GA-C) or ((GA+C)+C) or (((GA+C)+C)+C),
@@ -4615,13 +4637,7 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
// SelectionDAG::FoldSymbolOffset because it expects the GA to be accessible
// while in this case the GA may be furthest from the root node which is
// likely an ISD::ADD.
- while (1) {
- if ((GA = dyn_cast<GlobalAddressSDNode>(Op)) && ConstraintLetter != 'n') {
- Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
- GA->getValueType(0),
- Offset + GA->getOffset()));
- return;
- }
+ while (true) {
if ((C = dyn_cast<ConstantSDNode>(Op)) && ConstraintLetter != 's') {
// gcc prints these as sign extended. Sign extend value to 64 bits
// now; without this it would get ZExt'd later in
@@ -4636,11 +4652,23 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
DAG.getTargetConstant(Offset + ExtVal, SDLoc(C), MVT::i64));
return;
}
- if ((BA = dyn_cast<BlockAddressSDNode>(Op)) && ConstraintLetter != 'n') {
- Ops.push_back(DAG.getTargetBlockAddress(
- BA->getBlockAddress(), BA->getValueType(0),
- Offset + BA->getOffset(), BA->getTargetFlags()));
- return;
+ if (ConstraintLetter != 'n') {
+ if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
+ Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
+ GA->getValueType(0),
+ Offset + GA->getOffset()));
+ return;
+ }
+ if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
+ Ops.push_back(DAG.getTargetBlockAddress(
+ BA->getBlockAddress(), BA->getValueType(0),
+ Offset + BA->getOffset(), BA->getTargetFlags()));
+ return;
+ }
+ if (isa<BasicBlockSDNode>(Op)) {
+ Ops.push_back(Op);
+ return;
+ }
}
const unsigned OpCode = Op.getOpcode();
if (OpCode == ISD::ADD || OpCode == ISD::SUB) {
@@ -4753,7 +4781,7 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
case InlineAsm::isOutput:
// Indirect outputs just consume an argument.
if (OpInfo.isIndirect) {
- OpInfo.CallOperandVal = Call.getArgOperand(ArgNo++);
+ OpInfo.CallOperandVal = Call.getArgOperand(ArgNo);
break;
}
@@ -4771,7 +4799,7 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
++ResNo;
break;
case InlineAsm::isInput:
- OpInfo.CallOperandVal = Call.getArgOperand(ArgNo++);
+ OpInfo.CallOperandVal = Call.getArgOperand(ArgNo);
break;
case InlineAsm::isClobber:
// Nothing to do.
@@ -4781,10 +4809,8 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
if (OpInfo.CallOperandVal) {
llvm::Type *OpTy = OpInfo.CallOperandVal->getType();
if (OpInfo.isIndirect) {
- llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
- if (!PtrTy)
- report_fatal_error("Indirect operand for inline asm not a pointer!");
- OpTy = PtrTy->getElementType();
+ OpTy = Call.getAttributes().getParamElementType(ArgNo);
+        assert(OpTy && "Indirect operand must have elementtype attribute");
}
// Look for vector wrapped in a struct. e.g. { <16 x i8> }.
@@ -4814,6 +4840,8 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
} else {
OpInfo.ConstraintVT = MVT::getVT(OpTy, true);
}
+
+ ArgNo++;
}
}
@@ -5087,17 +5115,18 @@ void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
// 'X' matches anything.
if (OpInfo.ConstraintCode == "X" && OpInfo.CallOperandVal) {
- // Labels and constants are handled elsewhere ('X' is the only thing
- // that matches labels). For Functions, the type here is the type of
- // the result, which is not what we want to look at; leave them alone.
+ // Constants are handled elsewhere. For Functions, the type here is the
+ // type of the result, which is not what we want to look at; leave them
+ // alone.
Value *v = OpInfo.CallOperandVal;
- if (isa<BasicBlock>(v) || isa<ConstantInt>(v) || isa<Function>(v)) {
- OpInfo.CallOperandVal = v;
+ if (isa<ConstantInt>(v) || isa<Function>(v)) {
return;
}
- if (Op.getNode() && Op.getOpcode() == ISD::TargetBlockAddress)
+ if (isa<BasicBlock>(v) || isa<BlockAddress>(v)) {
+ OpInfo.ConstraintCode = "i";
return;
+ }
// Otherwise, try to resolve it to something we know about by looking at
// the actual operand type.
@@ -6438,12 +6467,6 @@ bool TargetLowering::expandMUL_LOHI(unsigned Opcode, EVT VT, const SDLoc &dl,
unsigned ShiftAmount = OuterBitSize - InnerBitSize;
EVT ShiftAmountTy = getShiftAmountTy(VT, DAG.getDataLayout());
- if (APInt::getMaxValue(ShiftAmountTy.getSizeInBits()).ult(ShiftAmount)) {
- // FIXME getShiftAmountTy does not always return a sensible result when VT
- // is an illegal type, and so the type may be too small to fit the shift
- // amount. Override it with i32. The shift will have to be legalized.
- ShiftAmountTy = MVT::i32;
- }
SDValue Shift = DAG.getConstant(ShiftAmount, dl, ShiftAmountTy);
if (!LH.getNode() && !RH.getNode() &&
diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
index f89069e9f728..f6ad2b50abcd 100644
--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -273,6 +273,8 @@ bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI,
LLVM_DEBUG(dbgs() << "Frame instruction: " << MI << '\n');
return true;
}
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
for (const MachineOperand &MO : MI.operands()) {
bool UseOrDefCSR = false;
if (MO.isReg()) {
@@ -288,8 +290,14 @@ bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI,
// separately. An SP mentioned by a call instruction, we can ignore,
// though, as it's harmless and we do not want to effectively disable tail
// calls by forcing the restore point to post-dominate them.
- UseOrDefCSR = (!MI.isCall() && PhysReg == SP) ||
- RCI.getLastCalleeSavedAlias(PhysReg);
+ // PPC's LR is also not normally described as a callee-saved register in
+ // calling convention definitions, so we need to watch for it, too. An LR
+ // mentioned implicitly by a return (or "branch to link register")
+      // instruction can be ignored; otherwise we may pessimize shrink-wrapping.
+ UseOrDefCSR =
+ (!MI.isCall() && PhysReg == SP) ||
+ RCI.getLastCalleeSavedAlias(PhysReg) ||
+ (!MI.isReturn() && TRI->isNonallocatableRegisterCalleeSave(PhysReg));
} else if (MO.isRegMask()) {
// Check if this regmask clobbers any of the CSRs.
for (unsigned Reg : getCurrentCSRs(RS)) {
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 6fc6881f8736..ab574232e367 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -715,6 +715,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
SchedPreferenceInfo = Sched::ILP;
GatherAllAliasesMaxDepth = 18;
IsStrictFPEnabled = DisableStrictNodeMutation;
+ MaxBytesForAlignment = 0;
// TODO: the default will be switched to 0 in the next commit, along
// with the Target-specific changes necessary.
MaxAtomicSizeInBitsSupported = 1024;
@@ -2040,6 +2041,11 @@ Align TargetLoweringBase::getPrefLoopAlignment(MachineLoop *ML) const {
return PrefLoopAlignment;
}
+unsigned TargetLoweringBase::getMaxPermittedBytesForAlignment(
+ MachineBasicBlock *MBB) const {
+ return MaxBytesForAlignment;
+}
+
//===----------------------------------------------------------------------===//
// Reciprocal Estimates
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index d1c2cdeb133b..ce350034d073 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -108,8 +108,7 @@ static void GetObjCImageInfo(Module &M, unsigned &Version, unsigned &Flags,
// ELF
//===----------------------------------------------------------------------===//
-TargetLoweringObjectFileELF::TargetLoweringObjectFileELF()
- : TargetLoweringObjectFile() {
+TargetLoweringObjectFileELF::TargetLoweringObjectFileELF() {
SupportDSOLocalEquivalentLowering = true;
}
@@ -478,6 +477,11 @@ static SectionKind getELFKindForNamedSection(StringRef Name, SectionKind K) {
return K;
}
+static bool hasPrefix(StringRef SectionName, StringRef Prefix) {
+ return SectionName.consume_front(Prefix) &&
+ (SectionName.empty() || SectionName[0] == '.');
+}
+
static unsigned getELFSectionType(StringRef Name, SectionKind K) {
  // Use SHT_NOTE for sections whose names start with ".note" to allow
  // emitting ELF notes from C variable declarations.
@@ -485,13 +489,13 @@ static unsigned getELFSectionType(StringRef Name, SectionKind K) {
if (Name.startswith(".note"))
return ELF::SHT_NOTE;
- if (Name == ".init_array")
+ if (hasPrefix(Name, ".init_array"))
return ELF::SHT_INIT_ARRAY;
- if (Name == ".fini_array")
+ if (hasPrefix(Name, ".fini_array"))
return ELF::SHT_FINI_ARRAY;
- if (Name == ".preinit_array")
+ if (hasPrefix(Name, ".preinit_array"))
return ELF::SHT_PREINIT_ARRAY;
if (K.isBSS() || K.isThreadBSS())
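The new hasPrefix() accepts both the bare section name and dot-suffixed variants while rejecting names that merely share a prefix. A standalone sketch with std::string_view in place of llvm::StringRef:

  #include <cstdio>
  #include <string_view>

  // ".init_array" and ".init_array.100" match; ".init_array_custom" does not,
  // because the character after the prefix must be '.' or the end of the name.
  static bool hasPrefix(std::string_view Name, std::string_view Prefix) {
    if (Name.substr(0, Prefix.size()) != Prefix)
      return false;
    Name.remove_prefix(Prefix.size());
    return Name.empty() || Name[0] == '.';
  }

  int main() {
    std::printf("%d %d %d\n", hasPrefix(".init_array", ".init_array"),
                hasPrefix(".init_array.100", ".init_array"),
                hasPrefix(".init_array_custom", ".init_array")); // 1 1 0
  }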
@@ -1139,8 +1143,7 @@ TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) {
// MachO
//===----------------------------------------------------------------------===//
-TargetLoweringObjectFileMachO::TargetLoweringObjectFileMachO()
- : TargetLoweringObjectFile() {
+TargetLoweringObjectFileMachO::TargetLoweringObjectFileMachO() {
SupportIndirectSymViaGOTPCRel = true;
}
@@ -1185,6 +1188,7 @@ void TargetLoweringObjectFileMachO::emitModuleMetadata(MCStreamer &Streamer,
StringRef SectionVal;
GetObjCImageInfo(M, VersionVal, ImageInfoFlags, SectionVal);
+ emitCGProfileMetadata(Streamer, M);
// The section is mandatory. If we don't have it, then we don't have GC info.
if (SectionVal.empty())
@@ -2543,8 +2547,7 @@ MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry(
//===----------------------------------------------------------------------===//
// GOFF
//===----------------------------------------------------------------------===//
-TargetLoweringObjectFileGOFF::TargetLoweringObjectFileGOFF()
- : TargetLoweringObjectFile() {}
+TargetLoweringObjectFileGOFF::TargetLoweringObjectFileGOFF() {}
MCSection *TargetLoweringObjectFileGOFF::getExplicitSectionGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 402e21d3708b..05004fb935df 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -328,7 +328,7 @@ static IdentifyingPassPtr overridePass(AnalysisID StandardID,
// Find the FSProfile file name. The internal option takes precedence over
// the value from TargetMachine.
-static const std::string getFSProfileFile(const TargetMachine *TM) {
+static std::string getFSProfileFile(const TargetMachine *TM) {
if (!FSProfileFile.empty())
return FSProfileFile.getValue();
const Optional<PGOOptions> &PGOOpt = TM->getPGOOption();
@@ -339,7 +339,7 @@ static const std::string getFSProfileFile(const TargetMachine *TM) {
// Find the Profile remapping file name. The internal option takes precedence
// over the value from TargetMachine.
-static const std::string getFSRemappingFile(const TargetMachine *TM) {
+static std::string getFSRemappingFile(const TargetMachine *TM) {
if (!FSRemappingFile.empty())
return FSRemappingFile.getValue();
const Optional<PGOOptions> &PGOOpt = TM->getPGOOption();
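A short sketch of the design point behind dropping the top-level const from these by-value return types: the const adds nothing for callers and keeps the result from binding to rvalue-reference overloads, which can silently turn moves into copies.

  #include <string>

  std::string makeName() { return std::string(1024, 'x'); }
  const std::string makeNameConst() { return std::string(1024, 'x'); }

  int main() {
    std::string S;
    S = makeName();      // binds to operator=(std::string&&): moved
    S = makeNameConst(); // const rvalue cannot bind to &&: copy-assigned
    return S.size() == 1024 ? 0 : 1;
  }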
@@ -1399,6 +1399,9 @@ bool TargetPassConfig::addRegAssignAndRewriteOptimized() {
// Finally rewrite virtual registers.
addPass(&VirtRegRewriterID);
+ // Regalloc scoring for ML-driven eviction - noop except when learning a new
+ // eviction policy.
+ addPass(createRegAllocScoringPass());
return true;
}
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index f5cb518fce3e..6bcf79547056 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -552,7 +552,7 @@ bool TargetRegisterInfo::getCoveringSubRegIndexes(
// Abort if we cannot possibly implement the COPY with the given indexes.
if (BestIdx == 0)
- return 0;
+ return false;
NeededIndexes.push_back(BestIdx);
@@ -581,7 +581,7 @@ bool TargetRegisterInfo::getCoveringSubRegIndexes(
}
if (BestIdx == 0)
- return 0; // Impossible to handle
+ return false; // Impossible to handle
NeededIndexes.push_back(BestIdx);
diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp
index d042deefd746..01ea171e5ea2 100644
--- a/llvm/lib/CodeGen/TypePromotion.cpp
+++ b/llvm/lib/CodeGen/TypePromotion.cpp
@@ -116,11 +116,11 @@ class IRPromoter {
SmallPtrSet<Value*, 8> Promoted;
void ReplaceAllUsersOfWith(Value *From, Value *To);
- void ExtendSources(void);
- void ConvertTruncs(void);
- void PromoteTree(void);
- void TruncateSinks(void);
- void Cleanup(void);
+ void ExtendSources();
+ void ConvertTruncs();
+ void PromoteTree();
+ void TruncateSinks();
+ void Cleanup();
public:
IRPromoter(LLVMContext &C, IntegerType *Ty, unsigned Width,
diff --git a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
index cbc5d9ec169b..5f59cb4643f2 100644
--- a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
+++ b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
@@ -293,7 +293,7 @@ void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
const std::vector<unsigned> &MaxPressure =
DAG->getRegPressure().MaxSetPressure;
- HighPressureSets.assign(MaxPressure.size(), 0);
+ HighPressureSets.assign(MaxPressure.size(), false);
for (unsigned i = 0, e = MaxPressure.size(); i < e; ++i) {
unsigned Limit = DAG->getRegClassInfo()->getRegPressureSetLimit(i);
HighPressureSets[i] =
diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp
index ae0859e1ecfd..b56095ca9a96 100644
--- a/llvm/lib/DWARFLinker/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp
@@ -124,6 +124,7 @@ static bool isTypeTag(uint16_t Tag) {
case dwarf::DW_TAG_interface_type:
case dwarf::DW_TAG_unspecified_type:
case dwarf::DW_TAG_shared_type:
+ case dwarf::DW_TAG_immutable_type:
return true;
default:
break;
@@ -1934,7 +1935,7 @@ uint32_t DWARFLinker::DIECloner::hashFullyQualifiedName(DWARFDie DIE,
CompileUnit *CU = &U;
Optional<DWARFFormValue> Ref;
- while (1) {
+ while (true) {
if (const char *CurrentName = DIE.getName(DINameKind::ShortName))
Name = CurrentName;
@@ -2107,7 +2108,6 @@ Error DWARFLinker::loadClangModule(
// Add this module.
Unit = std::make_unique<CompileUnit>(*CU, UnitID++, !Options.NoODR,
ModuleName);
- Unit->setHasInterestingContent();
analyzeContextInfo(CUDie, 0, *Unit, &ODRContexts.getRoot(), ODRContexts,
ModulesEndOffset, Options.ParseableSwiftInterfaces,
[&](const Twine &Warning, const DWARFDie &DIE) {
diff --git a/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp
index 925ab3d295c2..acecb1788d10 100644
--- a/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp
@@ -40,8 +40,6 @@ StringRef CompileUnit::getSysRoot() {
void CompileUnit::markEverythingAsKept() {
unsigned Idx = 0;
- setHasInterestingContent();
-
for (auto &I : Info) {
// Mark everything that wasn't explicit marked for pruning.
I.Keep = !I.Prune;
diff --git a/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp b/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp
index d9b3c4235b4d..5ab2ad0780a2 100644
--- a/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp
@@ -173,7 +173,7 @@ DeclContextTree::getChildDeclContext(DeclContext &Context, const DWARFDie &DIE,
!(*ContextIter)->setLastSeenDIE(U, DIE)) {
// The context was found, but it is ambiguous with another context
// in the same file. Mark it invalid.
- return PointerIntPair<DeclContext *, 1>(*ContextIter, /* Invalid= */ 1);
+ return PointerIntPair<DeclContext *, 1>(*ContextIter, /* IntVal= */ 1);
}
assert(ContextIter != Contexts.end());
@@ -183,7 +183,7 @@ DeclContextTree::getChildDeclContext(DeclContext &Context, const DWARFDie &DIE,
Context.getTag() != dwarf::DW_TAG_structure_type &&
Context.getTag() != dwarf::DW_TAG_class_type) ||
(Tag == dwarf::DW_TAG_union_type))
- return PointerIntPair<DeclContext *, 1>(*ContextIter, /* Invalid= */ 1);
+ return PointerIntPair<DeclContext *, 1>(*ContextIter, /* IntVal= */ 1);
return PointerIntPair<DeclContext *, 1>(*ContextIter);
}
diff --git a/llvm/lib/DebugInfo/CodeView/EnumTables.cpp b/llvm/lib/DebugInfo/CodeView/EnumTables.cpp
index b4a2a0031b2d..adf4ae519dae 100644
--- a/llvm/lib/DebugInfo/CodeView/EnumTables.cpp
+++ b/llvm/lib/DebugInfo/CodeView/EnumTables.cpp
@@ -104,7 +104,7 @@ static const EnumEntry<codeview::SourceLanguage> SourceLanguages[] = {
CV_ENUM_ENT(SourceLanguage, ILAsm), CV_ENUM_ENT(SourceLanguage, Java),
CV_ENUM_ENT(SourceLanguage, JScript), CV_ENUM_ENT(SourceLanguage, MSIL),
CV_ENUM_ENT(SourceLanguage, HLSL), CV_ENUM_ENT(SourceLanguage, D),
- CV_ENUM_ENT(SourceLanguage, Swift),
+ CV_ENUM_ENT(SourceLanguage, Swift), CV_ENUM_ENT(SourceLanguage, Rust),
};
static const EnumEntry<uint32_t> CompileSym2FlagNames[] = {
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
index 6e30309ae94a..d68ecd4f8a42 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
@@ -15,6 +15,8 @@
using namespace llvm;
void DWARFCompileUnit::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {
+ if (DumpOpts.SummarizeTypes)
+ return;
int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(getFormat());
OS << format("0x%08" PRIx64, getOffset()) << ": Compile Unit:"
<< " length = " << format("0x%0*" PRIx64, OffsetDumpWidth, getLength())
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
index 95135c95e8d2..ef50ad53650a 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -695,14 +695,30 @@ void DWARFContext::dump(
DWARFTypeUnit *DWARFContext::getTypeUnitForHash(uint16_t Version, uint64_t Hash,
bool IsDWO) {
- // FIXME: Check for/use the tu_index here, if there is one.
- for (const auto &U : IsDWO ? dwo_units() : normal_units()) {
- if (DWARFTypeUnit *TU = dyn_cast<DWARFTypeUnit>(U.get())) {
- if (TU->getTypeHash() == Hash)
- return TU;
+ parseDWOUnits(LazyParse);
+
+ if (const auto &TUI = getTUIndex()) {
+ if (const auto *R = TUI.getFromHash(Hash))
+ return dyn_cast_or_null<DWARFTypeUnit>(
+ DWOUnits.getUnitForIndexEntry(*R));
+ return nullptr;
+ }
+
+ struct UnitContainers {
+ const DWARFUnitVector &Units;
+ Optional<DenseMap<uint64_t, DWARFTypeUnit *>> &Map;
+ };
+ UnitContainers Units = IsDWO ? UnitContainers{DWOUnits, DWOTypeUnits}
+ : UnitContainers{NormalUnits, NormalTypeUnits};
+ if (!Units.Map) {
+ Units.Map.emplace();
+ for (const auto &U : IsDWO ? dwo_units() : normal_units()) {
+ if (DWARFTypeUnit *TU = dyn_cast<DWARFTypeUnit>(U.get()))
+ (*Units.Map)[TU->getTypeHash()] = TU;
}
}
- return nullptr;
+
+ return (*Units.Map)[Hash];
}
DWARFCompileUnit *DWARFContext::getDWOCompileUnitForHash(uint64_t Hash) {
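A standalone sketch of the lazy lookup structure introduced above, under the assumption that the goal is simply to avoid rescanning all units on every query (the real code additionally consults the TU index when one is present): build a hash-to-unit map on first use and answer later queries from it.

  #include <cstdint>
  #include <optional>
  #include <unordered_map>
  #include <vector>

  struct TypeUnit { uint64_t Hash; };

  struct Context {
    std::vector<TypeUnit> Units;
    std::optional<std::unordered_map<uint64_t, TypeUnit *>> ByHash;

    TypeUnit *getTypeUnitForHash(uint64_t Hash) {
      if (!ByHash) {
        // First query: scan every unit once and memoize the mapping.
        ByHash.emplace();
        for (TypeUnit &TU : Units)
          (*ByHash)[TU.Hash] = &TU;
      }
      auto It = ByHash->find(Hash);
      return It == ByHash->end() ? nullptr : It->second;
    }
  };

  int main() {
    Context Ctx;
    Ctx.Units = {{0x1234}, {0x5678}};
    return Ctx.getTypeUnitForHash(0x5678) ? 0 : 1;
  }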
@@ -1098,6 +1114,7 @@ static Optional<uint64_t> getTypeSize(DWARFDie Type, uint64_t PointerSize) {
return PointerSize;
}
case DW_TAG_const_type:
+ case DW_TAG_immutable_type:
case DW_TAG_volatile_type:
case DW_TAG_restrict_type:
case DW_TAG_typedef: {
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
index d91a630256d6..ee54fc754803 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
@@ -74,7 +74,7 @@ std::string DWARFAbbreviationDeclarationSet::getCodeRange() const {
for (const auto &Decl : Decls)
Codes.push_back(Decl.getCode());
- std::string Buffer = "";
+ std::string Buffer;
raw_string_ostream Stream(Buffer);
// Each iteration through this loop represents a single contiguous range in
// the set of codes.
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index 5421b2d59a1b..ec7889a3728a 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -215,15 +215,16 @@ struct DWARFTypePrinter {
OS << "void";
return DWARFDie();
}
- DWARFDie Inner = resolveReferencedType(D);
+ DWARFDie InnerDIE;
+ auto Inner = [&] { return InnerDIE = resolveReferencedType(D); };
const dwarf::Tag T = D.getTag();
switch (T) {
case DW_TAG_pointer_type: {
- appendPointerLikeTypeBefore(D, Inner, "*");
+ appendPointerLikeTypeBefore(D, Inner(), "*");
break;
}
case DW_TAG_subroutine_type: {
- appendQualifiedNameBefore(Inner);
+ appendQualifiedNameBefore(Inner());
if (Word) {
OS << ' ';
}
@@ -231,18 +232,18 @@ struct DWARFTypePrinter {
break;
}
case DW_TAG_array_type: {
- appendQualifiedNameBefore(Inner);
+ appendQualifiedNameBefore(Inner());
break;
}
case DW_TAG_reference_type:
- appendPointerLikeTypeBefore(D, Inner, "&");
+ appendPointerLikeTypeBefore(D, Inner(), "&");
break;
case DW_TAG_rvalue_reference_type:
- appendPointerLikeTypeBefore(D, Inner, "&&");
+ appendPointerLikeTypeBefore(D, Inner(), "&&");
break;
case DW_TAG_ptr_to_member_type: {
- appendQualifiedNameBefore(Inner);
- if (needsParens(Inner))
+ appendQualifiedNameBefore(Inner());
+ if (needsParens(InnerDIE))
OS << '(';
else if (Word)
OS << ' ';
@@ -284,7 +285,7 @@ struct DWARFTypePrinter {
const char *NamePtr = dwarf::toString(D.find(DW_AT_name), nullptr);
if (!NamePtr) {
appendTypeTagName(D.getTag());
- return Inner;
+ return DWARFDie();
}
Word = true;
StringRef Name = NamePtr;
@@ -317,7 +318,7 @@ struct DWARFTypePrinter {
break;
}
}
- return Inner;
+ return InnerDIE;
}
void appendUnqualifiedNameAfter(DWARFDie D, DWARFDie Inner,
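The change above turns Inner from an eagerly computed DIE into a lambda, so resolveReferencedType() only runs on the switch cases that need it while InnerDIE records the result for later use. A minimal sketch of that pattern, with a stand-in for the expensive call:

  #include <iostream>
  #include <string>

  static std::string expensiveResolve() {
    std::cout << "resolving...\n"; // only printed if Inner() is called
    return "resolved-type";
  }

  int main(int argc, char **) {
    std::string InnerResult;
    auto Inner = [&] { return InnerResult = expensiveResolve(); };

    switch (argc) {
    case 1:
      std::cout << Inner() << "\n"; // resolves on this path only
      break;
    default:
      std::cout << "void\n";        // no resolution needed here
      break;
    }
    std::cout << "cached: " << InnerResult << "\n";
  }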
@@ -610,7 +611,8 @@ struct DWARFTypePrinter {
bool First = true;
bool RealFirst = true;
for (DWARFDie P : D) {
- if (P.getTag() != DW_TAG_formal_parameter)
+ if (P.getTag() != DW_TAG_formal_parameter &&
+ P.getTag() != DW_TAG_unspecified_parameters)
return;
DWARFDie T = resolveReferencedType(P);
if (SkipFirstParamIfArtificial && RealFirst && P.find(DW_AT_artificial)) {
@@ -622,7 +624,10 @@ struct DWARFTypePrinter {
OS << ", ";
}
First = false;
- appendQualifiedName(T);
+ if (P.getTag() == DW_TAG_unspecified_parameters)
+ OS << "...";
+ else
+ appendQualifiedName(T);
}
EndedWithTemplate = false;
OS << ')';
@@ -767,7 +772,7 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
DWARFDie D = resolveReferencedType(Die, FormValue);
if (D && !D.isNULL()) {
OS << Space << "\"";
- DWARFTypePrinter(OS).appendQualifiedName(D);
+ dumpTypeQualifiedName(D, OS);
OS << '"';
}
} else if (Attr == DW_AT_APPLE_property_attribute) {
@@ -801,7 +806,9 @@ void DWARFDie::getFullName(raw_string_ostream &OS,
const char *NamePtr = getShortName();
if (!NamePtr)
return;
- DWARFTypePrinter(OS).appendUnqualifiedName(*this, OriginalFullName);
+ if (getTag() == DW_TAG_GNU_template_parameter_pack)
+ return;
+ dumpTypeUnqualifiedName(*this, OS, OriginalFullName);
}
bool DWARFDie::isSubprogramDIE() const { return getTag() == DW_TAG_subprogram; }
@@ -1263,3 +1270,16 @@ bool DWARFAttribute::mayHaveLocationExpr(dwarf::Attribute Attr) {
return false;
}
}
+
+namespace llvm {
+
+void dumpTypeQualifiedName(const DWARFDie &DIE, raw_ostream &OS) {
+ DWARFTypePrinter(OS).appendQualifiedName(DIE);
+}
+
+void dumpTypeUnqualifiedName(const DWARFDie &DIE, raw_ostream &OS,
+ std::string *OriginalFullName) {
+ DWARFTypePrinter(OS).appendUnqualifiedName(DIE, OriginalFullName);
+}
+
+} // namespace llvm
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 6424c2f59844..ca7ac785b550 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -173,7 +173,7 @@ bool DWARFVerifier::verifyName(const DWARFDie &Die) {
Die.getFullName(OS, &OriginalFullName);
OS.flush();
if (OriginalFullName.empty() || OriginalFullName == ReconstructedName)
- return 0;
+ return false;
error() << "Simplified template DW_AT_name could not be reconstituted:\n"
<< formatv(" original: {0}\n"
@@ -181,7 +181,7 @@ bool DWARFVerifier::verifyName(const DWARFDie &Die) {
OriginalFullName, ReconstructedName);
dump(Die) << '\n';
dump(Die.getDwarfUnit()->getUnitDIE()) << '\n';
- return 1;
+ return true;
}
unsigned DWARFVerifier::verifyUnitContents(DWARFUnit &Unit,
@@ -322,12 +322,19 @@ unsigned DWARFVerifier::verifyUnits(const DWARFUnitVector &Units) {
unsigned NumDebugInfoErrors = 0;
ReferenceMap CrossUnitReferences;
+ unsigned Index = 1;
for (const auto &Unit : Units) {
- ReferenceMap UnitLocalReferences;
- NumDebugInfoErrors +=
- verifyUnitContents(*Unit, UnitLocalReferences, CrossUnitReferences);
- NumDebugInfoErrors += verifyDebugInfoReferences(
- UnitLocalReferences, [&](uint64_t Offset) { return Unit.get(); });
+ OS << "Verifying unit: " << Index << " / " << Units.getNumUnits();
+ if (const char* Name = Unit->getUnitDIE(true).getShortName())
+ OS << ", \"" << Name << '\"';
+ OS << '\n';
+ OS.flush();
+ ReferenceMap UnitLocalReferences;
+ NumDebugInfoErrors +=
+ verifyUnitContents(*Unit, UnitLocalReferences, CrossUnitReferences);
+ NumDebugInfoErrors += verifyDebugInfoReferences(
+ UnitLocalReferences, [&](uint64_t Offset) { return Unit.get(); });
+ ++Index;
}
NumDebugInfoErrors += verifyDebugInfoReferences(
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp
index ac217df1ee48..2524e10cb6c5 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumTypes.cpp
@@ -23,7 +23,7 @@ using namespace llvm::pdb;
NativeEnumTypes::NativeEnumTypes(NativeSession &PDBSession,
LazyRandomTypeCollection &Types,
std::vector<codeview::TypeLeafKind> Kinds)
- : Matches(), Index(0), Session(PDBSession) {
+ : Index(0), Session(PDBSession) {
Optional<TypeIndex> TI = Types.getFirst();
while (TI) {
CVType CVT = Types.getType(*TI);
diff --git a/llvm/lib/DebugInfo/PDB/PDBExtras.cpp b/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
index 25962e5152eb..a6d7ca0da7a9 100644
--- a/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBExtras.cpp
@@ -231,6 +231,7 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_Lang &Lang) {
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, HLSL, OS)
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, D, OS)
CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Swift, OS)
+ CASE_OUTPUT_ENUM_CLASS_NAME(PDB_Lang, Rust, OS)
}
return OS;
}
diff --git a/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp b/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
index 9b2883546305..529100b23ba5 100644
--- a/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
@@ -100,6 +100,7 @@ std::string PDBSymbolCompiland::getSourceFileFullPath() const {
.Case(".c", Lang == PDB_Lang::C)
.Case(".asm", Lang == PDB_Lang::Masm)
.Case(".swift", Lang == PDB_Lang::Swift)
+ .Case(".rs", Lang == PDB_Lang::Rust)
.Default(false))
return File->getFileName();
}
diff --git a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
index 555d29fe184b..e29968d113bd 100644
--- a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
@@ -33,8 +33,8 @@ namespace symbolize {
class SourceCode {
std::unique_ptr<MemoryBuffer> MemBuf;
- const Optional<StringRef> load(StringRef FileName,
- const Optional<StringRef> &EmbeddedSource) {
+ Optional<StringRef> load(StringRef FileName,
+ const Optional<StringRef> &EmbeddedSource) {
if (Lines <= 0)
return None;
@@ -50,7 +50,7 @@ class SourceCode {
}
}
- const Optional<StringRef> pruneSource(const Optional<StringRef> &Source) {
+ Optional<StringRef> pruneSource(const Optional<StringRef> &Source) {
if (!Source)
return None;
size_t FirstLinePos = StringRef::npos, Pos = 0;
diff --git a/llvm/lib/Debuginfod/Debuginfod.cpp b/llvm/lib/Debuginfod/Debuginfod.cpp
index 389b18fd62ac..27614572766d 100644
--- a/llvm/lib/Debuginfod/Debuginfod.cpp
+++ b/llvm/lib/Debuginfod/Debuginfod.cpp
@@ -21,8 +21,10 @@
#include "llvm/Debuginfod/HTTPClient.h"
#include "llvm/Support/CachePruning.h"
#include "llvm/Support/Caching.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/xxhash.h"
namespace llvm {
@@ -36,7 +38,7 @@ static std::string buildIDToString(BuildIDRef ID) {
Expected<SmallVector<StringRef>> getDefaultDebuginfodUrls() {
const char *DebuginfodUrlsEnv = std::getenv("DEBUGINFOD_URLS");
- if (DebuginfodUrlsEnv == NULL)
+ if (DebuginfodUrlsEnv == nullptr)
return SmallVector<StringRef>();
SmallVector<StringRef> DebuginfodUrls;
@@ -52,6 +54,7 @@ Expected<std::string> getDefaultDebuginfodCacheDirectory() {
if (!sys::path::cache_directory(CacheDirectory))
return createStringError(
errc::io_error, "Unable to determine appropriate cache directory.");
+ sys::path::append(CacheDirectory, "llvm-debuginfod", "client");
return std::string(CacheDirectory);
}
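The appended path components give debuginfod clients their own cache subdirectory rather than writing into the cache root. A sketch with standard C++ (std::filesystem instead of llvm::sys::path, and an assumed cache root purely for illustration):

  #include <filesystem>
  #include <iostream>

  int main() {
    std::filesystem::path Cache = "/home/user/.cache"; // assumed system cache dir
    Cache /= "llvm-debuginfod";
    Cache /= "client";
    std::cout << Cache.string() << "\n"; // /home/user/.cache/llvm-debuginfod/client
  }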
diff --git a/llvm/lib/Demangle/DLangDemangle.cpp b/llvm/lib/Demangle/DLangDemangle.cpp
index 0cefbd63a7ae..7cecd8007087 100644
--- a/llvm/lib/Demangle/DLangDemangle.cpp
+++ b/llvm/lib/Demangle/DLangDemangle.cpp
@@ -68,7 +68,53 @@ private:
/// \note A result larger than UINT_MAX is considered a failure.
///
/// \see https://dlang.org/spec/abi.html#Number .
- const char *decodeNumber(const char *Mangled, unsigned long *Ret);
+ const char *decodeNumber(const char *Mangled, unsigned long &Ret);
+
+ /// Extract the back reference position from a given string.
+ ///
+ /// \param Mangled string to extract the back reference position.
+ /// \param Ret assigned result value.
+ ///
+ /// \return the remaining string on success or nullptr on failure.
+ ///
+ /// \note Ret is always >= 0 on success, and unspecified on failure
+ ///
+ /// \see https://dlang.org/spec/abi.html#back_ref .
+ /// \see https://dlang.org/spec/abi.html#NumberBackRef .
+ const char *decodeBackrefPos(const char *Mangled, long &Ret);
+
+ /// Extract the symbol pointed to by the back reference from a given string.
+ ///
+ /// \param Mangled string to extract the back reference position.
+ /// \param Ret assigned result value.
+ ///
+ /// \return the remaining string on success or nullptr on failure.
+ ///
+ /// \see https://dlang.org/spec/abi.html#back_ref .
+ const char *decodeBackref(const char *Mangled, const char *&Ret);
+
+ /// Extract and demangle backreferenced symbol from a given mangled symbol
+ /// and append it to the output string.
+ ///
+ /// \param Demangled output buffer to write the demangled name.
+ /// \param Mangled mangled symbol to be demangled.
+ ///
+ /// \return the remaining string on success or nullptr on failure.
+ ///
+ /// \see https://dlang.org/spec/abi.html#back_ref .
+ /// \see https://dlang.org/spec/abi.html#IdentifierBackRef .
+ const char *parseSymbolBackref(OutputBuffer *Demangled, const char *Mangled);
+
+ /// Extract and demangle backreferenced type from a given mangled symbol
+ /// and append it to the output string.
+ ///
+ /// \param Mangled mangled symbol to be demangled.
+ ///
+ /// \return the remaining string on success or nullptr on failure.
+ ///
+ /// \see https://dlang.org/spec/abi.html#back_ref .
+ /// \see https://dlang.org/spec/abi.html#TypeBackRef .
+ const char *parseTypeBackref(const char *Mangled);
/// Check whether it is the beginning of a symbol name.
///
@@ -115,13 +161,25 @@ private:
/// \see https://dlang.org/spec/abi.html#QualifiedName .
const char *parseQualified(OutputBuffer *Demangled, const char *Mangled);
+ /// Extract and demangle a type from a given mangled symbol and append it
+ /// to the output string.
+ ///
+ /// \param Mangled mangled symbol to be demangled.
+ ///
+ /// \return the remaining string on success or nullptr on failure.
+ ///
+ /// \see https://dlang.org/spec/abi.html#Type .
+ const char *parseType(const char *Mangled);
+
/// The string we are demangling.
const char *Str;
+ /// The index of the last back reference.
+ int LastBackref;
};
} // namespace
-const char *Demangler::decodeNumber(const char *Mangled, unsigned long *Ret) {
+const char *Demangler::decodeNumber(const char *Mangled, unsigned long &Ret) {
// Return nullptr if trying to extract something that isn't a digit.
if (Mangled == nullptr || !std::isdigit(*Mangled))
return nullptr;
@@ -142,16 +200,145 @@ const char *Demangler::decodeNumber(const char *Mangled, unsigned long *Ret) {
if (*Mangled == '\0')
return nullptr;
- *Ret = Val;
+ Ret = Val;
+ return Mangled;
+}
+
+const char *Demangler::decodeBackrefPos(const char *Mangled, long &Ret) {
+ // Return nullptr if trying to extract something that isn't an alphabetic character.
+ if (Mangled == nullptr || !std::isalpha(*Mangled))
+ return nullptr;
+
+ // Any identifier or non-basic type that has been emitted to the mangled
+ // symbol before will not be emitted again, but is referenced by a special
+ // sequence encoding the relative position of the original occurrence in the
+ // mangled symbol name.
+ // Numbers in back references are encoded with base 26 by upper case letters
+ // A-Z for higher digits but lower case letters a-z for the last digit.
+ // NumberBackRef:
+ // [a-z]
+ // [A-Z] NumberBackRef
+ // ^
+ unsigned long Val = 0;
+
+ while (std::isalpha(*Mangled)) {
+ // Check for overflow
+ if (Val > (std::numeric_limits<unsigned long>::max() - 25) / 26)
+ break;
+
+ Val *= 26;
+
+ if (Mangled[0] >= 'a' && Mangled[0] <= 'z') {
+ Val += Mangled[0] - 'a';
+ if ((long)Val <= 0)
+ break;
+ Ret = Val;
+ return Mangled + 1;
+ }
+
+ Val += Mangled[0] - 'A';
+ ++Mangled;
+ }
+
+ return nullptr;
+}
+
+const char *Demangler::decodeBackref(const char *Mangled, const char *&Ret) {
+ assert(Mangled != nullptr && *Mangled == 'Q' && "Invalid back reference!");
+ Ret = nullptr;
+
+ // Position of 'Q'
+ const char *Qpos = Mangled;
+ long RefPos;
+ ++Mangled;
+
+ Mangled = decodeBackrefPos(Mangled, RefPos);
+ if (Mangled == nullptr)
+ return nullptr;
+
+ if (RefPos > Qpos - Str)
+ return nullptr;
+
+ // Set the position of the back reference.
+ Ret = Qpos - RefPos;
+
+ return Mangled;
+}
+
+const char *Demangler::parseSymbolBackref(OutputBuffer *Demangled,
+ const char *Mangled) {
+ // An identifier back reference always points to a digit 0 to 9.
+ // IdentifierBackRef:
+ // Q NumberBackRef
+ // ^
+ const char *Backref;
+ unsigned long Len;
+
+ // Get position of the back reference
+ Mangled = decodeBackref(Mangled, Backref);
+
+ // Must point to a simple identifier
+ Backref = decodeNumber(Backref, Len);
+ if (Backref == nullptr || strlen(Backref) < Len)
+ return nullptr;
+
+ Backref = parseLName(Demangled, Backref, Len);
+ if (Backref == nullptr)
+ return nullptr;
+
+ return Mangled;
+}
+
+const char *Demangler::parseTypeBackref(const char *Mangled) {
+ // A type back reference always points to a letter.
+ // TypeBackRef:
+ // Q NumberBackRef
+ // ^
+ const char *Backref;
+
+ // If we appear to be moving backwards through the mangle string, then
+ // bail as this may be a recursive back reference.
+ if (Mangled - Str >= LastBackref)
+ return nullptr;
+
+ int SaveRefPos = LastBackref;
+ LastBackref = Mangled - Str;
+
+ // Get position of the back reference.
+ Mangled = decodeBackref(Mangled, Backref);
+
+ // Can't decode back reference.
+ if (Backref == nullptr)
+ return nullptr;
+
+ // TODO: Add support for function type back references.
+ Backref = parseType(Backref);
+
+ LastBackref = SaveRefPos;
+
+ if (Backref == nullptr)
+ return nullptr;
+
return Mangled;
}
bool Demangler::isSymbolName(const char *Mangled) {
+ long Ret;
+ const char *Qref = Mangled;
+
if (std::isdigit(*Mangled))
return true;
- // TODO: Handle symbol back references and template instances.
- return false;
+ // TODO: Handle template instances.
+
+ if (*Mangled != 'Q')
+ return false;
+
+ Mangled = decodeBackrefPos(Mangled + 1, Ret);
+ if (Mangled == nullptr || Ret > Qref - Str)
+ return false;
+
+ return std::isdigit(Qref[-Ret]);
}
const char *Demangler::parseMangle(OutputBuffer *Demangled,
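The NumberBackRef encoding handled above uses base 26: upper-case letters A-Z carry the higher digits and a single lower-case letter a-z terminates the number. A minimal standalone decoder sketch (not the demangler's API, and without the overflow checks of the real code):

  #include <cctype>
  #include <cstdio>

  // Decode a D-ABI NumberBackRef; *End receives the first unconsumed
  // character, and -1 is returned if no terminating lower-case digit appears.
  static long decodeNumberBackRef(const char *S, const char **End) {
    unsigned long Val = 0;
    while (std::isalpha((unsigned char)*S)) {
      Val *= 26;
      if (*S >= 'a' && *S <= 'z') { // last digit
        *End = S + 1;
        return (long)(Val + (unsigned long)(*S - 'a'));
      }
      Val += (unsigned long)(*S - 'A');
      ++S;
    }
    return -1;
  }

  int main() {
    const char *End = nullptr;
    std::printf("%ld\n", decodeNumberBackRef("Bb", &End)); // 1*26 + 1 = 27
  }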
@@ -174,8 +361,7 @@ const char *Demangler::parseMangle(OutputBuffer *Demangled,
if (*Mangled == 'Z')
++Mangled;
else {
- // TODO: Implement symbols with types.
- return nullptr;
+ Mangled = parseType(Mangled);
}
}
@@ -228,9 +414,12 @@ const char *Demangler::parseIdentifier(OutputBuffer *Demangled,
if (Mangled == nullptr || *Mangled == '\0')
return nullptr;
- // TODO: Parse back references and lengthless template instances.
+ if (*Mangled == 'Q')
+ return parseSymbolBackref(Demangled, Mangled);
+
+ // TODO: Parse lengthless template instances.
- const char *Endptr = decodeNumber(Mangled, &Len);
+ const char *Endptr = decodeNumber(Mangled, Len);
if (Endptr == nullptr || Len == 0)
return nullptr;
@@ -262,6 +451,34 @@ const char *Demangler::parseIdentifier(OutputBuffer *Demangled,
return parseLName(Demangled, Mangled, Len);
}
+const char *Demangler::parseType(const char *Mangled) {
+ if (*Mangled == '\0')
+ return nullptr;
+
+ switch (*Mangled) {
+ // TODO: Parse type qualifiers.
+ // TODO: Parse function types.
+ // TODO: Parse compound types.
+ // TODO: Parse delegate types.
+ // TODO: Parse tuple types.
+
+ // Basic types.
+ case 'i':
+ ++Mangled;
+ // TODO: Add type name dumping
+ return Mangled;
+
+ // TODO: Add support for the rest of the basic types.
+
+ // Back referenced type.
+ case 'Q':
+ return parseTypeBackref(Mangled);
+
+ default: // unhandled.
+ return nullptr;
+ }
+}
+
const char *Demangler::parseLName(OutputBuffer *Demangled, const char *Mangled,
unsigned long Len) {
switch (Len) {
@@ -319,7 +536,8 @@ const char *Demangler::parseLName(OutputBuffer *Demangled, const char *Mangled,
return Mangled;
}
-Demangler::Demangler(const char *Mangled) : Str(Mangled) {}
+Demangler::Demangler(const char *Mangled)
+ : Str(Mangled), LastBackref(strlen(Mangled)) {}
const char *Demangler::parseMangle(OutputBuffer *Demangled) {
return parseMangle(Demangled, this->Str);
diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp
index 3f68f76761ce..1a5db755e37b 100644
--- a/llvm/lib/Demangle/ItaniumDemangle.cpp
+++ b/llvm/lib/Demangle/ItaniumDemangle.cpp
@@ -19,9 +19,7 @@
#include <cstdlib>
#include <cstring>
#include <functional>
-#include <numeric>
#include <utility>
-#include <vector>
using namespace llvm;
using namespace llvm::itanium_demangle;
diff --git a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
index 32d8dff66c3f..d07d05a08c55 100644
--- a/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangleNodes.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Demangle/MicrosoftDemangleNodes.h"
-#include "llvm/Demangle/DemangleConfig.h"
#include "llvm/Demangle/Utility.h"
#include <cctype>
#include <string>
diff --git a/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp b/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp
index e15bce0d6c4b..1fb37ce7c57c 100644
--- a/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp
+++ b/llvm/lib/ExecutionEngine/GDBRegistrationListener.cpp
@@ -96,7 +96,7 @@ class GDBJITRegistrationListener : public JITEventListener {
public:
/// Instantiates the JIT service.
- GDBJITRegistrationListener() : ObjectBufferMap() {}
+ GDBJITRegistrationListener() {}
/// Unregisters each object that was previously registered and releases all
/// internal resources.
diff --git a/llvm/lib/ExecutionEngine/JITLink/DefineExternalSectionStartAndEndSymbols.h b/llvm/lib/ExecutionEngine/JITLink/DefineExternalSectionStartAndEndSymbols.h
index 8ae3bc2bf61d..159880e4b152 100644
--- a/llvm/lib/ExecutionEngine/JITLink/DefineExternalSectionStartAndEndSymbols.h
+++ b/llvm/lib/ExecutionEngine/JITLink/DefineExternalSectionStartAndEndSymbols.h
@@ -52,13 +52,13 @@ public:
auto &SR = getSectionRange(*D.Sec);
if (D.IsStart) {
if (SR.empty())
- G.makeAbsolute(*Sym, 0);
+ G.makeAbsolute(*Sym, orc::ExecutorAddr());
else
G.makeDefined(*Sym, *SR.getFirstBlock(), 0, 0, Linkage::Strong,
Scope::Local, false);
} else {
if (SR.empty())
- G.makeAbsolute(*Sym, 0);
+ G.makeAbsolute(*Sym, orc::ExecutorAddr());
else
G.makeDefined(*Sym, *SR.getLastBlock(),
SR.getLastBlock()->getSize(), 0, Linkage::Strong,
diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
index 4d7d5ce26668..2ae193595fc0 100644
--- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupport.cpp
@@ -65,10 +65,7 @@ Error EHFrameSplitter::operator()(LinkGraph &G) {
Error EHFrameSplitter::processBlock(LinkGraph &G, Block &B,
LinkGraph::SplitBlockCache &Cache) {
- LLVM_DEBUG({
- dbgs() << " Processing block at " << formatv("{0:x16}", B.getAddress())
- << "\n";
- });
+ LLVM_DEBUG(dbgs() << " Processing block at " << B.getAddress() << "\n");
// eh-frame should not contain zero-fill blocks.
if (B.isZeroFill())
@@ -400,7 +397,7 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B,
BlockEdgeMap &BlockEdges) {
LLVM_DEBUG(dbgs() << " Record is FDE\n");
- JITTargetAddress RecordAddress = B.getAddress() + RecordOffset;
+ orc::ExecutorAddr RecordAddress = B.getAddress() + RecordOffset;
auto RecordContent = B.getContent().slice(RecordOffset, RecordLength);
BinaryStreamReader RecordReader(
@@ -418,8 +415,9 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B,
{
// Process the CIE pointer field.
auto CIEEdgeItr = BlockEdges.find(RecordOffset + CIEDeltaFieldOffset);
- JITTargetAddress CIEAddress =
- RecordAddress + CIEDeltaFieldOffset - CIEDelta;
+ orc::ExecutorAddr CIEAddress =
+ RecordAddress + orc::ExecutorAddrDiff(CIEDeltaFieldOffset) -
+ orc::ExecutorAddrDiff(CIEDelta);
if (CIEEdgeItr == BlockEdges.end()) {
LLVM_DEBUG({
@@ -456,7 +454,7 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B,
{
// Process the PC-Begin field.
Block *PCBeginBlock = nullptr;
- JITTargetAddress PCBeginFieldOffset = RecordReader.getOffset();
+ orc::ExecutorAddrDiff PCBeginFieldOffset = RecordReader.getOffset();
auto PCEdgeItr = BlockEdges.find(RecordOffset + PCBeginFieldOffset);
if (PCEdgeItr == BlockEdges.end()) {
auto PCBeginPtrInfo =
@@ -464,12 +462,12 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B,
RecordAddress + PCBeginFieldOffset, RecordReader);
if (!PCBeginPtrInfo)
return PCBeginPtrInfo.takeError();
- JITTargetAddress PCBegin = PCBeginPtrInfo->first;
+ orc::ExecutorAddr PCBegin = PCBeginPtrInfo->first;
Edge::Kind PCBeginEdgeKind = PCBeginPtrInfo->second;
LLVM_DEBUG({
dbgs() << " Adding edge at "
- << formatv("{0:x16}", RecordAddress + PCBeginFieldOffset)
- << " to PC at " << formatv("{0:x16}", PCBegin) << "\n";
+ << (RecordAddress + PCBeginFieldOffset) << " to PC at "
+ << formatv("{0:x16}", PCBegin) << "\n";
});
auto PCBeginSym = getOrCreateSymbol(PC, PCBegin);
if (!PCBeginSym)
@@ -522,7 +520,7 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B,
if (auto Err = RecordReader.readULEB128(AugmentationDataSize))
return Err;
- JITTargetAddress LSDAFieldOffset = RecordReader.getOffset();
+ orc::ExecutorAddrDiff LSDAFieldOffset = RecordReader.getOffset();
auto LSDAEdgeItr = BlockEdges.find(RecordOffset + LSDAFieldOffset);
if (LSDAEdgeItr == BlockEdges.end()) {
auto LSDAPointerInfo =
@@ -530,7 +528,7 @@ Error EHFrameEdgeFixer::processFDE(ParseContext &PC, Block &B,
RecordAddress + LSDAFieldOffset, RecordReader);
if (!LSDAPointerInfo)
return LSDAPointerInfo.takeError();
- JITTargetAddress LSDA = LSDAPointerInfo->first;
+ orc::ExecutorAddr LSDA = LSDAPointerInfo->first;
Edge::Kind LSDAEdgeKind = LSDAPointerInfo->second;
auto LSDASym = getOrCreateSymbol(PC, LSDA);
if (!LSDASym)
@@ -645,12 +643,10 @@ unsigned EHFrameEdgeFixer::getPointerEncodingDataSize(uint8_t PointerEncoding) {
}
}
-Expected<std::pair<JITTargetAddress, Edge::Kind>>
+Expected<std::pair<orc::ExecutorAddr, Edge::Kind>>
EHFrameEdgeFixer::readEncodedPointer(uint8_t PointerEncoding,
- JITTargetAddress PointerFieldAddress,
+ orc::ExecutorAddr PointerFieldAddress,
BinaryStreamReader &RecordReader) {
- static_assert(sizeof(JITTargetAddress) == sizeof(uint64_t),
- "Result must be able to hold a uint64_t");
assert(isSupportedPointerEncoding(PointerEncoding) &&
"Unsupported pointer encoding");
@@ -663,7 +659,7 @@ EHFrameEdgeFixer::readEncodedPointer(uint8_t PointerEncoding,
if (EffectiveType == DW_EH_PE_absptr)
EffectiveType = (PointerSize == 8) ? DW_EH_PE_udata8 : DW_EH_PE_udata4;
- JITTargetAddress Addr;
+ orc::ExecutorAddr Addr;
Edge::Kind PointerEdgeKind = Edge::Invalid;
switch (EffectiveType) {
case DW_EH_PE_udata4: {
@@ -709,7 +705,7 @@ EHFrameEdgeFixer::readEncodedPointer(uint8_t PointerEncoding,
}
Expected<Symbol &> EHFrameEdgeFixer::getOrCreateSymbol(ParseContext &PC,
- JITTargetAddress Addr) {
+ orc::ExecutorAddr Addr) {
Symbol *CanonicalSym = nullptr;
auto UpdateCanonicalSym = [&](Symbol *Sym) {
@@ -753,8 +749,9 @@ Error EHFrameNullTerminator::operator()(LinkGraph &G) {
<< EHFrameSectionName << "\n";
});
- auto &NullTerminatorBlock = G.createContentBlock(
- *EHFrame, NullTerminatorBlockContent, 0xfffffffffffffffc, 1, 0);
+ auto &NullTerminatorBlock =
+ G.createContentBlock(*EHFrame, NullTerminatorBlockContent,
+ orc::ExecutorAddr(~uint64_t(4)), 1, 0);
G.addAnonymousSymbol(NullTerminatorBlock, 0, 4, false, true);
return Error::success();
}
@@ -762,17 +759,15 @@ Error EHFrameNullTerminator::operator()(LinkGraph &G) {
EHFrameRegistrar::~EHFrameRegistrar() {}
Error InProcessEHFrameRegistrar::registerEHFrames(
- JITTargetAddress EHFrameSectionAddr, size_t EHFrameSectionSize) {
- return orc::registerEHFrameSection(
- jitTargetAddressToPointer<void *>(EHFrameSectionAddr),
- EHFrameSectionSize);
+ orc::ExecutorAddrRange EHFrameSection) {
+ return orc::registerEHFrameSection(EHFrameSection.Start.toPtr<void *>(),
+ EHFrameSection.size());
}
Error InProcessEHFrameRegistrar::deregisterEHFrames(
- JITTargetAddress EHFrameSectionAddr, size_t EHFrameSectionSize) {
- return orc::deregisterEHFrameSection(
- jitTargetAddressToPointer<void *>(EHFrameSectionAddr),
- EHFrameSectionSize);
+ orc::ExecutorAddrRange EHFrameSection) {
+ return orc::deregisterEHFrameSection(EHFrameSection.Start.toPtr<void *>(),
+ EHFrameSection.size());
}
LinkGraphPassFunction
@@ -789,14 +784,14 @@ createEHFrameRecorderPass(const Triple &TT,
StoreFrameRange = std::move(StoreRangeAddress)](LinkGraph &G) -> Error {
// Search for a non-empty eh-frame and record the address of the first
// symbol in it.
- JITTargetAddress Addr = 0;
+ orc::ExecutorAddr Addr;
size_t Size = 0;
if (auto *S = G.findSectionByName(EHFrameSectionName)) {
auto R = SectionRange(*S);
Addr = R.getStart();
Size = R.getSize();
}
- if (Addr == 0 && Size != 0)
+ if (!Addr && Size != 0)
return make_error<JITLinkError>(
StringRef(EHFrameSectionName) +
" section can not have zero address with non-zero size");
diff --git a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h
index b4c4b0f7b097..ef4b47b9aa28 100644
--- a/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h
+++ b/llvm/lib/ExecutionEngine/JITLink/EHFrameSupportImpl.h
@@ -71,12 +71,12 @@ private:
};
using BlockEdgeMap = DenseMap<Edge::OffsetT, EdgeTarget>;
- using CIEInfosMap = DenseMap<JITTargetAddress, CIEInformation>;
+ using CIEInfosMap = DenseMap<orc::ExecutorAddr, CIEInformation>;
struct ParseContext {
ParseContext(LinkGraph &G) : G(G) {}
- Expected<CIEInformation *> findCIEInfo(JITTargetAddress Address) {
+ Expected<CIEInformation *> findCIEInfo(orc::ExecutorAddr Address) {
auto I = CIEInfos.find(Address);
if (I == CIEInfos.end())
return make_error<JITLinkError>("No CIE found at address " +
@@ -102,12 +102,13 @@ private:
static bool isSupportedPointerEncoding(uint8_t PointerEncoding);
unsigned getPointerEncodingDataSize(uint8_t PointerEncoding);
- Expected<std::pair<JITTargetAddress, Edge::Kind>>
+ Expected<std::pair<orc::ExecutorAddr, Edge::Kind>>
readEncodedPointer(uint8_t PointerEncoding,
- JITTargetAddress PointerFieldAddress,
+ orc::ExecutorAddr PointerFieldAddress,
BinaryStreamReader &RecordReader);
- Expected<Symbol &> getOrCreateSymbol(ParseContext &PC, JITTargetAddress Addr);
+ Expected<Symbol &> getOrCreateSymbol(ParseContext &PC,
+ orc::ExecutorAddr Addr);
StringRef EHFrameSectionName;
unsigned PointerSize;
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
index f9101d71dfa8..2ab7ed61f71b 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
@@ -77,14 +77,14 @@ protected:
return Obj.getHeader().e_type == llvm::ELF::ET_REL;
}
- void setGraphSection(ELFSectionIndex SecIndex, Section &Sec) {
- assert(!GraphSections.count(SecIndex) && "Duplicate section at index");
- GraphSections[SecIndex] = &Sec;
+ void setGraphBlock(ELFSectionIndex SecIndex, Block *B) {
+ assert(!GraphBlocks.count(SecIndex) && "Duplicate section at index");
+ GraphBlocks[SecIndex] = B;
}
- Section *getGraphSection(ELFSectionIndex SecIndex) {
- auto I = GraphSections.find(SecIndex);
- if (I == GraphSections.end())
+ Block *getGraphBlock(ELFSectionIndex SecIndex) {
+ auto I = GraphBlocks.find(SecIndex);
+ if (I == GraphBlocks.end())
return nullptr;
return I->second;
}
@@ -139,9 +139,9 @@ protected:
const typename ELFFile::Elf_Shdr *SymTabSec = nullptr;
StringRef SectionStringTab;
- // Maps ELF section indexes to LinkGraph Sections.
- // Only SHF_ALLOC sections will have graph sections.
- DenseMap<ELFSectionIndex, Section *> GraphSections;
+ // Maps ELF section indexes to LinkGraph Blocks.
+ // Only SHF_ALLOC sections will have graph blocks.
+ DenseMap<ELFSectionIndex, Block *> GraphBlocks;
DenseMap<ELFSymbolIndex, Symbol *> GraphSymbols;
DenseMap<const typename ELFFile::Elf_Shdr *,
ArrayRef<typename ELFFile::Elf_Word>>
@@ -316,18 +316,27 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySections() {
else
Prot = MemProt::Read | MemProt::Write;
- auto &GraphSec = G->createSection(*Name, Prot);
+ // Look for existing sections first.
+ auto *GraphSec = G->findSectionByName(*Name);
+ if (!GraphSec)
+ GraphSec = &G->createSection(*Name, Prot);
+ assert(GraphSec->getMemProt() == Prot && "MemProt should match");
+
+ Block *B = nullptr;
if (Sec.sh_type != ELF::SHT_NOBITS) {
auto Data = Obj.template getSectionContentsAsArray<char>(Sec);
if (!Data)
return Data.takeError();
- G->createContentBlock(GraphSec, *Data, Sec.sh_addr, Sec.sh_addralign, 0);
+ B = &G->createContentBlock(*GraphSec, *Data,
+ orc::ExecutorAddr(Sec.sh_addr),
+ Sec.sh_addralign, 0);
} else
- G->createZeroFillBlock(GraphSec, Sec.sh_size, Sec.sh_addr,
- Sec.sh_addralign, 0);
+ B = &G->createZeroFillBlock(*GraphSec, Sec.sh_size,
+ orc::ExecutorAddr(Sec.sh_addr),
+ Sec.sh_addralign, 0);
- setGraphSection(SecIndex, GraphSec);
+ setGraphBlock(SecIndex, B);
}
return Error::success();
@@ -393,9 +402,9 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySymbols() {
// Handle common symbols specially.
if (Sym.isCommon()) {
- Symbol &GSym =
- G->addCommonSymbol(*Name, Scope::Default, getCommonSection(), 0,
- Sym.st_size, Sym.getValue(), false);
+ Symbol &GSym = G->addCommonSymbol(*Name, Scope::Default,
+ getCommonSection(), orc::ExecutorAddr(),
+ Sym.st_size, Sym.getValue(), false);
setGraphSymbol(SymIndex, GSym);
continue;
}
@@ -425,28 +434,24 @@ template <typename ELFT> Error ELFLinkGraphBuilder<ELFT>::graphifySymbols() {
return NdxOrErr.takeError();
Shndx = *NdxOrErr;
}
- if (auto *GraphSec = getGraphSection(Shndx)) {
- Block *B = nullptr;
- {
- auto Blocks = GraphSec->blocks();
- assert(Blocks.begin() != Blocks.end() && "No blocks for section");
- assert(std::next(Blocks.begin()) == Blocks.end() &&
- "Multiple blocks for section");
- B = *Blocks.begin();
- }
-
+ if (auto *B = getGraphBlock(Shndx)) {
LLVM_DEBUG({
dbgs() << " " << SymIndex
<< ": Creating defined graph symbol for ELF symbol \"" << *Name
<< "\"\n";
});
- if (Sym.getType() == ELF::STT_SECTION)
- *Name = GraphSec->getName();
-
+ // In RISC-V, temporary symbols (used to generate debug and eh_frame
+ // sections, among others) appear in the object file's symbol table, and
+ // LLVM does not put names on these temporary symbols (the RISC-V GNU
+ // toolchain does). If the symbol is unnamed, add an anonymous symbol
+ // instead.
auto &GSym =
- G->addDefinedSymbol(*B, Sym.getValue(), *Name, Sym.st_size, L, S,
- Sym.getType() == ELF::STT_FUNC, false);
+ Name->empty()
+ ? G->addAnonymousSymbol(*B, Sym.getValue(), Sym.st_size,
+ false, false)
+ : G->addDefinedSymbol(*B, Sym.getValue(), *Name, Sym.st_size, L,
+ S, Sym.getType() == ELF::STT_FUNC, false);
setGraphSymbol(SymIndex, GSym);
}
} else if (Sym.isUndefined() && Sym.isExternal()) {
@@ -498,8 +503,8 @@ Error ELFLinkGraphBuilder<ELFT>::forEachRelocation(
}
// Lookup the link-graph node corresponding to the target section name.
- Section *GraphSect = G->findSectionByName(*Name);
- if (!GraphSect)
+ auto *BlockToFix = getGraphBlock(RelSect.sh_info);
+ if (!BlockToFix)
return make_error<StringError>(
"Refencing a section that wasn't added to the graph: " + *Name,
inconvertibleErrorCode());
@@ -510,7 +515,7 @@ Error ELFLinkGraphBuilder<ELFT>::forEachRelocation(
// Let the callee process relocation entries one by one.
for (const typename ELFT::Rela &R : *RelEntries)
- if (Error Err = Func(R, **FixupSection, *GraphSect))
+ if (Error Err = Func(R, **FixupSection, *BlockToFix))
return Err;
LLVM_DEBUG(dbgs() << "\n");
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp
index dc183dfddfae..dd3eb97c21a0 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch64.cpp
@@ -16,6 +16,7 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/ExecutionEngine/JITLink/aarch64.h"
#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Support/MathExtras.h"
#define DEBUG_TYPE "jitlink"
@@ -41,16 +42,17 @@ private:
char *BlockWorkingMem = B.getAlreadyMutableContent().data();
char *FixupPtr = BlockWorkingMem + E.getOffset();
- JITTargetAddress FixupAddress = B.getAddress() + E.getOffset();
+ auto FixupAddress = B.getAddress() + E.getOffset();
switch (E.getKind()) {
case aarch64::R_AARCH64_CALL26: {
- assert((FixupAddress & 0x3) == 0 && "Call-inst is not 32-bit aligned");
+ assert((FixupAddress.getValue() & 0x3) == 0 &&
+ "Call-inst is not 32-bit aligned");
int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend();
if (static_cast<uint64_t>(Value) & 0x3)
return make_error<JITLinkError>("Call target is not 32-bit aligned");
- if (!fitsRangeSignedInt<27>(Value))
+ if (!isInt<28>(Value))
return makeTargetOutOfRangeError(G, B, E);
uint32_t RawInstr = *(little32_t *)FixupPtr;
@@ -64,10 +66,6 @@ private:
}
return Error::success();
}
-
- template <uint8_t Bits> static bool fitsRangeSignedInt(int64_t Value) {
- return Value >= -(1ll << Bits) && Value < (1ll << Bits);
- }
};
template <typename ELFT>
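The removed fitsRangeSignedInt<27>(Value) checked Value >= -(1 << 27) && Value < (1 << 27), which is exactly what llvm::isInt<28>(Value) tests, namely that Value fits a signed 28-bit immediate. A small sketch of the equivalence:

  #include <cstdint>
  #include <cstdio>

  // isInt<N>-style check: does V fit in a signed N-bit immediate?
  static bool fitsSignedBits(unsigned N, int64_t V) {
    return V >= -(int64_t(1) << (N - 1)) && V < (int64_t(1) << (N - 1));
  }

  int main() {
    std::printf("%d %d\n", fitsSignedBits(28, (1 << 27) - 1), // 1
                fitsSignedBits(28, 1 << 27));                 // 0
  }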
@@ -100,7 +98,7 @@ private:
Error addSingleRelocation(const typename ELFT::Rela &Rel,
const typename ELFT::Shdr &FixupSect,
- Section &GraphSection) {
+ Block &BlockToFix) {
using Base = ELFLinkGraphBuilder<ELFT>;
uint32_t SymbolIndex = Rel.getSymbol(false);
@@ -123,17 +121,17 @@ private:
return Kind.takeError();
int64_t Addend = Rel.r_addend;
- Block *BlockToFix = *(GraphSection.blocks().begin());
- JITTargetAddress FixupAddress = FixupSect.sh_addr + Rel.r_offset;
- Edge::OffsetT Offset = FixupAddress - BlockToFix->getAddress();
+ orc::ExecutorAddr FixupAddress =
+ orc::ExecutorAddr(FixupSect.sh_addr) + Rel.r_offset;
+ Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress();
Edge GE(*Kind, Offset, *GraphSymbol, Addend);
LLVM_DEBUG({
dbgs() << " ";
- printEdge(dbgs(), *BlockToFix, GE, aarch64::getEdgeKindName(*Kind));
+ printEdge(dbgs(), BlockToFix, GE, aarch64::getEdgeKindName(*Kind));
dbgs() << "\n";
});
- BlockToFix->addEdge(std::move(GE));
+ BlockToFix.addEdge(std::move(GE));
return Error::success();
}
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp
index b057788ce3ef..f83001417e94 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_riscv.cpp
@@ -19,6 +19,7 @@
#include "llvm/ExecutionEngine/JITLink/riscv.h"
#include "llvm/Object/ELF.h"
#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Support/Endian.h"
#define DEBUG_TYPE "jitlink"
using namespace llvm;
@@ -44,15 +45,16 @@ public:
bool isGOTEdgeToFix(Edge &E) const { return E.getKind() == R_RISCV_GOT_HI20; }
Symbol &createGOTEntry(Symbol &Target) {
- Block &GOTBlock = G.createContentBlock(
- getGOTSection(), getGOTEntryBlockContent(), 0, G.getPointerSize(), 0);
+ Block &GOTBlock =
+ G.createContentBlock(getGOTSection(), getGOTEntryBlockContent(),
+ orc::ExecutorAddr(), G.getPointerSize(), 0);
GOTBlock.addEdge(isRV64() ? R_RISCV_64 : R_RISCV_32, 0, Target, 0);
return G.addAnonymousSymbol(GOTBlock, 0, G.getPointerSize(), false, false);
}
Symbol &createPLTStub(Symbol &Target) {
- Block &StubContentBlock =
- G.createContentBlock(getStubsSection(), getStubBlockContent(), 0, 4, 0);
+ Block &StubContentBlock = G.createContentBlock(
+ getStubsSection(), getStubBlockContent(), orc::ExecutorAddr(), 4, 0);
auto &GOTEntrySymbol = getGOTEntry(Target);
StubContentBlock.addEdge(R_RISCV_CALL, 0, GOTEntrySymbol, 0);
return G.addAnonymousSymbol(StubContentBlock, 0, StubEntrySize, true,
@@ -134,13 +136,13 @@ static Expected<const Edge &> getRISCVPCRelHi20(const Edge &E) {
const Symbol &Sym = E.getTarget();
const Block &B = Sym.getBlock();
- JITTargetAddress Offset = Sym.getOffset();
+ orc::ExecutorAddrDiff Offset = Sym.getOffset();
struct Comp {
- bool operator()(const Edge &Lhs, JITTargetAddress Offset) {
+ bool operator()(const Edge &Lhs, orc::ExecutorAddrDiff Offset) {
return Lhs.getOffset() < Offset;
}
- bool operator()(JITTargetAddress Offset, const Edge &Rhs) {
+ bool operator()(orc::ExecutorAddrDiff Offset, const Edge &Rhs) {
return Offset < Rhs.getOffset();
}
};
@@ -157,8 +159,24 @@ static Expected<const Edge &> getRISCVPCRelHi20(const Edge &E) {
"No HI20 PCREL relocation type be found for LO12 PCREL relocation type");
}
-static uint32_t extractBits(uint64_t Num, unsigned High, unsigned Low) {
- return (Num & ((1ULL << (High + 1)) - 1)) >> Low;
+static uint32_t extractBits(uint32_t Num, unsigned Low, unsigned Size) {
+ return (Num & (((1ULL << (Size + 1)) - 1) << Low)) >> Low;
+}
+
+inline Error checkAlignment(llvm::orc::ExecutorAddr loc, uint64_t v, int n,
+ const Edge &E) {
+ if (v & (n - 1))
+ return make_error<JITLinkError>("0x" + llvm::utohexstr(loc.getValue()) +
+ " improper alignment for relocation " +
+ formatv("{0:d}", E.getKind()) + ": 0x" +
+ llvm::utohexstr(v) + " is not aligned to " +
+ Twine(n) + " bytes");
+ return Error::success();
+}
+
+static inline bool isInRangeForImmS32(int64_t Value) {
+ return (Value >= std::numeric_limits<int32_t>::min() &&
+ Value <= std::numeric_limits<int32_t>::max());
}
class ELFJITLinker_riscv : public JITLinker<ELFJITLinker_riscv> {
@@ -176,27 +194,47 @@ private:
char *BlockWorkingMem = B.getAlreadyMutableContent().data();
char *FixupPtr = BlockWorkingMem + E.getOffset();
- JITTargetAddress FixupAddress = B.getAddress() + E.getOffset();
+ orc::ExecutorAddr FixupAddress = B.getAddress() + E.getOffset();
switch (E.getKind()) {
case R_RISCV_32: {
- int64_t Value = E.getTarget().getAddress() + E.getAddend();
+ int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue();
*(little32_t *)FixupPtr = static_cast<uint32_t>(Value);
break;
}
case R_RISCV_64: {
- int64_t Value = E.getTarget().getAddress() + E.getAddend();
+ int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue();
*(little64_t *)FixupPtr = static_cast<uint64_t>(Value);
break;
}
+ case R_RISCV_BRANCH: {
+ int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress;
+ Error AlignmentIssue = checkAlignment(FixupAddress, Value, 2, E);
+ if (AlignmentIssue) {
+ return AlignmentIssue;
+ }
+ int64_t Lo = Value & 0xFFF;
+ uint32_t Imm31_25 = extractBits(Lo, 5, 6) << 25 | extractBits(Lo, 12, 1)
+ << 31;
+ uint32_t Imm11_7 = extractBits(Lo, 1, 4) << 8 | extractBits(Lo, 11, 1)
+ << 7;
+ uint32_t RawInstr = *(little32_t *)FixupPtr;
+ *(little32_t *)FixupPtr = (RawInstr & 0x1FFF07F) | Imm31_25 | Imm11_7;
+ break;
+ }
case R_RISCV_HI20: {
- int64_t Value = E.getTarget().getAddress() + E.getAddend();
- int32_t Hi = (Value + 0x800) & 0xFFFFF000;
+ int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue();
+ int64_t Hi = Value + 0x800;
+ if (LLVM_UNLIKELY(!isInRangeForImmS32(Hi)))
+ return makeTargetOutOfRangeError(G, B, E);
uint32_t RawInstr = *(little32_t *)FixupPtr;
- *(little32_t *)FixupPtr = (RawInstr & 0xFFF) | static_cast<uint32_t>(Hi);
+ *(little32_t *)FixupPtr =
+ (RawInstr & 0xFFF) | (static_cast<uint32_t>(Hi & 0xFFFFF000));
break;
}
case R_RISCV_LO12_I: {
- int64_t Value = E.getTarget().getAddress() + E.getAddend();
+ // FIXME: We assume that R_RISCV_HI20 is present in the object code and
+ // pairs with the current R_RISCV_LO12_I relocation, so a check may be
+ // needed here.
+ int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue();
int32_t Lo = Value & 0xFFF;
uint32_t RawInstr = *(little32_t *)FixupPtr;
*(little32_t *)FixupPtr =
@@ -205,23 +243,32 @@ private:
}
case R_RISCV_CALL: {
int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress;
- int32_t Hi = (Value + 0x800) & 0xFFFFF000;
+ int64_t Hi = Value + 0x800;
+ if (LLVM_UNLIKELY(!isInRangeForImmS32(Hi)))
+ return makeTargetOutOfRangeError(G, B, E);
int32_t Lo = Value & 0xFFF;
uint32_t RawInstrAuipc = *(little32_t *)FixupPtr;
uint32_t RawInstrJalr = *(little32_t *)(FixupPtr + 4);
- *(little32_t *)FixupPtr = RawInstrAuipc | static_cast<uint32_t>(Hi);
+ *(little32_t *)FixupPtr =
+ RawInstrAuipc | (static_cast<uint32_t>(Hi & 0xFFFFF000));
*(little32_t *)(FixupPtr + 4) =
RawInstrJalr | (static_cast<uint32_t>(Lo) << 20);
break;
}
case R_RISCV_PCREL_HI20: {
int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress;
- int32_t Hi = (Value + 0x800) & 0xFFFFF000;
+ int64_t Hi = Value + 0x800;
+ if (LLVM_UNLIKELY(!isInRangeForImmS32(Hi)))
+ return makeTargetOutOfRangeError(G, B, E);
uint32_t RawInstr = *(little32_t *)FixupPtr;
- *(little32_t *)FixupPtr = (RawInstr & 0xFFF) | static_cast<uint32_t>(Hi);
+ *(little32_t *)FixupPtr =
+ (RawInstr & 0xFFF) | (static_cast<uint32_t>(Hi & 0xFFFFF000));
break;
}
case R_RISCV_PCREL_LO12_I: {
+ // FIXME: We assume that R_RISCV_PCREL_HI20 is present in the object code
+ // and pairs with the current R_RISCV_PCREL_LO12_I relocation, so a check
+ // may be needed here.
auto RelHI20 = getRISCVPCRelHi20(E);
if (!RelHI20)
return RelHI20.takeError();
@@ -234,17 +281,117 @@ private:
break;
}
case R_RISCV_PCREL_LO12_S: {
+ // FIXME: We assume that R_RISCV_PCREL_HI20 is present in the object code
+ // and pairs with the current R_RISCV_PCREL_LO12_S relocation, so a check
+ // may be needed here.
auto RelHI20 = getRISCVPCRelHi20(E);
int64_t Value = RelHI20->getTarget().getAddress() +
RelHI20->getAddend() - E.getTarget().getAddress();
int64_t Lo = Value & 0xFFF;
- uint32_t Imm31_25 = extractBits(Lo, 11, 5) << 25;
- uint32_t Imm11_7 = extractBits(Lo, 4, 0) << 7;
+ uint32_t Imm31_25 = extractBits(Lo, 5, 7) << 25;
+ uint32_t Imm11_7 = extractBits(Lo, 0, 5) << 7;
uint32_t RawInstr = *(little32_t *)FixupPtr;
*(little32_t *)FixupPtr = (RawInstr & 0x1FFF07F) | Imm31_25 | Imm11_7;
break;
}
+ case R_RISCV_ADD64: {
+ int64_t Value = (E.getTarget().getAddress() +
+ support::endian::read64le(reinterpret_cast<const void *>(
+ FixupAddress.getValue())) +
+ E.getAddend())
+ .getValue();
+ *(little64_t *)FixupPtr = static_cast<uint64_t>(Value);
+ break;
+ }
+ case R_RISCV_ADD32: {
+ int64_t Value = (E.getTarget().getAddress() +
+ support::endian::read32le(reinterpret_cast<const void *>(
+ FixupAddress.getValue())) +
+ E.getAddend())
+ .getValue();
+ *(little32_t *)FixupPtr = static_cast<uint32_t>(Value);
+ break;
+ }
+ case R_RISCV_ADD16: {
+ int64_t Value = (E.getTarget().getAddress() +
+ support::endian::read16le(reinterpret_cast<const void *>(
+ FixupAddress.getValue())) +
+ E.getAddend())
+ .getValue();
+ *(little16_t *)FixupPtr = static_cast<uint32_t>(Value);
+ break;
+ }
+ case R_RISCV_ADD8: {
+ int64_t Value =
+ (E.getTarget().getAddress() +
+ *(reinterpret_cast<const uint8_t *>(FixupAddress.getValue())) +
+ E.getAddend())
+ .getValue();
+ *FixupPtr = static_cast<uint8_t>(Value);
+ break;
+ }
+ case R_RISCV_SUB64: {
+ int64_t Value = support::endian::read64le(reinterpret_cast<const void *>(
+ FixupAddress.getValue())) -
+ E.getTarget().getAddress().getValue() - E.getAddend();
+ *(little64_t *)FixupPtr = static_cast<uint64_t>(Value);
+ break;
+ }
+ case R_RISCV_SUB32: {
+ int64_t Value = support::endian::read32le(reinterpret_cast<const void *>(
+ FixupAddress.getValue())) -
+ E.getTarget().getAddress().getValue() - E.getAddend();
+ *(little32_t *)FixupPtr = static_cast<uint32_t>(Value);
+ break;
+ }
+ case R_RISCV_SUB16: {
+ int64_t Value = support::endian::read16le(reinterpret_cast<const void *>(
+ FixupAddress.getValue())) -
+ E.getTarget().getAddress().getValue() - E.getAddend();
+ *(little16_t *)FixupPtr = static_cast<uint32_t>(Value);
+ break;
+ }
+ case R_RISCV_SUB8: {
+ int64_t Value =
+ *(reinterpret_cast<const uint8_t *>(FixupAddress.getValue())) -
+ E.getTarget().getAddress().getValue() - E.getAddend();
+ *FixupPtr = static_cast<uint8_t>(Value);
+ break;
+ }
+ case R_RISCV_SET6: {
+ int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue();
+ uint32_t RawData = *(little32_t *)FixupPtr;
+ int64_t Word6 = Value & 0x3f;
+ *(little32_t *)FixupPtr = (RawData & 0xffffffc0) | Word6;
+ break;
+ }
+ case R_RISCV_SET8: {
+ int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue();
+ uint32_t RawData = *(little32_t *)FixupPtr;
+ int64_t Word8 = Value & 0xff;
+ *(little32_t *)FixupPtr = (RawData & 0xffffff00) | Word8;
+ break;
+ }
+ case R_RISCV_SET16: {
+ int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue();
+ uint32_t RawData = *(little32_t *)FixupPtr;
+ int64_t Word16 = Value & 0xffff;
+ *(little32_t *)FixupPtr = (RawData & 0xffff0000) | Word16;
+ break;
+ }
+ case R_RISCV_SET32: {
+ int64_t Value = (E.getTarget().getAddress() + E.getAddend()).getValue();
+ int64_t Word32 = Value & 0xffffffff;
+ *(little32_t *)FixupPtr = Word32;
+ break;
+ }
+ case R_RISCV_32_PCREL: {
+ int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress;
+ int64_t Word32 = Value & 0xffffffff;
+ *(little32_t *)FixupPtr = Word32;
+ break;
+ }
}
return Error::success();
}
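
The PCREL_HI20/LO12 cases above split a PC-relative offset into a rounded upper 20 bits and a signed lower 12 bits. A minimal standalone sketch of that arithmetic, with illustrative values and no JITLink types, assuming the offset fits the signed 32-bit range checked above:

#include <cassert>
#include <cstdint>

int main() {
  // Example PC-relative offset; assumed to be within the range that the
  // fixup validates with isInRangeForImmS32.
  int64_t Value = 0x12345;

  // Round by +0x800 so the sign-extended lo12 folds back into the target.
  int64_t Hi20 = (Value + 0x800) & ~int64_t(0xFFF); // AUIPC U-immediate bits
  int64_t Lo12 = Value & 0xFFF;                     // I/S-type immediate bits
  if (Lo12 >= 0x800)
    Lo12 -= 0x1000;                                 // hardware sign-extends lo12

  assert(Hi20 + Lo12 == Value);
  return 0;
}
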
@@ -261,6 +408,8 @@ private:
return EdgeKind_riscv::R_RISCV_32;
case ELF::R_RISCV_64:
return EdgeKind_riscv::R_RISCV_64;
+ case ELF::R_RISCV_BRANCH:
+ return EdgeKind_riscv::R_RISCV_BRANCH;
case ELF::R_RISCV_HI20:
return EdgeKind_riscv::R_RISCV_HI20;
case ELF::R_RISCV_LO12_I:
@@ -277,6 +426,32 @@ private:
return EdgeKind_riscv::R_RISCV_GOT_HI20;
case ELF::R_RISCV_CALL_PLT:
return EdgeKind_riscv::R_RISCV_CALL_PLT;
+ case ELF::R_RISCV_ADD64:
+ return EdgeKind_riscv::R_RISCV_ADD64;
+ case ELF::R_RISCV_ADD32:
+ return EdgeKind_riscv::R_RISCV_ADD32;
+ case ELF::R_RISCV_ADD16:
+ return EdgeKind_riscv::R_RISCV_ADD16;
+ case ELF::R_RISCV_ADD8:
+ return EdgeKind_riscv::R_RISCV_ADD8;
+ case ELF::R_RISCV_SUB64:
+ return EdgeKind_riscv::R_RISCV_SUB64;
+ case ELF::R_RISCV_SUB32:
+ return EdgeKind_riscv::R_RISCV_SUB32;
+ case ELF::R_RISCV_SUB16:
+ return EdgeKind_riscv::R_RISCV_SUB16;
+ case ELF::R_RISCV_SUB8:
+ return EdgeKind_riscv::R_RISCV_SUB8;
+ case ELF::R_RISCV_SET6:
+ return EdgeKind_riscv::R_RISCV_SET6;
+ case ELF::R_RISCV_SET8:
+ return EdgeKind_riscv::R_RISCV_SET8;
+ case ELF::R_RISCV_SET16:
+ return EdgeKind_riscv::R_RISCV_SET16;
+ case ELF::R_RISCV_SET32:
+ return EdgeKind_riscv::R_RISCV_SET32;
+ case ELF::R_RISCV_32_PCREL:
+ return EdgeKind_riscv::R_RISCV_32_PCREL;
}
return make_error<JITLinkError>("Unsupported riscv relocation:" +
@@ -298,7 +473,7 @@ private:
Error addSingleRelocation(const typename ELFT::Rela &Rel,
const typename ELFT::Shdr &FixupSect,
- Section &GraphSection) {
+ Block &BlockToFix) {
using Base = ELFLinkGraphBuilder<ELFT>;
uint32_t SymbolIndex = Rel.getSymbol(false);
@@ -321,17 +496,16 @@ private:
return Kind.takeError();
int64_t Addend = Rel.r_addend;
- Block *BlockToFix = *(GraphSection.blocks().begin());
- JITTargetAddress FixupAddress = FixupSect.sh_addr + Rel.r_offset;
- Edge::OffsetT Offset = FixupAddress - BlockToFix->getAddress();
+ auto FixupAddress = orc::ExecutorAddr(FixupSect.sh_addr) + Rel.r_offset;
+ Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress();
Edge GE(*Kind, Offset, *GraphSymbol, Addend);
LLVM_DEBUG({
dbgs() << " ";
- printEdge(dbgs(), *BlockToFix, GE, riscv::getEdgeKindName(*Kind));
+ printEdge(dbgs(), BlockToFix, GE, riscv::getEdgeKindName(*Kind));
dbgs() << "\n";
});
- BlockToFix->addEdge(std::move(GE));
+ BlockToFix.addEdge(std::move(GE));
return Error::success();
}
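
The ADD*/SUB*/SET* fixups added above are read-modify-write: the bytes already stored at the fixup address act as an in-place accumulator or as bits to preserve. A hedged sketch of that pattern on a plain byte buffer; the helper names and the little-endian host assumption are illustrative, not taken from the patch:

#include <cassert>
#include <cstdint>
#include <cstring>

// SET6-style fixup: keep the top two bits of the byte, replace the low six.
static void applySet6(uint8_t *FixupPtr, uint64_t Value) {
  uint8_t Raw = *FixupPtr;
  *FixupPtr = static_cast<uint8_t>((Raw & 0xC0) | (Value & 0x3F));
}

// ADD32-style fixup: the bytes already hold a value; add to it in place.
static void applyAdd32(uint8_t *FixupPtr, int64_t Delta) {
  uint32_t Existing;
  std::memcpy(&Existing, FixupPtr, sizeof(Existing)); // assumes little-endian host
  uint32_t Updated = Existing + static_cast<uint32_t>(Delta);
  std::memcpy(FixupPtr, &Updated, sizeof(Updated));
}

int main() {
  uint8_t Byte = 0xFF;
  applySet6(&Byte, 0x2A);
  assert(Byte == 0xEA);          // top two bits preserved, low six replaced

  uint8_t Word[4] = {0x10, 0, 0, 0};
  applyAdd32(Word, 0x20);
  assert(Word[0] == 0x30);       // in-place value accumulated the delta
  return 0;
}
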
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
index 27d8833ae19e..79d2cdbb30f1 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
@@ -59,8 +59,8 @@ public:
// the TLS Info entry's key value will be written by the fixTLVSectionByName
// pass, so create mutable content.
auto &TLSInfoEntry = G.createMutableContentBlock(
- getTLSInfoSection(G), G.allocateContent(getTLSInfoEntryContent()), 0, 8,
- 0);
+ getTLSInfoSection(G), G.allocateContent(getTLSInfoEntryContent()),
+ orc::ExecutorAddr(), 8, 0);
TLSInfoEntry.addEdge(x86_64::Pointer64, 8, Target, 0);
return G.addAnonymousSymbol(TLSInfoEntry, 0, 16, false, false);
}
@@ -172,7 +172,7 @@ private:
Error addSingleRelocation(const typename ELFT::Rela &Rel,
const typename ELFT::Shdr &FixupSection,
- Section &GraphSection) {
+ Block &BlockToFix) {
using Base = ELFLinkGraphBuilder<ELFT>;
uint32_t SymbolIndex = Rel.getSymbol(false);
@@ -248,17 +248,16 @@ private:
}
}
- Block *BlockToFix = *(GraphSection.blocks().begin());
- JITTargetAddress FixupAddress = FixupSection.sh_addr + Rel.r_offset;
- Edge::OffsetT Offset = FixupAddress - BlockToFix->getAddress();
+ auto FixupAddress = orc::ExecutorAddr(FixupSection.sh_addr) + Rel.r_offset;
+ Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress();
Edge GE(Kind, Offset, *GraphSymbol, Addend);
LLVM_DEBUG({
dbgs() << " ";
- printEdge(dbgs(), *BlockToFix, GE, x86_64::getEdgeKindName(Kind));
+ printEdge(dbgs(), BlockToFix, GE, x86_64::getEdgeKindName(Kind));
dbgs() << "\n";
});
- BlockToFix->addEdge(std::move(GE));
+ BlockToFix.addEdge(std::move(GE));
return Error::success();
}
@@ -322,8 +321,9 @@ private:
// If there's no defined symbol then create one.
SectionRange SR(*GOTSection);
if (SR.empty())
- GOTSymbol = &G.addAbsoluteSymbol(ELFGOTSymbolName, 0, 0,
- Linkage::Strong, Scope::Local, true);
+ GOTSymbol =
+ &G.addAbsoluteSymbol(ELFGOTSymbolName, orc::ExecutorAddr(), 0,
+ Linkage::Strong, Scope::Local, true);
else
GOTSymbol =
&G.addDefinedSymbol(*SR.getFirstBlock(), 0, ELFGOTSymbolName, 0,
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
index 51dcc1c35fad..78a603cfed17 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
@@ -90,8 +90,8 @@ const char *getScopeName(Scope S) {
}
raw_ostream &operator<<(raw_ostream &OS, const Block &B) {
- return OS << formatv("{0:x16}", B.getAddress()) << " -- "
- << formatv("{0:x8}", B.getAddress() + B.getSize()) << ": "
+ return OS << B.getAddress() << " -- " << (B.getAddress() + B.getSize())
+ << ": "
<< "size = " << formatv("{0:x8}", B.getSize()) << ", "
<< (B.isZeroFill() ? "zero-fill" : "content")
<< ", align = " << B.getAlignment()
@@ -100,9 +100,8 @@ raw_ostream &operator<<(raw_ostream &OS, const Block &B) {
}
raw_ostream &operator<<(raw_ostream &OS, const Symbol &Sym) {
- OS << formatv("{0:x16}", Sym.getAddress()) << " ("
- << (Sym.isDefined() ? "block" : "addressable") << " + "
- << formatv("{0:x8}", Sym.getOffset())
+ OS << Sym.getAddress() << " (" << (Sym.isDefined() ? "block" : "addressable")
+ << " + " << formatv("{0:x8}", Sym.getOffset())
<< "): size: " << formatv("{0:x8}", Sym.getSize())
<< ", linkage: " << formatv("{0:6}", getLinkageName(Sym.getLinkage()))
<< ", scope: " << formatv("{0:8}", getScopeName(Sym.getScope())) << ", "
@@ -113,9 +112,9 @@ raw_ostream &operator<<(raw_ostream &OS, const Symbol &Sym) {
void printEdge(raw_ostream &OS, const Block &B, const Edge &E,
StringRef EdgeKindName) {
- OS << "edge@" << formatv("{0:x16}", B.getAddress() + E.getOffset()) << ": "
- << formatv("{0:x16}", B.getAddress()) << " + "
- << formatv("{0:x}", E.getOffset()) << " -- " << EdgeKindName << " -> ";
+ OS << "edge@" << B.getAddress() + E.getOffset() << ": " << B.getAddress()
+ << " + " << formatv("{0:x}", E.getOffset()) << " -- " << EdgeKindName
+ << " -> ";
auto &TargetSym = E.getTarget();
if (TargetSym.hasName())
@@ -123,17 +122,16 @@ void printEdge(raw_ostream &OS, const Block &B, const Edge &E,
else {
auto &TargetBlock = TargetSym.getBlock();
auto &TargetSec = TargetBlock.getSection();
- JITTargetAddress SecAddress = ~JITTargetAddress(0);
+ orc::ExecutorAddr SecAddress(~uint64_t(0));
for (auto *B : TargetSec.blocks())
if (B->getAddress() < SecAddress)
SecAddress = B->getAddress();
- JITTargetAddress SecDelta = TargetSym.getAddress() - SecAddress;
- OS << formatv("{0:x16}", TargetSym.getAddress()) << " (section "
- << TargetSec.getName();
+ orc::ExecutorAddrDiff SecDelta = TargetSym.getAddress() - SecAddress;
+ OS << TargetSym.getAddress() << " (section " << TargetSec.getName();
if (SecDelta)
OS << " + " << formatv("{0:x}", SecDelta);
- OS << " / block " << formatv("{0:x16}", TargetBlock.getAddress());
+ OS << " / block " << TargetBlock.getAddress();
if (TargetSym.getOffset())
OS << " + " << formatv("{0:x}", TargetSym.getOffset());
OS << ")";
@@ -265,7 +263,7 @@ void LinkGraph::dump(raw_ostream &OS) {
});
for (auto *B : SortedBlocks) {
- OS << " block " << formatv("{0:x16}", B->getAddress())
+ OS << " block " << B->getAddress()
<< " size = " << formatv("{0:x8}", B->getSize())
<< ", align = " << B->getAlignment()
<< ", alignment-offset = " << B->getAlignmentOffset();
@@ -290,9 +288,8 @@ void LinkGraph::dump(raw_ostream &OS) {
return LHS.getOffset() < RHS.getOffset();
});
for (auto &E : SortedEdges) {
- OS << " " << formatv("{0:x16}", B->getFixupAddress(E))
- << " (block + " << formatv("{0:x8}", E.getOffset())
- << "), addend = ";
+ OS << " " << B->getFixupAddress(E) << " (block + "
+ << formatv("{0:x8}", E.getOffset()) << "), addend = ";
if (E.getAddend() >= 0)
OS << formatv("+{0:x8}", E.getAddend());
else
@@ -315,16 +312,14 @@ void LinkGraph::dump(raw_ostream &OS) {
OS << "Absolute symbols:\n";
if (!llvm::empty(absolute_symbols())) {
for (auto *Sym : absolute_symbols())
- OS << " " << format("0x%016" PRIx64, Sym->getAddress()) << ": " << *Sym
- << "\n";
+ OS << " " << Sym->getAddress() << ": " << *Sym << "\n";
} else
OS << " none\n";
OS << "\nExternal symbols:\n";
if (!llvm::empty(external_symbols())) {
for (auto *Sym : external_symbols())
- OS << " " << format("0x%016" PRIx64, Sym->getAddress()) << ": " << *Sym
- << "\n";
+ OS << " " << Sym->getAddress() << ": " << *Sym << "\n";
} else
OS << " none\n";
}
@@ -370,10 +365,13 @@ Error makeTargetOutOfRangeError(const LinkGraph &G, const Block &B,
Section &Sec = B.getSection();
ErrStream << "In graph " << G.getName() << ", section " << Sec.getName()
<< ": relocation target ";
- if (E.getTarget().hasName())
- ErrStream << "\"" << E.getTarget().getName() << "\" ";
- ErrStream << "at address " << formatv("{0:x}", E.getTarget().getAddress());
- ErrStream << " is out of range of " << G.getEdgeKindName(E.getKind())
+ if (E.getTarget().hasName()) {
+ ErrStream << "\"" << E.getTarget().getName() << "\"";
+ } else
+ ErrStream << E.getTarget().getBlock().getSection().getName() << " + "
+ << formatv("{0:x}", E.getOffset());
+ ErrStream << " at address " << formatv("{0:x}", E.getTarget().getAddress())
+ << " is out of range of " << G.getEdgeKindName(E.getKind())
<< " fixup at " << formatv("{0:x}", B.getFixupAddress(E)) << " (";
Symbol *BestSymbolForBlock = nullptr;
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
index 706688aba4ec..35ee050c8566 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
@@ -192,7 +192,7 @@ JITLinkContext::LookupMap JITLinkerBase::getExternalSymbolNames() const {
// Identify unresolved external symbols.
JITLinkContext::LookupMap UnresolvedExternals;
for (auto *Sym : G->external_symbols()) {
- assert(Sym->getAddress() == 0 &&
+ assert(!Sym->getAddress() &&
"External has already been assigned an address");
assert(Sym->getName() != StringRef() && Sym->getName() != "" &&
"Externals must be named");
@@ -209,11 +209,12 @@ void JITLinkerBase::applyLookupResult(AsyncLookupResult Result) {
for (auto *Sym : G->external_symbols()) {
assert(Sym->getOffset() == 0 &&
"External symbol is not at the start of its addressable block");
- assert(Sym->getAddress() == 0 && "Symbol already resolved");
+ assert(!Sym->getAddress() && "Symbol already resolved");
assert(!Sym->isDefined() && "Symbol being resolved is already defined");
auto ResultI = Result.find(Sym->getName());
if (ResultI != Result.end())
- Sym->getAddressable().setAddress(ResultI->second.getAddress());
+ Sym->getAddressable().setAddress(
+ orc::ExecutorAddr(ResultI->second.getAddress()));
else
assert(Sym->getLinkage() == Linkage::Weak &&
"Failed to resolve non-weak reference");
@@ -223,7 +224,7 @@ void JITLinkerBase::applyLookupResult(AsyncLookupResult Result) {
dbgs() << "Externals after applying lookup result:\n";
for (auto *Sym : G->external_symbols())
dbgs() << " " << Sym->getName() << ": "
- << formatv("{0:x16}", Sym->getAddress()) << "\n";
+ << formatv("{0:x16}", Sym->getAddress().getValue()) << "\n";
});
}
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
index e4fdda0783a4..1095fa5ce701 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
@@ -19,9 +19,6 @@
#define DEBUG_TYPE "jitlink"
namespace llvm {
-
-class MemoryBufferRef;
-
namespace jitlink {
/// Base class for a JIT linker.
@@ -161,4 +158,4 @@ void prune(LinkGraph &G);
#undef DEBUG_TYPE // "jitlink"
-#endif // LLVM_EXECUTIONENGINE_JITLINK_JITLINKGENERIC_H
+#endif // LIB_EXECUTIONENGINE_JITLINK_JITLINKGENERIC_H
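
Much of the churn in these files replaces raw JITTargetAddress integers with orc::ExecutorAddr, which only exposes its value through getValue(), fromPtr()/toPtr() and address arithmetic. A deliberately simplified stand-in type (not the real class, whose full interface lives in the Orc headers) shows the operations the call sites above rely on:

#include <cassert>
#include <cstdint>

// Simplified stand-in for orc::ExecutorAddr, for illustration only.
class ToyExecutorAddr {
public:
  ToyExecutorAddr() = default;
  explicit ToyExecutorAddr(uint64_t Addr) : Addr(Addr) {}

  uint64_t getValue() const { return Addr; }
  explicit operator bool() const { return Addr != 0; } // enables `if (!Sym->getAddress())`

  template <typename T> static ToyExecutorAddr fromPtr(T *P) {
    return ToyExecutorAddr(reinterpret_cast<uintptr_t>(P));
  }
  template <typename T> T toPtr() const { return reinterpret_cast<T>(Addr); }

  friend ToyExecutorAddr operator+(ToyExecutorAddr A, uint64_t D) {
    return ToyExecutorAddr(A.Addr + D);
  }
  friend uint64_t operator-(ToyExecutorAddr A, ToyExecutorAddr B) {
    return A.Addr - B.Addr;
  }

private:
  uint64_t Addr = 0;
};

int main() {
  int X = 0;
  auto A = ToyExecutorAddr::fromPtr(&X);
  assert(A.toPtr<int *>() == &X);   // round-trip through the wrapper
  assert((A + 16) - A == 16);       // address + offset, address - address = diff
  assert(!ToyExecutorAddr());       // default-constructed means "unassigned"
  return 0;
}
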
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
index 831b9b26d2fd..9315ac4f6120 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
@@ -15,63 +15,12 @@
using namespace llvm;
-namespace {
-
-// FIXME: Remove this copy of CWrapperFunctionResult as soon as JITLink can
-// depend on shared utils from Orc.
-
-// Must be kept in-sync with compiler-rt/lib/orc/c-api.h.
-union CWrapperFunctionResultDataUnion {
- char *ValuePtr;
- char Value[sizeof(ValuePtr)];
-};
-
-// Must be kept in-sync with compiler-rt/lib/orc/c-api.h.
-typedef struct {
- CWrapperFunctionResultDataUnion Data;
- size_t Size;
-} CWrapperFunctionResult;
-
-Error toError(CWrapperFunctionResult R) {
- bool HasError = false;
- std::string ErrMsg;
- if (R.Size) {
- bool Large = R.Size > sizeof(CWrapperFunctionResultDataUnion);
- char *Content = Large ? R.Data.ValuePtr : R.Data.Value;
- if (Content[0]) {
- HasError = true;
- constexpr unsigned StrStart = 1 + sizeof(uint64_t);
- ErrMsg.resize(R.Size - StrStart);
- memcpy(&ErrMsg[0], Content + StrStart, R.Size - StrStart);
- }
- if (Large)
- free(R.Data.ValuePtr);
- } else if (R.Data.ValuePtr) {
- HasError = true;
- ErrMsg = R.Data.ValuePtr;
- free(R.Data.ValuePtr);
- }
-
- if (HasError)
- return make_error<StringError>(std::move(ErrMsg), inconvertibleErrorCode());
- return Error::success();
-}
-} // namespace
-
namespace llvm {
namespace jitlink {
JITLinkMemoryManager::~JITLinkMemoryManager() = default;
JITLinkMemoryManager::InFlightAlloc::~InFlightAlloc() = default;
-static Error runAllocAction(JITLinkMemoryManager::AllocActionCall &C) {
- using WrapperFnTy = CWrapperFunctionResult (*)(const void *, size_t);
- auto *Fn = jitTargetAddressToPointer<WrapperFnTy>(C.FnAddr);
-
- return toError(Fn(jitTargetAddressToPointer<const void *>(C.CtxAddr),
- static_cast<size_t>(C.CtxSize)));
-}
-
BasicLayout::BasicLayout(LinkGraph &G) : G(G) {
for (auto &Sec : G.sections()) {
@@ -189,7 +138,7 @@ Error BasicLayout::apply() {
return Error::success();
}
-JITLinkMemoryManager::AllocActions &BasicLayout::graphAllocActions() {
+orc::shared::AllocActions &BasicLayout::graphAllocActions() {
return G.allocActions();
}
@@ -209,7 +158,7 @@ void SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr,
std::make_unique<LinkGraph>("", Triple(), 0, support::native, nullptr);
AllocGroupSmallMap<Block *> ContentBlocks;
- JITTargetAddress NextAddr = 0x100000;
+ orc::ExecutorAddr NextAddr(0x100000);
for (auto &KV : Segments) {
auto &AG = KV.first;
auto &Seg = KV.second;
@@ -222,7 +171,8 @@ void SimpleSegmentAlloc::Create(JITLinkMemoryManager &MemMgr,
Sec.setMemDeallocPolicy(AG.getMemDeallocPolicy());
if (Seg.ContentSize != 0) {
- NextAddr = alignTo(NextAddr, Seg.ContentAlign);
+ NextAddr =
+ orc::ExecutorAddr(alignTo(NextAddr.getValue(), Seg.ContentAlign));
auto &B =
G->createMutableContentBlock(Sec, G->allocateBuffer(Seg.ContentSize),
NextAddr, Seg.ContentAlign.value(), 0);
@@ -297,19 +247,11 @@ public:
}
// Run finalization actions.
- // FIXME: Roll back previous successful actions on failure.
- std::vector<AllocActionCall> DeallocActions;
- DeallocActions.reserve(G.allocActions().size());
- for (auto &ActPair : G.allocActions()) {
- if (ActPair.Finalize.FnAddr)
- if (auto Err = runAllocAction(ActPair.Finalize)) {
- OnFinalized(std::move(Err));
- return;
- }
- if (ActPair.Dealloc.FnAddr)
- DeallocActions.push_back(ActPair.Dealloc);
+ auto DeallocActions = runFinalizeActions(G.allocActions());
+ if (!DeallocActions) {
+ OnFinalized(DeallocActions.takeError());
+ return;
}
- G.allocActions().clear();
// Release the finalize segments slab.
if (auto EC = sys::Memory::releaseMappedMemory(FinalizationSegments)) {
@@ -319,7 +261,7 @@ public:
// Continue with finalized allocation.
OnFinalized(MemMgr.createFinalizedAlloc(std::move(StandardSegments),
- std::move(DeallocActions)));
+ std::move(*DeallocActions)));
}
void abandon(OnAbandonedFunction OnAbandoned) override {
@@ -428,8 +370,8 @@ void InProcessMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G,
static_cast<size_t>(SegsSizes->FinalizeSegs)};
}
- auto NextStandardSegAddr = pointerToJITTargetAddress(StandardSegsMem.base());
- auto NextFinalizeSegAddr = pointerToJITTargetAddress(FinalizeSegsMem.base());
+ auto NextStandardSegAddr = orc::ExecutorAddr::fromPtr(StandardSegsMem.base());
+ auto NextFinalizeSegAddr = orc::ExecutorAddr::fromPtr(FinalizeSegsMem.base());
LLVM_DEBUG({
dbgs() << "InProcessMemoryManager allocated:\n";
@@ -456,7 +398,7 @@ void InProcessMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G,
? NextStandardSegAddr
: NextFinalizeSegAddr;
- Seg.WorkingMem = jitTargetAddressToPointer<char *>(SegAddr);
+ Seg.WorkingMem = SegAddr.toPtr<char *>();
Seg.Addr = SegAddr;
SegAddr += alignTo(Seg.ContentSize + Seg.ZeroFillSize, PageSize);
@@ -475,13 +417,12 @@ void InProcessMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G,
void InProcessMemoryManager::deallocate(std::vector<FinalizedAlloc> Allocs,
OnDeallocatedFunction OnDeallocated) {
std::vector<sys::MemoryBlock> StandardSegmentsList;
- std::vector<std::vector<AllocActionCall>> DeallocActionsList;
+ std::vector<std::vector<orc::shared::WrapperFunctionCall>> DeallocActionsList;
{
std::lock_guard<std::mutex> Lock(FinalizedAllocsMutex);
for (auto &Alloc : Allocs) {
- auto *FA =
- jitTargetAddressToPointer<FinalizedAllocInfo *>(Alloc.release());
+ auto *FA = Alloc.release().toPtr<FinalizedAllocInfo *>();
StandardSegmentsList.push_back(std::move(FA->StandardSegments));
if (!FA->DeallocActions.empty())
DeallocActionsList.push_back(std::move(FA->DeallocActions));
@@ -498,7 +439,7 @@ void InProcessMemoryManager::deallocate(std::vector<FinalizedAlloc> Allocs,
/// Run any deallocate calls.
while (!DeallocActions.empty()) {
- if (auto Err = runAllocAction(DeallocActions.back()))
+ if (auto Err = DeallocActions.back().runWithSPSRetErrorMerged())
DeallocErr = joinErrors(std::move(DeallocErr), std::move(Err));
DeallocActions.pop_back();
}
@@ -517,12 +458,12 @@ void InProcessMemoryManager::deallocate(std::vector<FinalizedAlloc> Allocs,
JITLinkMemoryManager::FinalizedAlloc
InProcessMemoryManager::createFinalizedAlloc(
sys::MemoryBlock StandardSegments,
- std::vector<AllocActionCall> DeallocActions) {
+ std::vector<orc::shared::WrapperFunctionCall> DeallocActions) {
std::lock_guard<std::mutex> Lock(FinalizedAllocsMutex);
auto *FA = FinalizedAllocInfos.Allocate<FinalizedAllocInfo>();
new (FA) FinalizedAllocInfo(
{std::move(StandardSegments), std::move(DeallocActions)});
- return FinalizedAlloc(pointerToJITTargetAddress(FA));
+ return FinalizedAlloc(orc::ExecutorAddr::fromPtr(FA));
}
} // end namespace jitlink
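
The InProcessMemoryManager change swaps the hand-rolled loop for the shared runFinalizeActions() helper: run each finalize action and, on success, hand back the matching dealloc actions to run at teardown. A generic sketch of that pairing using plain std::function callbacks rather than the real orc::shared types:

#include <functional>
#include <iostream>
#include <optional>
#include <utility>
#include <vector>

struct ToyActionPair {
  std::function<bool()> Finalize; // returns false on failure
  std::function<void()> Dealloc;  // queued only if finalization reached it
};

// Run all finalize actions; on success return the dealloc actions to run later.
static std::optional<std::vector<std::function<void()>>>
runFinalize(std::vector<ToyActionPair> &Actions) {
  std::vector<std::function<void()>> DeallocActions;
  for (auto &P : Actions) {
    if (P.Finalize && !P.Finalize())
      return std::nullopt;        // the real helper reports an llvm::Error instead
    if (P.Dealloc)
      DeallocActions.push_back(std::move(P.Dealloc));
  }
  Actions.clear();
  return DeallocActions;
}

int main() {
  std::vector<ToyActionPair> Actions;
  Actions.push_back({[] { std::cout << "register eh-frames\n"; return true; },
                     [] { std::cout << "deregister eh-frames\n"; }});
  auto Dealloc = runFinalize(Actions);
  if (!Dealloc)
    return 1;
  for (auto It = Dealloc->rbegin(); It != Dealloc->rend(); ++It)
    (*It)();                      // deallocation actions run in reverse order
  return 0;
}
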
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
index d588b63d9e88..62574604458c 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
@@ -134,7 +134,7 @@ Error MachOLinkGraphBuilder::createNormalizedSections() {
memcpy(&NSec.SegName, Sec64.segname, 16);
NSec.SegName[16] = '\0';
- NSec.Address = Sec64.addr;
+ NSec.Address = orc::ExecutorAddr(Sec64.addr);
NSec.Size = Sec64.size;
NSec.Alignment = 1ULL << Sec64.align;
NSec.Flags = Sec64.flags;
@@ -147,7 +147,7 @@ Error MachOLinkGraphBuilder::createNormalizedSections() {
memcpy(&NSec.SegName, Sec32.segname, 16);
NSec.SegName[16] = '\0';
- NSec.Address = Sec32.addr;
+ NSec.Address = orc::ExecutorAddr(Sec32.addr);
NSec.Size = Sec32.size;
NSec.Alignment = 1ULL << Sec32.align;
NSec.Flags = Sec32.flags;
@@ -287,7 +287,8 @@ Error MachOLinkGraphBuilder::createNormalizedSymbols() {
if (!NSec)
return NSec.takeError();
- if (Value < NSec->Address || Value > NSec->Address + NSec->Size)
+ if (orc::ExecutorAddr(Value) < NSec->Address ||
+ orc::ExecutorAddr(Value) > NSec->Address + NSec->Size)
return make_error<JITLinkError>("Address " + formatv("{0:x}", Value) +
" for symbol " + *Name +
" does not fall within section");
@@ -311,8 +312,9 @@ Error MachOLinkGraphBuilder::createNormalizedSymbols() {
}
void MachOLinkGraphBuilder::addSectionStartSymAndBlock(
- unsigned SecIndex, Section &GraphSec, uint64_t Address, const char *Data,
- uint64_t Size, uint32_t Alignment, bool IsLive) {
+ unsigned SecIndex, Section &GraphSec, orc::ExecutorAddr Address,
+ const char *Data, orc::ExecutorAddrDiff Size, uint32_t Alignment,
+ bool IsLive) {
Block &B =
Data ? G->createContentBlock(GraphSec, ArrayRef<char>(Data, Size),
Address, Alignment, 0)
@@ -346,7 +348,8 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() {
return make_error<JITLinkError>("Anonymous common symbol at index " +
Twine(KV.first));
NSym.GraphSymbol = &G->addCommonSymbol(
- *NSym.Name, NSym.S, getCommonSection(), 0, NSym.Value,
+ *NSym.Name, NSym.S, getCommonSection(), orc::ExecutorAddr(),
+ orc::ExecutorAddrDiff(NSym.Value),
1ull << MachO::GET_COMM_ALIGN(NSym.Desc),
NSym.Desc & MachO::N_NO_DEAD_STRIP);
} else {
@@ -364,8 +367,8 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() {
return make_error<JITLinkError>("Anonymous absolute symbol at index " +
Twine(KV.first));
NSym.GraphSymbol = &G->addAbsoluteSymbol(
- *NSym.Name, NSym.Value, 0, Linkage::Strong, Scope::Default,
- NSym.Desc & MachO::N_NO_DEAD_STRIP);
+ *NSym.Name, orc::ExecutorAddr(NSym.Value), 0, Linkage::Strong,
+ Scope::Default, NSym.Desc & MachO::N_NO_DEAD_STRIP);
break;
case MachO::N_SECT:
SecIndexToSymbols[NSym.Sect - 1].push_back(&NSym);
@@ -468,13 +471,13 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() {
// If the section is non-empty but there is no symbol covering the start
// address then add an anonymous one.
- if (SecNSymStack.back()->Value != NSec.Address) {
- auto AnonBlockSize = SecNSymStack.back()->Value - NSec.Address;
+ if (orc::ExecutorAddr(SecNSymStack.back()->Value) != NSec.Address) {
+ auto AnonBlockSize =
+ orc::ExecutorAddr(SecNSymStack.back()->Value) - NSec.Address;
LLVM_DEBUG({
dbgs() << " Section start not covered by symbol. "
- << "Creating anonymous block to cover [ "
- << formatv("{0:x16}", NSec.Address) << " -- "
- << formatv("{0:x16}", NSec.Address + AnonBlockSize) << " ]\n";
+ << "Creating anonymous block to cover [ " << NSec.Address
+ << " -- " << (NSec.Address + AnonBlockSize) << " ]\n";
});
addSectionStartSymAndBlock(SecIndex, *NSec.GraphSection, NSec.Address,
NSec.Data, AnonBlockSize, NSec.Alignment,
@@ -496,12 +499,12 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() {
}
// BlockNSyms now contains the block symbols in reverse canonical order.
- JITTargetAddress BlockStart = BlockSyms.front()->Value;
- JITTargetAddress BlockEnd = SecNSymStack.empty()
- ? NSec.Address + NSec.Size
- : SecNSymStack.back()->Value;
- JITTargetAddress BlockOffset = BlockStart - NSec.Address;
- JITTargetAddress BlockSize = BlockEnd - BlockStart;
+ auto BlockStart = orc::ExecutorAddr(BlockSyms.front()->Value);
+ orc::ExecutorAddr BlockEnd =
+ SecNSymStack.empty() ? NSec.Address + NSec.Size
+ : orc::ExecutorAddr(SecNSymStack.back()->Value);
+ orc::ExecutorAddrDiff BlockOffset = BlockStart - NSec.Address;
+ orc::ExecutorAddrDiff BlockSize = BlockEnd - BlockStart;
LLVM_DEBUG({
dbgs() << " Creating block for " << formatv("{0:x16}", BlockStart)
@@ -521,8 +524,8 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() {
BlockStart, NSec.Alignment,
BlockStart % NSec.Alignment);
- Optional<JITTargetAddress> LastCanonicalAddr;
- JITTargetAddress SymEnd = BlockEnd;
+ Optional<orc::ExecutorAddr> LastCanonicalAddr;
+ auto SymEnd = BlockEnd;
while (!BlockSyms.empty()) {
auto &NSym = *BlockSyms.back();
BlockSyms.pop_back();
@@ -530,9 +533,9 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() {
bool SymLive =
(NSym.Desc & MachO::N_NO_DEAD_STRIP) || SectionIsNoDeadStrip;
- auto &Sym = createStandardGraphSymbol(NSym, B, SymEnd - NSym.Value,
- SectionIsText, SymLive,
- LastCanonicalAddr != NSym.Value);
+ auto &Sym = createStandardGraphSymbol(
+ NSym, B, SymEnd - orc::ExecutorAddr(NSym.Value), SectionIsText,
+ SymLive, LastCanonicalAddr != orc::ExecutorAddr(NSym.Value));
if (LastCanonicalAddr != Sym.getAddress()) {
if (LastCanonicalAddr)
@@ -568,11 +571,12 @@ Symbol &MachOLinkGraphBuilder::createStandardGraphSymbol(NormalizedSymbol &NSym,
dbgs() << "\n";
});
- auto &Sym = NSym.Name ? G->addDefinedSymbol(B, NSym.Value - B.getAddress(),
- *NSym.Name, Size, NSym.L, NSym.S,
- IsText, IsNoDeadStrip)
- : G->addAnonymousSymbol(B, NSym.Value - B.getAddress(),
- Size, IsText, IsNoDeadStrip);
+ auto SymOffset = orc::ExecutorAddr(NSym.Value) - B.getAddress();
+ auto &Sym =
+ NSym.Name
+ ? G->addDefinedSymbol(B, SymOffset, *NSym.Name, Size, NSym.L, NSym.S,
+ IsText, IsNoDeadStrip)
+ : G->addAnonymousSymbol(B, SymOffset, Size, IsText, IsNoDeadStrip);
NSym.GraphSymbol = &Sym;
if (IsCanonical)
@@ -635,12 +639,12 @@ Error MachOLinkGraphBuilder::graphifyCStringSection(
bool SectionIsNoDeadStrip = NSec.Flags & MachO::S_ATTR_NO_DEAD_STRIP;
bool SectionIsText = NSec.Flags & MachO::S_ATTR_PURE_INSTRUCTIONS;
- JITTargetAddress BlockStart = 0;
+ orc::ExecutorAddrDiff BlockStart = 0;
// Scan section for null characters.
for (size_t I = 0; I != NSec.Size; ++I)
if (NSec.Data[I] == '\0') {
- JITTargetAddress BlockEnd = I + 1;
+ orc::ExecutorAddrDiff BlockEnd = I + 1;
size_t BlockSize = BlockEnd - BlockStart;
// Create a block for this null terminated string.
auto &B = G->createContentBlock(*NSec.GraphSection,
@@ -654,7 +658,8 @@ Error MachOLinkGraphBuilder::graphifyCStringSection(
});
// If there's no symbol at the start of this block then create one.
- if (NSyms.empty() || NSyms.back()->Value != B.getAddress()) {
+ if (NSyms.empty() ||
+ orc::ExecutorAddr(NSyms.back()->Value) != B.getAddress()) {
auto &S = G->addAnonymousSymbol(B, 0, BlockSize, false, false);
setCanonicalSymbol(NSec, S);
LLVM_DEBUG({
@@ -666,18 +671,19 @@ Error MachOLinkGraphBuilder::graphifyCStringSection(
}
// Process any remaining symbols that point into this block.
- JITTargetAddress LastCanonicalAddr = B.getAddress() + BlockEnd;
- while (!NSyms.empty() &&
- NSyms.back()->Value < (B.getAddress() + BlockSize)) {
+ auto LastCanonicalAddr = B.getAddress() + BlockEnd;
+ while (!NSyms.empty() && orc::ExecutorAddr(NSyms.back()->Value) <
+ B.getAddress() + BlockSize) {
auto &NSym = *NSyms.back();
- size_t SymSize = (B.getAddress() + BlockSize) - NSyms.back()->Value;
+ size_t SymSize = (B.getAddress() + BlockSize) -
+ orc::ExecutorAddr(NSyms.back()->Value);
bool SymLive =
(NSym.Desc & MachO::N_NO_DEAD_STRIP) || SectionIsNoDeadStrip;
bool IsCanonical = false;
- if (LastCanonicalAddr != NSym.Value) {
+ if (LastCanonicalAddr != orc::ExecutorAddr(NSym.Value)) {
IsCanonical = true;
- LastCanonicalAddr = NSym.Value;
+ LastCanonicalAddr = orc::ExecutorAddr(NSym.Value);
}
createStandardGraphSymbol(NSym, B, SymSize, SectionIsText, SymLive,
@@ -785,7 +791,7 @@ Error CompactUnwindSplitter::operator()(LinkGraph &G) {
E.getTarget().getName() + " is an external symbol");
auto &TgtBlock = E.getTarget().getBlock();
auto &CURecSym =
- G.addAnonymousSymbol(CURec, 0, CURecordSize, 0, false);
+ G.addAnonymousSymbol(CURec, 0, CURecordSize, false, false);
TgtBlock.addEdge(Edge::KeepAlive, 0, CURecSym, 0);
AddedKeepAlive = true;
} else if (E.getOffset() != PersonalityEdgeOffset &&
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h
index d29732ebdba8..2951a8533098 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h
@@ -71,13 +71,13 @@ protected:
public:
char SectName[17];
char SegName[17];
- uint64_t Address = 0;
+ orc::ExecutorAddr Address;
uint64_t Size = 0;
uint64_t Alignment = 0;
uint32_t Flags = 0;
const char *Data = nullptr;
Section *GraphSection = nullptr;
- std::map<JITTargetAddress, Symbol *> CanonicalSymbols;
+ std::map<orc::ExecutorAddr, Symbol *> CanonicalSymbols;
};
using SectionParserFunction = std::function<Error(NormalizedSection &S)>;
@@ -137,7 +137,7 @@ protected:
/// Returns the symbol with the highest address not greater than the search
/// address, or null if no such symbol exists.
Symbol *getSymbolByAddress(NormalizedSection &NSec,
- JITTargetAddress Address) {
+ orc::ExecutorAddr Address) {
auto I = NSec.CanonicalSymbols.upper_bound(Address);
if (I == NSec.CanonicalSymbols.begin())
return nullptr;
@@ -147,7 +147,7 @@ protected:
/// Returns the symbol with the highest address not greater than the search
/// address, or an error if no such symbol exists.
Expected<Symbol &> findSymbolByAddress(NormalizedSection &NSec,
- JITTargetAddress Address) {
+ orc::ExecutorAddr Address) {
auto *Sym = getSymbolByAddress(NSec, Address);
if (Sym)
if (Address <= Sym->getAddress() + Sym->getSize())
@@ -193,9 +193,9 @@ private:
Section &getCommonSection();
void addSectionStartSymAndBlock(unsigned SecIndex, Section &GraphSec,
- uint64_t Address, const char *Data,
- uint64_t Size, uint32_t Alignment,
- bool IsLive);
+ orc::ExecutorAddr Address, const char *Data,
+ orc::ExecutorAddrDiff Size,
+ uint32_t Alignment, bool IsLive);
Error createNormalizedSections();
Error createNormalizedSymbols();
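
getSymbolByAddress() above finds the symbol with the highest address not greater than the query by calling upper_bound on the CanonicalSymbols map and stepping back one entry. The same pattern on a plain std::map, with illustrative addresses and names:

#include <cassert>
#include <cstdint>
#include <iterator>
#include <map>
#include <string>

int main() {
  // Canonical symbols keyed by address, as in NormalizedSection::CanonicalSymbols.
  std::map<uint64_t, std::string> CanonicalSymbols{
      {0x1000, "_foo"}, {0x1040, "_bar"}, {0x1080, "_baz"}};

  // upper_bound gives the first entry strictly above the query, so step back one.
  auto lookup = [&](uint64_t Addr) -> const std::string * {
    auto I = CanonicalSymbols.upper_bound(Addr);
    if (I == CanonicalSymbols.begin())
      return nullptr;                  // query is below the first symbol
    return &std::prev(I)->second;
  };

  assert(*lookup(0x1050) == "_bar");   // falls inside _bar's range
  assert(*lookup(0x1000) == "_foo");   // exact start address
  assert(lookup(0x0fff) == nullptr);   // before the section's first symbol
  return 0;
}
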
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
index f2a029d35cd5..3ca2e40c7263 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
@@ -109,7 +109,7 @@ private:
Expected<PairRelocInfo>
parsePairRelocation(Block &BlockToFix, Edge::Kind SubtractorKind,
const MachO::relocation_info &SubRI,
- JITTargetAddress FixupAddress, const char *FixupContent,
+ orc::ExecutorAddr FixupAddress, const char *FixupContent,
object::relocation_iterator &UnsignedRelItr,
object::relocation_iterator &RelEnd) {
using namespace support;
@@ -162,7 +162,7 @@ private:
return ToSymbolSec.takeError();
ToSymbol = getSymbolByAddress(*ToSymbolSec, ToSymbolSec->Address);
assert(ToSymbol && "No symbol for section");
- FixupValue -= ToSymbol->getAddress();
+ FixupValue -= ToSymbol->getAddress().getValue();
}
MachOARM64RelocationKind DeltaKind;
@@ -195,7 +195,7 @@ private:
for (auto &S : Obj.sections()) {
- JITTargetAddress SectionAddress = S.getAddress();
+ orc::ExecutorAddr SectionAddress(S.getAddress());
// Skip relocations in virtual sections.
if (S.isVirtual()) {
@@ -234,7 +234,8 @@ private:
return Kind.takeError();
// Find the address of the value to fix up.
- JITTargetAddress FixupAddress = SectionAddress + (uint32_t)RI.r_address;
+ orc::ExecutorAddr FixupAddress =
+ SectionAddress + (uint32_t)RI.r_address;
LLVM_DEBUG({
dbgs() << " " << NSec->SectName << " + "
<< formatv("{0:x8}", RI.r_address) << ":\n";
@@ -249,7 +250,7 @@ private:
BlockToFix = &SymbolToFixOrErr->getBlock();
}
- if (FixupAddress + static_cast<JITTargetAddress>(1ULL << RI.r_length) >
+ if (FixupAddress + orc::ExecutorAddrDiff(1ULL << RI.r_length) >
BlockToFix->getAddress() + BlockToFix->getContent().size())
return make_error<JITLinkError>(
"Relocation content extends past end of fixup block");
@@ -290,7 +291,7 @@ private:
});
// Find the address of the value to fix up.
- JITTargetAddress PairedFixupAddress =
+ orc::ExecutorAddr PairedFixupAddress =
SectionAddress + (uint32_t)RI.r_address;
if (PairedFixupAddress != FixupAddress)
return make_error<JITLinkError>("Paired relocation points at "
@@ -324,7 +325,7 @@ private:
Addend = *(const ulittle64_t *)FixupContent;
break;
case Pointer64Anon: {
- JITTargetAddress TargetAddress = *(const ulittle64_t *)FixupContent;
+ orc::ExecutorAddr TargetAddress(*(const ulittle64_t *)FixupContent);
auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1);
if (!TargetNSec)
return TargetNSec.takeError();
@@ -435,7 +436,7 @@ public:
Symbol &createGOTEntry(Symbol &Target) {
auto &GOTEntryBlock = G.createContentBlock(
- getGOTSection(), getGOTEntryBlockContent(), 0, 8, 0);
+ getGOTSection(), getGOTEntryBlockContent(), orc::ExecutorAddr(), 8, 0);
GOTEntryBlock.addEdge(Pointer64, 0, Target, 0);
return G.addAnonymousSymbol(GOTEntryBlock, 0, 8, false, false);
}
@@ -457,8 +458,8 @@ public:
}
Symbol &createPLTStub(Symbol &Target) {
- auto &StubContentBlock =
- G.createContentBlock(getStubsSection(), getStubBlockContent(), 0, 1, 0);
+ auto &StubContentBlock = G.createContentBlock(
+ getStubsSection(), getStubBlockContent(), orc::ExecutorAddr(), 1, 0);
// Re-use GOT entries for stub targets.
auto &GOTEntrySymbol = getGOTEntry(Target);
StubContentBlock.addEdge(LDRLiteral19, 0, GOTEntrySymbol, 0);
@@ -474,7 +475,7 @@ public:
private:
Section &getGOTSection() {
if (!GOTSection)
- GOTSection = &G.createSection("$__GOT", MemProt::Read);
+ GOTSection = &G.createSection("$__GOT", MemProt::Read | MemProt::Exec);
return *GOTSection;
}
@@ -545,11 +546,12 @@ private:
char *BlockWorkingMem = B.getAlreadyMutableContent().data();
char *FixupPtr = BlockWorkingMem + E.getOffset();
- JITTargetAddress FixupAddress = B.getAddress() + E.getOffset();
+ orc::ExecutorAddr FixupAddress = B.getAddress() + E.getOffset();
switch (E.getKind()) {
case Branch26: {
- assert((FixupAddress & 0x3) == 0 && "Branch-inst is not 32-bit aligned");
+ assert((FixupAddress.getValue() & 0x3) == 0 &&
+ "Branch-inst is not 32-bit aligned");
int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend();
@@ -569,7 +571,7 @@ private:
break;
}
case Pointer32: {
- uint64_t Value = E.getTarget().getAddress() + E.getAddend();
+ uint64_t Value = E.getTarget().getAddress().getValue() + E.getAddend();
if (Value > std::numeric_limits<uint32_t>::max())
return makeTargetOutOfRangeError(G, B, E);
*(ulittle32_t *)FixupPtr = Value;
@@ -577,7 +579,7 @@ private:
}
case Pointer64:
case Pointer64Anon: {
- uint64_t Value = E.getTarget().getAddress() + E.getAddend();
+ uint64_t Value = E.getTarget().getAddress().getValue() + E.getAddend();
*(ulittle64_t *)FixupPtr = Value;
break;
}
@@ -587,9 +589,10 @@ private:
assert((E.getKind() != GOTPage21 || E.getAddend() == 0) &&
"GOTPAGE21 with non-zero addend");
uint64_t TargetPage =
- (E.getTarget().getAddress() + E.getAddend()) &
- ~static_cast<uint64_t>(4096 - 1);
- uint64_t PCPage = FixupAddress & ~static_cast<uint64_t>(4096 - 1);
+ (E.getTarget().getAddress().getValue() + E.getAddend()) &
+ ~static_cast<uint64_t>(4096 - 1);
+ uint64_t PCPage =
+ FixupAddress.getValue() & ~static_cast<uint64_t>(4096 - 1);
int64_t PageDelta = TargetPage - PCPage;
if (PageDelta < -(1 << 30) || PageDelta > ((1 << 30) - 1))
@@ -606,7 +609,7 @@ private:
}
case PageOffset12: {
uint64_t TargetOffset =
- (E.getTarget().getAddress() + E.getAddend()) & 0xfff;
+ (E.getTarget().getAddress() + E.getAddend()).getValue() & 0xfff;
uint32_t RawInstr = *(ulittle32_t *)FixupPtr;
unsigned ImmShift = getPageOffset12Shift(RawInstr);
@@ -627,7 +630,7 @@ private:
assert((RawInstr & 0xfffffc00) == 0xf9400000 &&
"RawInstr isn't a 64-bit LDR immediate");
- uint32_t TargetOffset = E.getTarget().getAddress() & 0xfff;
+ uint32_t TargetOffset = E.getTarget().getAddress().getValue() & 0xfff;
assert((TargetOffset & 0x7) == 0 && "GOT entry is not 8-byte aligned");
uint32_t EncodedImm = (TargetOffset >> 3) << 10;
uint32_t FixedInstr = RawInstr | EncodedImm;
@@ -635,7 +638,8 @@ private:
break;
}
case LDRLiteral19: {
- assert((FixupAddress & 0x3) == 0 && "LDR is not 32-bit aligned");
+ assert((FixupAddress.getValue() & 0x3) == 0 &&
+ "LDR is not 32-bit aligned");
assert(E.getAddend() == 0 && "LDRLiteral19 with non-zero addend");
uint32_t RawInstr = *(ulittle32_t *)FixupPtr;
assert(RawInstr == 0x58000010 && "RawInstr isn't a 64-bit LDR literal");
@@ -705,6 +709,13 @@ void link_MachO_arm64(std::unique_ptr<LinkGraph> G,
Config.PrePrunePasses.push_back(
CompactUnwindSplitter("__LD,__compact_unwind"));
+ // Add eh-frame passes.
+ // FIXME: Prune eh-frames for which compact-unwind is available once
+ // we support compact-unwind registration with libunwind.
+ Config.PrePrunePasses.push_back(EHFrameSplitter("__TEXT,__eh_frame"));
+ Config.PrePrunePasses.push_back(
+ EHFrameEdgeFixer("__TEXT,__eh_frame", 8, Delta64, Delta32, NegDelta32));
+
// Add an in-place GOT/Stubs pass.
Config.PostPrunePasses.push_back(
PerGraphGOTAndPLTStubsBuilder_MachO_arm64::asPass);
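
The Page21/GOTPage21 and PageOffset12 fixups in this file split an address into its 4KiB page and page offset, matching what an ADRP plus LDR/ADD-immediate pair encodes. A standalone sketch of that arithmetic with made-up addresses:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t FixupAddress = 0x0000000010001004; // address of the ADRP (illustrative)
  uint64_t Target = 0x0000000010014abc;       // address being materialized

  uint64_t TargetPage = Target & ~uint64_t(4096 - 1);
  uint64_t PCPage = FixupAddress & ~uint64_t(4096 - 1);

  int64_t PageDelta = TargetPage - PCPage;     // encoded in the ADRP immediate
  assert(PageDelta >= -(int64_t(1) << 30) && PageDelta < (int64_t(1) << 30) &&
         "out of ADRP range; the fixup would report makeTargetOutOfRangeError");

  uint64_t PageOffset = Target & 0xfff;        // encoded in the LDR/ADD immediate
  assert(TargetPage + PageOffset == Target);   // page base + offset reconstructs it
  return 0;
}
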
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
index a4fcd3b9a5f5..82afaa3aa3c5 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
@@ -119,7 +119,7 @@ private:
// returns the edge kind and addend to be used.
Expected<PairRelocInfo> parsePairRelocation(
Block &BlockToFix, MachONormalizedRelocationType SubtractorKind,
- const MachO::relocation_info &SubRI, JITTargetAddress FixupAddress,
+ const MachO::relocation_info &SubRI, orc::ExecutorAddr FixupAddress,
const char *FixupContent, object::relocation_iterator &UnsignedRelItr,
object::relocation_iterator &RelEnd) {
using namespace support;
@@ -172,7 +172,7 @@ private:
return ToSymbolSec.takeError();
ToSymbol = getSymbolByAddress(*ToSymbolSec, ToSymbolSec->Address);
assert(ToSymbol && "No symbol for section");
- FixupValue -= ToSymbol->getAddress();
+ FixupValue -= ToSymbol->getAddress().getValue();
}
Edge::Kind DeltaKind;
@@ -206,7 +206,7 @@ private:
for (auto &S : Obj.sections()) {
- JITTargetAddress SectionAddress = S.getAddress();
+ orc::ExecutorAddr SectionAddress(S.getAddress());
// Skip relocations in virtual sections.
if (S.isVirtual()) {
@@ -241,7 +241,7 @@ private:
MachO::relocation_info RI = getRelocationInfo(RelItr);
// Find the address of the value to fix up.
- JITTargetAddress FixupAddress = SectionAddress + (uint32_t)RI.r_address;
+ auto FixupAddress = SectionAddress + (uint32_t)RI.r_address;
LLVM_DEBUG({
dbgs() << " " << NSec->SectName << " + "
@@ -257,7 +257,7 @@ private:
BlockToFix = &SymbolToFixOrErr->getBlock();
}
- if (FixupAddress + static_cast<JITTargetAddress>(1ULL << RI.r_length) >
+ if (FixupAddress + orc::ExecutorAddrDiff(1ULL << RI.r_length) >
BlockToFix->getAddress() + BlockToFix->getContent().size())
return make_error<JITLinkError>(
"Relocation extends past end of fixup block");
@@ -343,7 +343,7 @@ private:
Kind = x86_64::Pointer64;
break;
case MachOPointer64Anon: {
- JITTargetAddress TargetAddress = *(const ulittle64_t *)FixupContent;
+ orc::ExecutorAddr TargetAddress(*(const ulittle64_t *)FixupContent);
auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1);
if (!TargetNSec)
return TargetNSec.takeError();
@@ -367,8 +367,8 @@ private:
Kind = x86_64::Delta32;
break;
case MachOPCRel32Anon: {
- JITTargetAddress TargetAddress =
- FixupAddress + 4 + *(const little32_t *)FixupContent;
+ orc::ExecutorAddr TargetAddress(FixupAddress + 4 +
+ *(const little32_t *)FixupContent);
auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1);
if (!TargetNSec)
return TargetNSec.takeError();
@@ -384,10 +384,10 @@ private:
case MachOPCRel32Minus1Anon:
case MachOPCRel32Minus2Anon:
case MachOPCRel32Minus4Anon: {
- JITTargetAddress Delta =
- 4 + static_cast<JITTargetAddress>(
+ orc::ExecutorAddrDiff Delta =
+ 4 + orc::ExecutorAddrDiff(
1ULL << (*MachORelocKind - MachOPCRel32Minus1Anon));
- JITTargetAddress TargetAddress =
+ orc::ExecutorAddr TargetAddress =
FixupAddress + Delta + *(const little32_t *)FixupContent;
auto TargetNSec = findSectionByIndex(RI.r_symbolnum - 1);
if (!TargetNSec)
diff --git a/llvm/lib/ExecutionEngine/JITLink/PerGraphGOTAndPLTStubsBuilder.h b/llvm/lib/ExecutionEngine/JITLink/PerGraphGOTAndPLTStubsBuilder.h
index 6e9df9c75a65..6e325f92bafb 100644
--- a/llvm/lib/ExecutionEngine/JITLink/PerGraphGOTAndPLTStubsBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/PerGraphGOTAndPLTStubsBuilder.h
@@ -47,16 +47,16 @@ public:
if (impl().isGOTEdgeToFix(E)) {
LLVM_DEBUG({
dbgs() << " Fixing " << G.getEdgeKindName(E.getKind())
- << " edge at " << formatv("{0:x}", B->getFixupAddress(E))
- << " (" << formatv("{0:x}", B->getAddress()) << " + "
+ << " edge at " << B->getFixupAddress(E) << " ("
+ << B->getAddress() << " + "
<< formatv("{0:x}", E.getOffset()) << ")\n";
});
impl().fixGOTEdge(E, getGOTEntry(E.getTarget()));
} else if (impl().isExternalBranchEdge(E)) {
LLVM_DEBUG({
dbgs() << " Fixing " << G.getEdgeKindName(E.getKind())
- << " edge at " << formatv("{0:x}", B->getFixupAddress(E))
- << " (" << formatv("{0:x}", B->getAddress()) << " + "
+ << " edge at " << B->getFixupAddress(E) << " ("
+ << B->getAddress() << " + "
<< formatv("{0:x}", E.getOffset()) << ")\n";
});
impl().fixPLTEdge(E, getPLTStub(E.getTarget()));
diff --git a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp
index 6b73ff95a3b0..3ce2cf10a24c 100644
--- a/llvm/lib/ExecutionEngine/JITLink/riscv.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/riscv.cpp
@@ -24,6 +24,8 @@ const char *getEdgeKindName(Edge::Kind K) {
return "R_RISCV_32";
case R_RISCV_64:
return "R_RISCV_64";
+ case R_RISCV_BRANCH:
+ return "R_RISCV_BRANCH";
case R_RISCV_HI20:
return "R_RISCV_HI20";
case R_RISCV_LO12_I:
@@ -36,6 +38,32 @@ const char *getEdgeKindName(Edge::Kind K) {
return "R_RISCV_PCREL_LO12_S";
case R_RISCV_CALL:
return "R_RISCV_CALL";
+ case R_RISCV_32_PCREL:
+ return "R_RISCV_32_PCREL";
+ case R_RISCV_ADD64:
+ return "R_RISCV_ADD64";
+ case R_RISCV_ADD32:
+ return "R_RISCV_ADD32";
+ case R_RISCV_ADD16:
+ return "R_RISCV_ADD16";
+ case R_RISCV_ADD8:
+ return "R_RISCV_ADD8";
+ case R_RISCV_SUB64:
+ return "R_RISCV_SUB64";
+ case R_RISCV_SUB32:
+ return "R_RISCV_SUB32";
+ case R_RISCV_SUB16:
+ return "R_RISCV_SUB16";
+ case R_RISCV_SUB8:
+ return "R_RISCV_SUB8";
+ case R_RISCV_SET6:
+ return "R_RISCV_SET6";
+ case R_RISCV_SET8:
+ return "R_RISCV_SET8";
+ case R_RISCV_SET16:
+ return "R_RISCV_SET16";
+ case R_RISCV_SET32:
+ return "R_RISCV_SET32";
}
return getGenericEdgeKindName(K);
}
diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp
index 48521280059d..df9979b47e88 100644
--- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp
@@ -95,10 +95,10 @@ Error optimizeGOTAndStubAccesses(LinkGraph &G) {
assert(GOTEntryBlock.edges_size() == 1 &&
"GOT entry should only have one outgoing edge");
auto &GOTTarget = GOTEntryBlock.edges().begin()->getTarget();
- JITTargetAddress TargetAddr = GOTTarget.getAddress();
- JITTargetAddress EdgeAddr = B->getFixupAddress(E);
+ orc::ExecutorAddr TargetAddr = GOTTarget.getAddress();
+ orc::ExecutorAddr EdgeAddr = B->getFixupAddress(E);
int64_t Displacement = TargetAddr - EdgeAddr + 4;
- bool TargetInRangeForImmU32 = isInRangeForImmU32(TargetAddr);
+ bool TargetInRangeForImmU32 = isInRangeForImmU32(TargetAddr.getValue());
bool DisplacementInRangeForImmS32 = isInRangeForImmS32(Displacement);
// If both the Target and the displacement are out of range, then
@@ -165,8 +165,8 @@ Error optimizeGOTAndStubAccesses(LinkGraph &G) {
"GOT block should only have one outgoing edge");
auto &GOTTarget = GOTBlock.edges().begin()->getTarget();
- JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset();
- JITTargetAddress TargetAddr = GOTTarget.getAddress();
+ orc::ExecutorAddr EdgeAddr = B->getAddress() + E.getOffset();
+ orc::ExecutorAddr TargetAddr = GOTTarget.getAddress();
int64_t Displacement = TargetAddr - EdgeAddr + 4;
if (isInRangeForImmS32(Displacement)) {
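
The GOT-access optimization keys off two range checks: can the target be encoded as an unsigned 32-bit absolute immediate, and does the PC-relative displacement fit a signed 32-bit immediate. The predicate bodies below are the obvious definitions assumed for the isInRangeForImm* helpers named above, not code copied from x86_64.h:

#include <cassert>
#include <cstdint>
#include <limits>

static bool isInRangeForImmS32(int64_t V) {
  return V >= std::numeric_limits<int32_t>::min() &&
         V <= std::numeric_limits<int32_t>::max();
}
static bool isInRangeForImmU32(uint64_t V) {
  return V <= std::numeric_limits<uint32_t>::max();
}

int main() {
  // A target below 4GiB can be rewritten as an absolute 32-bit immediate...
  assert(isInRangeForImmU32(0x7fff0000ull));
  // ...and a +/-2GiB displacement can be rewritten as a RIP-relative access.
  assert(isInRangeForImmS32(int64_t(1) << 30));
  assert(!isInRangeForImmS32(int64_t(1) << 32));
  return 0;
}
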
diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp
index aa82cf38c45d..e5cb8103919a 100644
--- a/llvm/lib/ExecutionEngine/Orc/Core.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp
@@ -1933,9 +1933,14 @@ Error ExecutionSession::removeJITDylib(JITDylib &JD) {
JDs.erase(I);
});
- // Clear the JITDylib.
+ // Clear the JITDylib. Hold on to any error while we clean up the
+ // JITDylib members below.
auto Err = JD.clear();
+ // Notify the platform of the teardown.
+ if (P)
+ Err = joinErrors(std::move(Err), P->teardownJITDylib(JD));
+
// Set JD to closed state. Clear remaining data structures.
runSessionLocked([&] {
assert(JD.State == JITDylib::Closing && "JD should be closing");
@@ -1953,19 +1958,22 @@ Error ExecutionSession::removeJITDylib(JITDylib &JD) {
return Err;
}
-std::vector<JITDylibSP> JITDylib::getDFSLinkOrder(ArrayRef<JITDylibSP> JDs) {
+Expected<std::vector<JITDylibSP>>
+JITDylib::getDFSLinkOrder(ArrayRef<JITDylibSP> JDs) {
if (JDs.empty())
- return {};
+ return std::vector<JITDylibSP>();
auto &ES = JDs.front()->getExecutionSession();
- return ES.runSessionLocked([&]() {
+ return ES.runSessionLocked([&]() -> Expected<std::vector<JITDylibSP>> {
DenseSet<JITDylib *> Visited;
std::vector<JITDylibSP> Result;
for (auto &JD : JDs) {
- assert(JD->State == Open && "JD is defunct");
-
+ if (JD->State != Open)
+ return make_error<StringError>(
+ "Error building link order: " + JD->getName() + " is defunct",
+ inconvertibleErrorCode());
if (Visited.count(JD.get()))
continue;
@@ -1990,18 +1998,19 @@ std::vector<JITDylibSP> JITDylib::getDFSLinkOrder(ArrayRef<JITDylibSP> JDs) {
});
}
-std::vector<JITDylibSP>
+Expected<std::vector<JITDylibSP>>
JITDylib::getReverseDFSLinkOrder(ArrayRef<JITDylibSP> JDs) {
- auto Tmp = getDFSLinkOrder(JDs);
- std::reverse(Tmp.begin(), Tmp.end());
- return Tmp;
+ auto Result = getDFSLinkOrder(JDs);
+ if (Result)
+ std::reverse(Result->begin(), Result->end());
+ return Result;
}
-std::vector<JITDylibSP> JITDylib::getDFSLinkOrder() {
+Expected<std::vector<JITDylibSP>> JITDylib::getDFSLinkOrder() {
return getDFSLinkOrder({this});
}
-std::vector<JITDylibSP> JITDylib::getReverseDFSLinkOrder() {
+Expected<std::vector<JITDylibSP>> JITDylib::getReverseDFSLinkOrder() {
return getReverseDFSLinkOrder({this});
}
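
With getDFSLinkOrder() now returning Expected<std::vector<JITDylibSP>>, callers must test the result and propagate the defunct-JITDylib error, as the platform code further down does. A small, generic sketch of that llvm::Expected consuming pattern (the getOrder/useOrder names are invented for the example):

#include "llvm/Support/Error.h"
#include <vector>

using namespace llvm;

// Stand-in for a getDFSLinkOrder-style API that can fail.
static Expected<std::vector<int>> getOrder(bool Defunct) {
  if (Defunct)
    return make_error<StringError>("Error building link order: JD is defunct",
                                   inconvertibleErrorCode());
  return std::vector<int>{1, 2, 3};
}

static Error useOrder(bool Defunct) {
  auto Order = getOrder(Defunct);
  if (!Order)
    return Order.takeError();       // propagate, mirroring SendResult(takeError())
  for (int V : *Order)
    (void)V;                        // use the value via operator* on success
  return Error::success();
}

int main() {
  if (Error Err = useOrder(false))
    return 1;
  // Errors must be consumed; here the failing case is simply dropped.
  consumeError(useOrder(true));
  return 0;
}
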
@@ -2201,7 +2210,7 @@ void ExecutionSession::dump(raw_ostream &OS) {
void ExecutionSession::dispatchOutstandingMUs() {
LLVM_DEBUG(dbgs() << "Dispatching MaterializationUnits...\n");
- while (1) {
+ while (true) {
Optional<std::pair<std::unique_ptr<MaterializationUnit>,
std::unique_ptr<MaterializationResponsibility>>>
JMU;
diff --git a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp
index fcfe389f82a8..4ff6b7fd54df 100644
--- a/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/DebugObjectManagerPlugin.cpp
@@ -67,9 +67,9 @@ private:
template <typename ELFT>
void ELFDebugObjectSection<ELFT>::setTargetMemoryRange(SectionRange Range) {
// Only patch load-addresses for executable and data sections.
- if (isTextOrDataSection()) {
- Header->sh_addr = static_cast<typename ELFT::uint>(Range.getStart());
- }
+ if (isTextOrDataSection())
+ Header->sh_addr =
+ static_cast<typename ELFT::uint>(Range.getStart().getValue());
}
template <typename ELFT>
diff --git a/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp
index fe62138c790c..6916ee4a827f 100644
--- a/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/DebuggerSupportPlugin.cpp
@@ -129,8 +129,8 @@ public:
Section *Sec = nullptr;
StringRef SegName;
StringRef SecName;
- JITTargetAddress Alignment = 0;
- JITTargetAddress StartAddr = 0;
+ uint64_t Alignment = 0;
+ orc::ExecutorAddr StartAddr;
uint64_t Size = 0;
};
@@ -153,7 +153,8 @@ public:
return Error::success();
}
DebugSecInfos.push_back({&Sec, Sec.getName().substr(0, SepPos),
- Sec.getName().substr(SepPos + 1), 0, 0});
+ Sec.getName().substr(SepPos + 1), 0,
+ orc::ExecutorAddr(), 0});
} else {
NonDebugSections.push_back(&Sec);
@@ -182,11 +183,11 @@ public:
size_t ContainerBlockSize =
sizeof(typename MachOTraits::Header) + SegmentLCSize;
auto ContainerBlockContent = G.allocateBuffer(ContainerBlockSize);
- MachOContainerBlock =
- &G.createMutableContentBlock(SDOSec, ContainerBlockContent, 0, 8, 0);
+ MachOContainerBlock = &G.createMutableContentBlock(
+ SDOSec, ContainerBlockContent, orc::ExecutorAddr(), 8, 0);
// Copy debug section blocks and symbols.
- JITTargetAddress NextBlockAddr = MachOContainerBlock->getSize();
+ orc::ExecutorAddr NextBlockAddr(MachOContainerBlock->getSize());
for (auto &SI : DebugSecInfos) {
assert(!llvm::empty(SI.Sec->blocks()) && "Empty debug info section?");
@@ -219,7 +220,8 @@ public:
G.mergeSections(SDOSec, *SI.Sec);
SI.Sec = nullptr;
}
- size_t DebugSectionsSize = NextBlockAddr - MachOContainerBlock->getSize();
+ size_t DebugSectionsSize =
+ NextBlockAddr - orc::ExecutorAddr(MachOContainerBlock->getSize());
// Write MachO header and debug section load commands.
MachOStructWriter Writer(MachOContainerBlock->getAlreadyMutableContent());
@@ -266,9 +268,9 @@ public:
memset(&Sec, 0, sizeof(Sec));
memcpy(Sec.sectname, SI.SecName.data(), SI.SecName.size());
memcpy(Sec.segname, SI.SegName.data(), SI.SegName.size());
- Sec.addr = SI.StartAddr;
+ Sec.addr = SI.StartAddr.getValue();
Sec.size = SI.Size;
- Sec.offset = SI.StartAddr;
+ Sec.offset = SI.StartAddr.getValue();
Sec.align = SI.Alignment;
Sec.reloff = 0;
Sec.nreloc = 0;
@@ -336,7 +338,7 @@ public:
memset(&SecCmd, 0, sizeof(SecCmd));
memcpy(SecCmd.sectname, SecName.data(), SecName.size());
memcpy(SecCmd.segname, SegName.data(), SegName.size());
- SecCmd.addr = R.getStart();
+ SecCmd.addr = R.getStart().getValue();
SecCmd.size = R.getSize();
SecCmd.offset = 0;
SecCmd.align = R.getFirstBlock()->getAlignment();
@@ -347,8 +349,10 @@ public:
}
SectionRange R(MachOContainerBlock->getSection());
- G.allocActions().push_back(
- {{RegisterActionAddr.getValue(), R.getStart(), R.getSize()}, {}});
+ G.allocActions().push_back({cantFail(shared::WrapperFunctionCall::Create<
+ SPSArgList<SPSExecutorAddrRange>>(
+ RegisterActionAddr, R.getRange())),
+ {}});
return Error::success();
}
diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
index eded54f4bfb3..d02760703f06 100644
--- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp
@@ -58,7 +58,8 @@ public:
auto &DSOHandleSection =
G->createSection(".data.__dso_handle", jitlink::MemProt::Read);
auto &DSOHandleBlock = G->createContentBlock(
- DSOHandleSection, getDSOHandleContent(PointerSize), 0, 8, 0);
+ DSOHandleSection, getDSOHandleContent(PointerSize), orc::ExecutorAddr(),
+ 8, 0);
auto &DSOHandleSymbol = G->addDefinedSymbol(
DSOHandleBlock, 0, *R->getInitializerSymbol(), DSOHandleBlock.getSize(),
jitlink::Linkage::Strong, jitlink::Scope::Default, false, true);
@@ -154,6 +155,10 @@ Error ELFNixPlatform::setupJITDylib(JITDylib &JD) {
std::make_unique<DSOHandleMaterializationUnit>(*this, DSOHandleSymbol));
}
+Error ELFNixPlatform::teardownJITDylib(JITDylib &JD) {
+ return Error::success();
+}
+
Error ELFNixPlatform::notifyAdding(ResourceTracker &RT,
const MaterializationUnit &MU) {
auto &JD = RT.getJITDylib();
@@ -315,9 +320,14 @@ void ELFNixPlatform::getInitializersLookupPhase(
SendInitializerSequenceFn SendResult, JITDylib &JD) {
auto DFSLinkOrder = JD.getDFSLinkOrder();
+ if (!DFSLinkOrder) {
+ SendResult(DFSLinkOrder.takeError());
+ return;
+ }
+
DenseMap<JITDylib *, SymbolLookupSet> NewInitSymbols;
ES.runSessionLocked([&]() {
- for (auto &InitJD : DFSLinkOrder) {
+ for (auto &InitJD : *DFSLinkOrder) {
auto RISItr = RegisteredInitSymbols.find(InitJD.get());
if (RISItr != RegisteredInitSymbols.end()) {
NewInitSymbols[InitJD.get()] = std::move(RISItr->second);
@@ -330,7 +340,7 @@ void ELFNixPlatform::getInitializersLookupPhase(
// phase.
if (NewInitSymbols.empty()) {
getInitializersBuildSequencePhase(std::move(SendResult), JD,
- std::move(DFSLinkOrder));
+ std::move(*DFSLinkOrder));
return;
}
@@ -375,7 +385,7 @@ void ELFNixPlatform::rt_getDeinitializers(
{
std::lock_guard<std::mutex> Lock(PlatformMutex);
- auto I = HandleAddrToJITDylib.find(Handle.getValue());
+ auto I = HandleAddrToJITDylib.find(Handle);
if (I != HandleAddrToJITDylib.end())
JD = I->second;
}
@@ -406,7 +416,7 @@ void ELFNixPlatform::rt_lookupSymbol(SendSymbolAddressFn SendResult,
{
std::lock_guard<std::mutex> Lock(PlatformMutex);
- auto I = HandleAddrToJITDylib.find(Handle.getValue());
+ auto I = HandleAddrToJITDylib.find(Handle);
if (I != HandleAddrToJITDylib.end())
JD = I->second;
}
@@ -630,12 +640,11 @@ void ELFNixPlatform::ELFNixPlatformPlugin::addDSOHandleSupportPasses(
assert(I != G.defined_symbols().end() && "Missing DSO handle symbol");
{
std::lock_guard<std::mutex> Lock(MP.PlatformMutex);
- JITTargetAddress HandleAddr = (*I)->getAddress();
+ auto HandleAddr = (*I)->getAddress();
MP.HandleAddrToJITDylib[HandleAddr] = &JD;
assert(!MP.InitSeqs.count(&JD) && "InitSeq entry for JD already exists");
MP.InitSeqs.insert(std::make_pair(
- &JD,
- ELFNixJITDylibInitializers(JD.getName(), ExecutorAddr(HandleAddr))));
+ &JD, ELFNixJITDylibInitializers(JD.getName(), HandleAddr)));
}
return Error::success();
});
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp b/llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp
index 4c0fab8aa9fa..256ce94690f0 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCEHFrameRegistrar.cpp
@@ -56,18 +56,15 @@ EPCEHFrameRegistrar::Create(ExecutionSession &ES) {
ExecutorAddr(DeregisterEHFrameWrapperFnAddr));
}
-Error EPCEHFrameRegistrar::registerEHFrames(JITTargetAddress EHFrameSectionAddr,
- size_t EHFrameSectionSize) {
- return ES.callSPSWrapper<void(SPSExecutorAddr, uint64_t)>(
- RegisterEHFrameWrapperFnAddr, ExecutorAddr(EHFrameSectionAddr),
- static_cast<uint64_t>(EHFrameSectionSize));
+Error EPCEHFrameRegistrar::registerEHFrames(ExecutorAddrRange EHFrameSection) {
+ return ES.callSPSWrapper<void(SPSExecutorAddrRange)>(
+ RegisterEHFrameWrapperFnAddr, EHFrameSection);
}
Error EPCEHFrameRegistrar::deregisterEHFrames(
- JITTargetAddress EHFrameSectionAddr, size_t EHFrameSectionSize) {
- return ES.callSPSWrapper<void(SPSExecutorAddr, uint64_t)>(
- DeregisterEHFrameWrapperFnAddr, ExecutorAddr(EHFrameSectionAddr),
- static_cast<uint64_t>(EHFrameSectionSize));
+ ExecutorAddrRange EHFrameSection) {
+ return ES.callSPSWrapper<void(SPSExecutorAddrRange)>(
+ DeregisterEHFrameWrapperFnAddr, EHFrameSection);
}
} // end namespace orc
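
registerEHFrames/deregisterEHFrames now take a single ExecutorAddrRange rather than an (address, size) pair, which is also what SPSExecutorAddrRange serializes. A simplified stand-in for such a range type; member names here are illustrative and may not match the real class:

#include <cassert>
#include <cstdint>

// Simplified stand-in for an address-range type pairing two addresses.
struct ToyAddrRange {
  uint64_t Start = 0;
  uint64_t End = 0;

  uint64_t size() const { return End - Start; }
  bool contains(uint64_t Addr) const { return Start <= Addr && Addr < End; }
};

int main() {
  // An eh-frame section previously passed as (addr, size) becomes one range.
  ToyAddrRange EHFrameSection{0x20000, 0x20000 + 0x1a0};
  assert(EHFrameSection.size() == 0x1a0);
  assert(EHFrameSection.contains(0x20010));
  assert(!EHFrameSection.contains(0x201a0)); // End is exclusive
  return 0;
}
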
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
index 9b712cb8f7ca..75cc30753f41 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericJITLinkMemoryManager.cpp
@@ -56,17 +56,7 @@ public:
}
// Transfer allocation actions.
- // FIXME: Merge JITLink and ORC SupportFunctionCall and Action list types,
- // turn this into a std::swap.
- FR.Actions.reserve(G.allocActions().size());
- for (auto &ActPair : G.allocActions())
- FR.Actions.push_back({{ExecutorAddr(ActPair.Finalize.FnAddr),
- {ExecutorAddr(ActPair.Finalize.CtxAddr),
- ExecutorAddrDiff(ActPair.Finalize.CtxSize)}},
- {ExecutorAddr(ActPair.Dealloc.FnAddr),
- {ExecutorAddr(ActPair.Dealloc.CtxAddr),
- ExecutorAddrDiff(ActPair.Dealloc.CtxSize)}}});
- G.allocActions().clear();
+ std::swap(FR.Actions, G.allocActions());
Parent.EPC.callSPSWrapperAsync<
rt::SPSSimpleExecutorMemoryManagerFinalizeSignature>(
@@ -80,7 +70,7 @@ public:
} else if (FinalizeErr)
OnFinalize(std::move(FinalizeErr));
else
- OnFinalize(FinalizedAlloc(AllocAddr.getValue()));
+ OnFinalize(FinalizedAlloc(AllocAddr));
},
Parent.SAs.Allocator, std::move(FR));
}
@@ -161,7 +151,7 @@ void EPCGenericJITLinkMemoryManager::completeAllocation(
const auto &AG = KV.first;
auto &Seg = KV.second;
- Seg.Addr = NextSegAddr.getValue();
+ Seg.Addr = NextSegAddr;
KV.second.WorkingMem = BL.getGraph().allocateBuffer(Seg.ContentSize).data();
NextSegAddr += ExecutorAddrDiff(
alignTo(Seg.ContentSize + Seg.ZeroFillSize, EPC.getPageSize()));
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
index 1d98e104a4d7..cdac367e11a3 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCGenericRTDyldMemoryManager.cpp
@@ -14,6 +14,8 @@
#define DEBUG_TYPE "orc"
+using namespace llvm::orc::shared;
+
namespace llvm {
namespace orc {
@@ -27,10 +29,8 @@ EPCGenericRTDyldMemoryManager::CreateWithDefaultBootstrapSymbols(
{SAs.Finalize, rt::SimpleExecutorMemoryManagerFinalizeWrapperName},
{SAs.Deallocate,
rt::SimpleExecutorMemoryManagerDeallocateWrapperName},
- {SAs.RegisterEHFrame,
- rt::RegisterEHFrameSectionCustomDirectWrapperName},
- {SAs.DeregisterEHFrame,
- rt::DeregisterEHFrameSectionCustomDirectWrapperName}}))
+ {SAs.RegisterEHFrame, rt::RegisterEHFrameSectionWrapperName},
+ {SAs.DeregisterEHFrame, rt::DeregisterEHFrameSectionWrapperName}}))
return std::move(Err);
return std::make_unique<EPCGenericRTDyldMemoryManager>(EPC, std::move(SAs));
}
@@ -263,10 +263,12 @@ bool EPCGenericRTDyldMemoryManager::finalizeMemory(std::string *ErrMsg) {
for (auto &Frame : ObjAllocs.UnfinalizedEHFrames)
FR.Actions.push_back(
- {{SAs.RegisterEHFrame,
- {ExecutorAddr(Frame.Addr), ExecutorAddrDiff(Frame.Size)}},
- {SAs.DeregisterEHFrame,
- {ExecutorAddr(Frame.Addr), ExecutorAddrDiff(Frame.Size)}}});
+ {cantFail(
+ WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>(
+ SAs.RegisterEHFrame, Frame)),
+ cantFail(
+ WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>(
+ SAs.DeregisterEHFrame, Frame))});
// We'll also need to make an extra allocation for the eh-frame wrapper call
// arguments.
diff --git a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp
index 818b6b52ff83..b901a2d2da23 100644
--- a/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/EPCIndirectionUtils.cpp
@@ -119,10 +119,12 @@ Error EPCTrampolinePool::grow() {
unsigned NumTrampolines = TrampolinesPerPage;
auto SegInfo = Alloc->getSegInfo(MemProt::Read | MemProt::Exec);
- EPCIU.getABISupport().writeTrampolines(
- SegInfo.WorkingMem.data(), SegInfo.Addr, ResolverAddress, NumTrampolines);
+ EPCIU.getABISupport().writeTrampolines(SegInfo.WorkingMem.data(),
+ SegInfo.Addr.getValue(),
+ ResolverAddress, NumTrampolines);
for (unsigned I = 0; I < NumTrampolines; ++I)
- AvailableTrampolines.push_back(SegInfo.Addr + (I * TrampolineSize));
+ AvailableTrampolines.push_back(SegInfo.Addr.getValue() +
+ (I * TrampolineSize));
auto FA = Alloc->finalize();
if (!FA)
@@ -300,15 +302,15 @@ EPCIndirectionUtils::writeResolverBlock(JITTargetAddress ReentryFnAddr,
return Alloc.takeError();
auto SegInfo = Alloc->getSegInfo(MemProt::Read | MemProt::Exec);
- ABI->writeResolverCode(SegInfo.WorkingMem.data(), SegInfo.Addr, ReentryFnAddr,
- ReentryCtxAddr);
+ ABI->writeResolverCode(SegInfo.WorkingMem.data(), SegInfo.Addr.getValue(),
+ ReentryFnAddr, ReentryCtxAddr);
auto FA = Alloc->finalize();
if (!FA)
return FA.takeError();
ResolverBlock = std::move(*FA);
- return SegInfo.Addr;
+ return SegInfo.Addr.getValue();
}
std::unique_ptr<IndirectStubsManager>
@@ -369,8 +371,9 @@ EPCIndirectionUtils::getIndirectStubs(unsigned NumStubs) {
auto StubSeg = Alloc->getSegInfo(StubProt);
auto PtrSeg = Alloc->getSegInfo(PtrProt);
- ABI->writeIndirectStubsBlock(StubSeg.WorkingMem.data(), StubSeg.Addr,
- PtrSeg.Addr, NumStubsToAllocate);
+ ABI->writeIndirectStubsBlock(StubSeg.WorkingMem.data(),
+ StubSeg.Addr.getValue(),
+ PtrSeg.Addr.getValue(), NumStubsToAllocate);
auto FA = Alloc->finalize();
if (!FA)
@@ -381,8 +384,8 @@ EPCIndirectionUtils::getIndirectStubs(unsigned NumStubs) {
auto StubExecutorAddr = StubSeg.Addr;
auto PtrExecutorAddr = PtrSeg.Addr;
for (unsigned I = 0; I != NumStubsToAllocate; ++I) {
- AvailableIndirectStubs.push_back(
- IndirectStubInfo(StubExecutorAddr, PtrExecutorAddr));
+ AvailableIndirectStubs.push_back(IndirectStubInfo(
+ StubExecutorAddr.getValue(), PtrExecutorAddr.getValue()));
StubExecutorAddr += ABI->getStubSize();
PtrExecutorAddr += ABI->getPointerSize();
}
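
Note: the .getValue() calls added in this file bridge the typed segment address back to ABI helpers that still take a raw integer address. A hedged sketch of that wrapper pattern, with a simplified stand-in for ExecutorAddr:

#include <cstdint>

// Simplified typed-address wrapper; the real ExecutorAddr also provides
// toPtr/fromPtr conversions and alignment helpers.
class TypedAddr {
  uint64_t Addr = 0;
public:
  TypedAddr() = default;
  explicit TypedAddr(uint64_t A) : Addr(A) {}
  uint64_t getValue() const { return Addr; } // escape hatch for raw-integer APIs
  TypedAddr &operator+=(uint64_t D) { Addr += D; return *this; }
};

// Legacy-style interface that still expects a raw integer target address.
void writeTrampolinesRaw(char *WorkingMem, uint64_t TargetAddr, unsigned N) {
  (void)WorkingMem; (void)TargetAddr; (void)N; // sketch only
}

void growSketch(char *Mem, TypedAddr SegAddr, unsigned NumTrampolines) {
  // Each explicit getValue() marks a point where type safety is dropped.
  writeTrampolinesRaw(Mem, SegAddr.getValue(), NumTrampolines);
}
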
diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index f427271bb45d..7a71d2f781d7 100644
--- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -410,7 +410,7 @@ Error addFunctionPointerRelocationsToCurrentSymbol(jitlink::Symbol &Sym,
while (I < Content.size()) {
MCInst Instr;
uint64_t InstrSize = 0;
- uint64_t InstrStart = SymAddress + I;
+ uint64_t InstrStart = SymAddress.getValue() + I;
auto DecodeStatus = Disassembler.getInstruction(
Instr, InstrSize, Content.drop_front(I), InstrStart, CommentStream);
if (DecodeStatus != MCDisassembler::Success) {
@@ -426,7 +426,7 @@ Error addFunctionPointerRelocationsToCurrentSymbol(jitlink::Symbol &Sym,
// Check for a PC-relative address equal to the symbol itself.
auto PCRelAddr =
MIA.evaluateMemoryOperandAddress(Instr, &STI, InstrStart, InstrSize);
- if (!PCRelAddr.hasValue() || PCRelAddr.getValue() != SymAddress)
+ if (!PCRelAddr || *PCRelAddr != SymAddress.getValue())
continue;
auto RelocOffInInstr =
@@ -438,8 +438,8 @@ Error addFunctionPointerRelocationsToCurrentSymbol(jitlink::Symbol &Sym,
continue;
}
- auto RelocOffInBlock =
- InstrStart + *RelocOffInInstr - SymAddress + Sym.getOffset();
+ auto RelocOffInBlock = orc::ExecutorAddr(InstrStart) + *RelocOffInInstr -
+ SymAddress + Sym.getOffset();
if (ExistingRelocations.contains(RelocOffInBlock))
continue;
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index 0ab0d7d2e2b6..91949c9d7eeb 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -89,6 +89,7 @@ class GenericLLVMIRPlatform : public Platform {
public:
GenericLLVMIRPlatform(GenericLLVMIRPlatformSupport &S) : S(S) {}
Error setupJITDylib(JITDylib &JD) override;
+ Error teardownJITDylib(JITDylib &JD) override;
Error notifyAdding(ResourceTracker &RT,
const MaterializationUnit &MU) override;
Error notifyRemoving(ResourceTracker &RT) override {
@@ -276,17 +277,22 @@ private:
DenseMap<JITDylib *, SymbolLookupSet> LookupSymbols;
std::vector<JITDylibSP> DFSLinkOrder;
- getExecutionSession().runSessionLocked([&]() {
- DFSLinkOrder = JD.getDFSLinkOrder();
-
- for (auto &NextJD : DFSLinkOrder) {
- auto IFItr = InitFunctions.find(NextJD.get());
- if (IFItr != InitFunctions.end()) {
- LookupSymbols[NextJD.get()] = std::move(IFItr->second);
- InitFunctions.erase(IFItr);
- }
- }
- });
+ if (auto Err = getExecutionSession().runSessionLocked([&]() -> Error {
+ if (auto DFSLinkOrderOrErr = JD.getDFSLinkOrder())
+ DFSLinkOrder = std::move(*DFSLinkOrderOrErr);
+ else
+ return DFSLinkOrderOrErr.takeError();
+
+ for (auto &NextJD : DFSLinkOrder) {
+ auto IFItr = InitFunctions.find(NextJD.get());
+ if (IFItr != InitFunctions.end()) {
+ LookupSymbols[NextJD.get()] = std::move(IFItr->second);
+ InitFunctions.erase(IFItr);
+ }
+ }
+ return Error::success();
+ }))
+ return std::move(Err);
LLVM_DEBUG({
dbgs() << "JITDylib init order is [ ";
@@ -326,20 +332,25 @@ private:
DenseMap<JITDylib *, SymbolLookupSet> LookupSymbols;
std::vector<JITDylibSP> DFSLinkOrder;
- ES.runSessionLocked([&]() {
- DFSLinkOrder = JD.getDFSLinkOrder();
-
- for (auto &NextJD : DFSLinkOrder) {
- auto &JDLookupSymbols = LookupSymbols[NextJD.get()];
- auto DIFItr = DeInitFunctions.find(NextJD.get());
- if (DIFItr != DeInitFunctions.end()) {
- LookupSymbols[NextJD.get()] = std::move(DIFItr->second);
- DeInitFunctions.erase(DIFItr);
- }
- JDLookupSymbols.add(LLJITRunAtExits,
- SymbolLookupFlags::WeaklyReferencedSymbol);
- }
- });
+ if (auto Err = ES.runSessionLocked([&]() -> Error {
+ if (auto DFSLinkOrderOrErr = JD.getDFSLinkOrder())
+ DFSLinkOrder = std::move(*DFSLinkOrderOrErr);
+ else
+ return DFSLinkOrderOrErr.takeError();
+
+ for (auto &NextJD : DFSLinkOrder) {
+ auto &JDLookupSymbols = LookupSymbols[NextJD.get()];
+ auto DIFItr = DeInitFunctions.find(NextJD.get());
+ if (DIFItr != DeInitFunctions.end()) {
+ LookupSymbols[NextJD.get()] = std::move(DIFItr->second);
+ DeInitFunctions.erase(DIFItr);
+ }
+ JDLookupSymbols.add(LLJITRunAtExits,
+ SymbolLookupFlags::WeaklyReferencedSymbol);
+ }
+ return Error::success();
+ }))
+ return std::move(Err);
LLVM_DEBUG({
dbgs() << "JITDylib deinit order is [ ";
@@ -380,17 +391,22 @@ private:
DenseMap<JITDylib *, SymbolLookupSet> RequiredInitSymbols;
std::vector<JITDylibSP> DFSLinkOrder;
- getExecutionSession().runSessionLocked([&]() {
- DFSLinkOrder = JD.getDFSLinkOrder();
-
- for (auto &NextJD : DFSLinkOrder) {
- auto ISItr = InitSymbols.find(NextJD.get());
- if (ISItr != InitSymbols.end()) {
- RequiredInitSymbols[NextJD.get()] = std::move(ISItr->second);
- InitSymbols.erase(ISItr);
- }
- }
- });
+ if (auto Err = getExecutionSession().runSessionLocked([&]() -> Error {
+ if (auto DFSLinkOrderOrErr = JD.getDFSLinkOrder())
+ DFSLinkOrder = std::move(*DFSLinkOrderOrErr);
+ else
+ return DFSLinkOrderOrErr.takeError();
+
+ for (auto &NextJD : DFSLinkOrder) {
+ auto ISItr = InitSymbols.find(NextJD.get());
+ if (ISItr != InitSymbols.end()) {
+ RequiredInitSymbols[NextJD.get()] = std::move(ISItr->second);
+ InitSymbols.erase(ISItr);
+ }
+ }
+ return Error::success();
+ }))
+ return Err;
return Platform::lookupInitSymbols(getExecutionSession(),
RequiredInitSymbols)
@@ -460,6 +476,10 @@ Error GenericLLVMIRPlatform::setupJITDylib(JITDylib &JD) {
return S.setupJITDylib(JD);
}
+Error GenericLLVMIRPlatform::teardownJITDylib(JITDylib &JD) {
+ return Error::success();
+}
+
Error GenericLLVMIRPlatform::notifyAdding(ResourceTracker &RT,
const MaterializationUnit &MU) {
return S.notifyAdding(RT, MU);
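
Note: because getDFSLinkOrder can now fail, the locked lambdas above return an Error that the caller forwards instead of asserting. A small sketch of the same Expected/Error propagation pattern, compiled against LLVM's Support library and using a hypothetical computeLinkOrder in place of JD.getDFSLinkOrder:

#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
#include <vector>

using namespace llvm;

// Hypothetical producer that can fail, standing in for JD.getDFSLinkOrder().
static Expected<std::vector<int>> computeLinkOrder(bool Fail) {
  if (Fail)
    return createStringError(inconvertibleErrorCode(), "session shut down");
  return std::vector<int>{1, 2, 3};
}

static Error collectInitSymbols(bool Fail) {
  std::vector<int> Order;
  if (auto OrderOrErr = computeLinkOrder(Fail))
    Order = std::move(*OrderOrErr);
  else
    return OrderOrErr.takeError(); // propagate instead of asserting
  // ... walk Order and gather per-dylib init symbols ...
  return Error::success();
}

int main() {
  if (Error E = collectInitSymbols(/*Fail=*/true))
    logAllUnhandledErrors(std::move(E), errs(), "init: ");
  return 0;
}
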
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
index fb2e90e1c9c5..a364719855b4 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -106,7 +106,8 @@ private:
auto HeaderContent = G.allocateString(
StringRef(reinterpret_cast<const char *>(&Hdr), sizeof(Hdr)));
- return G.createContentBlock(HeaderSection, HeaderContent, 0, 8, 0);
+ return G.createContentBlock(HeaderSection, HeaderContent, ExecutorAddr(), 8,
+ 0);
}
static MaterializationUnit::Interface
@@ -202,6 +203,8 @@ Error MachOPlatform::setupJITDylib(JITDylib &JD) {
*this, MachOHeaderStartSymbol));
}
+Error MachOPlatform::teardownJITDylib(JITDylib &JD) { return Error::success(); }
+
Error MachOPlatform::notifyAdding(ResourceTracker &RT,
const MaterializationUnit &MU) {
auto &JD = RT.getJITDylib();
@@ -379,9 +382,14 @@ void MachOPlatform::getInitializersLookupPhase(
SendInitializerSequenceFn SendResult, JITDylib &JD) {
auto DFSLinkOrder = JD.getDFSLinkOrder();
+ if (!DFSLinkOrder) {
+ SendResult(DFSLinkOrder.takeError());
+ return;
+ }
+
DenseMap<JITDylib *, SymbolLookupSet> NewInitSymbols;
ES.runSessionLocked([&]() {
- for (auto &InitJD : DFSLinkOrder) {
+ for (auto &InitJD : *DFSLinkOrder) {
auto RISItr = RegisteredInitSymbols.find(InitJD.get());
if (RISItr != RegisteredInitSymbols.end()) {
NewInitSymbols[InitJD.get()] = std::move(RISItr->second);
@@ -394,7 +402,7 @@ void MachOPlatform::getInitializersLookupPhase(
// phase.
if (NewInitSymbols.empty()) {
getInitializersBuildSequencePhase(std::move(SendResult), JD,
- std::move(DFSLinkOrder));
+ std::move(*DFSLinkOrder));
return;
}
@@ -439,7 +447,7 @@ void MachOPlatform::rt_getDeinitializers(SendDeinitializerSequenceFn SendResult,
{
std::lock_guard<std::mutex> Lock(PlatformMutex);
- auto I = HeaderAddrToJITDylib.find(Handle.getValue());
+ auto I = HeaderAddrToJITDylib.find(Handle);
if (I != HeaderAddrToJITDylib.end())
JD = I->second;
}
@@ -469,7 +477,7 @@ void MachOPlatform::rt_lookupSymbol(SendSymbolAddressFn SendResult,
{
std::lock_guard<std::mutex> Lock(PlatformMutex);
- auto I = HeaderAddrToJITDylib.find(Handle.getValue());
+ auto I = HeaderAddrToJITDylib.find(Handle);
if (I != HeaderAddrToJITDylib.end())
JD = I->second;
}
@@ -661,11 +669,11 @@ Error MachOPlatform::MachOPlatformPlugin::associateJITDylibHeaderSymbol(
auto &JD = MR.getTargetJITDylib();
std::lock_guard<std::mutex> Lock(MP.PlatformMutex);
- JITTargetAddress HeaderAddr = (*I)->getAddress();
+ auto HeaderAddr = (*I)->getAddress();
MP.HeaderAddrToJITDylib[HeaderAddr] = &JD;
assert(!MP.InitSeqs.count(&JD) && "InitSeq entry for JD already exists");
- MP.InitSeqs.insert(std::make_pair(
- &JD, MachOJITDylibInitializers(JD.getName(), ExecutorAddr(HeaderAddr))));
+ MP.InitSeqs.insert(
+ std::make_pair(&JD, MachOJITDylibInitializers(JD.getName(), HeaderAddr)));
return Error::success();
}
@@ -792,7 +800,7 @@ Error MachOPlatform::MachOPlatformPlugin::registerInitSections(
if (auto *ObjCImageInfoSec = G.findSectionByName(ObjCImageInfoSectionName)) {
if (auto Addr = jitlink::SectionRange(*ObjCImageInfoSec).getStart())
- ObjCImageInfoAddr.setValue(Addr);
+ ObjCImageInfoAddr = Addr;
}
for (auto InitSectionName : InitSectionNames)
@@ -880,10 +888,12 @@ Error MachOPlatform::MachOPlatformPlugin::registerEHAndTLVSections(
jitlink::SectionRange R(*EHFrameSection);
if (!R.empty())
G.allocActions().push_back(
- {{MP.orc_rt_macho_register_ehframe_section.getValue(), R.getStart(),
- R.getSize()},
- {MP.orc_rt_macho_deregister_ehframe_section.getValue(), R.getStart(),
- R.getSize()}});
+ {cantFail(
+ WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>(
+ MP.orc_rt_macho_register_ehframe_section, R.getRange())),
+ cantFail(
+ WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>(
+ MP.orc_rt_macho_deregister_ehframe_section, R.getRange()))});
}
// Get a pointer to the thread data section if there is one. It will be used
@@ -913,10 +923,13 @@ Error MachOPlatform::MachOPlatformPlugin::registerEHAndTLVSections(
inconvertibleErrorCode());
G.allocActions().push_back(
- {{MP.orc_rt_macho_register_thread_data_section.getValue(),
- R.getStart(), R.getSize()},
- {MP.orc_rt_macho_deregister_thread_data_section.getValue(),
- R.getStart(), R.getSize()}});
+ {cantFail(
+ WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>(
+ MP.orc_rt_macho_register_thread_data_section, R.getRange())),
+ cantFail(
+ WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>(
+ MP.orc_rt_macho_deregister_thread_data_section,
+ R.getRange()))});
}
}
return Error::success();
@@ -963,10 +976,10 @@ Error MachOPlatform::MachOPlatformPlugin::registerEHSectionsPhase1(
// Otherwise, add allocation actions to the graph to register eh-frames for
// this object.
G.allocActions().push_back(
- {{orc_rt_macho_register_ehframe_section.getValue(), R.getStart(),
- R.getSize()},
- {orc_rt_macho_deregister_ehframe_section.getValue(), R.getStart(),
- R.getSize()}});
+ {cantFail(WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>(
+ orc_rt_macho_register_ehframe_section, R.getRange())),
+ cantFail(WrapperFunctionCall::Create<SPSArgList<SPSExecutorAddrRange>>(
+ orc_rt_macho_deregister_ehframe_section, R.getRange()))});
return Error::success();
}
diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
index 0d6a33c5685e..32c5998a789b 100644
--- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
@@ -217,7 +217,7 @@ public:
Flags |= JITSymbolFlags::Exported;
InternedResult[InternedName] =
- JITEvaluatedSymbol(Sym->getAddress(), Flags);
+ JITEvaluatedSymbol(Sym->getAddress().getValue(), Flags);
if (AutoClaim && !MR->getSymbols().count(InternedName)) {
assert(!ExtraSymbolsToClaim.count(InternedName) &&
"Duplicate symbol to claim?");
@@ -235,7 +235,7 @@ public:
if (Sym->getLinkage() == Linkage::Weak)
Flags |= JITSymbolFlags::Weak;
InternedResult[InternedName] =
- JITEvaluatedSymbol(Sym->getAddress(), Flags);
+ JITEvaluatedSymbol(Sym->getAddress().getValue(), Flags);
if (AutoClaim && !MR->getSymbols().count(InternedName)) {
assert(!ExtraSymbolsToClaim.count(InternedName) &&
"Duplicate symbol to claim?");
@@ -743,7 +743,7 @@ void EHFrameRegistrationPlugin::modifyPassConfig(
PassConfiguration &PassConfig) {
PassConfig.PostFixupPasses.push_back(createEHFrameRecorderPass(
- G.getTargetTriple(), [this, &MR](JITTargetAddress Addr, size_t Size) {
+ G.getTargetTriple(), [this, &MR](ExecutorAddr Addr, size_t Size) {
if (Addr) {
std::lock_guard<std::mutex> Lock(EHFramePluginMutex);
assert(!InProcessLinks.count(&MR) &&
@@ -756,7 +756,7 @@ void EHFrameRegistrationPlugin::modifyPassConfig(
Error EHFrameRegistrationPlugin::notifyEmitted(
MaterializationResponsibility &MR) {
- EHFrameRange EmittedRange;
+ ExecutorAddrRange EmittedRange;
{
std::lock_guard<std::mutex> Lock(EHFramePluginMutex);
@@ -765,7 +765,7 @@ Error EHFrameRegistrationPlugin::notifyEmitted(
return Error::success();
EmittedRange = EHFrameRangeItr->second;
- assert(EmittedRange.Addr && "eh-frame addr to register can not be null");
+ assert(EmittedRange.Start && "eh-frame addr to register can not be null");
InProcessLinks.erase(EHFrameRangeItr);
}
@@ -773,7 +773,7 @@ Error EHFrameRegistrationPlugin::notifyEmitted(
[&](ResourceKey K) { EHFrameRanges[K].push_back(EmittedRange); }))
return Err;
- return Registrar->registerEHFrames(EmittedRange.Addr, EmittedRange.Size);
+ return Registrar->registerEHFrames(EmittedRange);
}
Error EHFrameRegistrationPlugin::notifyFailed(
@@ -784,7 +784,7 @@ Error EHFrameRegistrationPlugin::notifyFailed(
}
Error EHFrameRegistrationPlugin::notifyRemovingResources(ResourceKey K) {
- std::vector<EHFrameRange> RangesToRemove;
+ std::vector<ExecutorAddrRange> RangesToRemove;
ES.runSessionLocked([&] {
auto I = EHFrameRanges.find(K);
@@ -798,10 +798,9 @@ Error EHFrameRegistrationPlugin::notifyRemovingResources(ResourceKey K) {
while (!RangesToRemove.empty()) {
auto RangeToRemove = RangesToRemove.back();
RangesToRemove.pop_back();
- assert(RangeToRemove.Addr && "Untracked eh-frame range must not be null");
- Err = joinErrors(
- std::move(Err),
- Registrar->deregisterEHFrames(RangeToRemove.Addr, RangeToRemove.Size));
+ assert(RangeToRemove.Start && "Untracked eh-frame range must not be null");
+ Err = joinErrors(std::move(Err),
+ Registrar->deregisterEHFrames(RangeToRemove));
}
return Err;
diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
index 77a8f5af8ba0..71be8dfdc004 100644
--- a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
@@ -611,7 +611,7 @@ LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForProcess(
DynamicLibrarySearchGenerator::GetForCurrentProcess(GlobalPrefix, Pred);
if (!ProcessSymsGenerator) {
- *Result = 0;
+ *Result = nullptr;
return wrap(ProcessSymsGenerator.takeError());
}
@@ -637,7 +637,7 @@ LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForPath(
DynamicLibrarySearchGenerator::Load(FileName, GlobalPrefix, Pred);
if (!LibrarySymsGenerator) {
- *Result = 0;
+ *Result = nullptr;
return wrap(LibrarySymsGenerator.takeError());
}
@@ -657,7 +657,7 @@ LLVMErrorRef LLVMOrcCreateStaticLibrarySearchGeneratorForPath(
auto LibrarySymsGenerator =
StaticLibraryDefinitionGenerator::Load(*unwrap(ObjLayer), FileName, TT);
if (!LibrarySymsGenerator) {
- *Result = 0;
+ *Result = nullptr;
return wrap(LibrarySymsGenerator.takeError());
}
*Result = wrap(LibrarySymsGenerator->release());
@@ -666,7 +666,7 @@ LLVMErrorRef LLVMOrcCreateStaticLibrarySearchGeneratorForPath(
auto LibrarySymsGenerator =
StaticLibraryDefinitionGenerator::Load(*unwrap(ObjLayer), FileName);
if (!LibrarySymsGenerator) {
- *Result = 0;
+ *Result = nullptr;
return wrap(LibrarySymsGenerator.takeError());
}
*Result = wrap(LibrarySymsGenerator->release());
@@ -712,7 +712,7 @@ LLVMErrorRef LLVMOrcJITTargetMachineBuilderDetectHost(
auto JTMB = JITTargetMachineBuilder::detectHost();
if (!JTMB) {
- Result = 0;
+ Result = nullptr;
return wrap(JTMB.takeError());
}
@@ -876,7 +876,7 @@ LLVMErrorRef LLVMOrcCreateLLJIT(LLVMOrcLLJITRef *Result,
LLVMOrcDisposeLLJITBuilder(Builder);
if (!J) {
- Result = 0;
+ Result = nullptr;
return wrap(J.takeError());
}
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp
new file mode 100644
index 000000000000..91f2899449ef
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/AllocationActions.cpp
@@ -0,0 +1,44 @@
+//===----- AllocationActions.cpp -- JITLink allocation support calls -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/Shared/AllocationActions.h"
+
+namespace llvm {
+namespace orc {
+namespace shared {
+
+Expected<std::vector<WrapperFunctionCall>>
+runFinalizeActions(AllocActions &AAs) {
+ std::vector<WrapperFunctionCall> DeallocActions;
+ DeallocActions.reserve(numDeallocActions(AAs));
+
+ for (auto &AA : AAs) {
+ if (AA.Finalize)
+ if (auto Err = AA.Finalize.runWithSPSRetErrorMerged())
+ return joinErrors(std::move(Err), runDeallocActions(DeallocActions));
+
+ if (AA.Dealloc)
+ DeallocActions.push_back(std::move(AA.Dealloc));
+ }
+
+ AAs.clear();
+ return DeallocActions;
+}
+
+Error runDeallocActions(ArrayRef<WrapperFunctionCall> DAs) {
+ Error Err = Error::success();
+ while (!DAs.empty()) {
+ Err = joinErrors(std::move(Err), DAs.back().runWithSPSRetErrorMerged());
+ DAs = DAs.drop_back();
+ }
+ return Err;
+}
+
+} // namespace shared
+} // namespace orc
+} // namespace llvm
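
Note: runFinalizeActions records each action's dealloc counterpart only after its finalize step has succeeded, so a later failure can be unwound in reverse order. A minimal sketch of that ordering, independent of WrapperFunctionCall and llvm::Error:

#include <functional>
#include <vector>

struct ActionPair {
  std::function<bool()> Finalize; // returns true on success
  std::function<void()> Dealloc;  // inverse action, run on teardown/failure
};

// Run finalize actions in order; on failure, run the dealloc actions already
// recorded, newest first, before reporting failure to the caller.
bool runFinalizeSketch(std::vector<ActionPair> &Actions,
                       std::vector<std::function<void()>> &DeallocOut) {
  for (auto &A : Actions) {
    if (A.Finalize && !A.Finalize()) {
      for (auto It = DeallocOut.rbegin(); It != DeallocOut.rend(); ++It)
        (*It)();
      DeallocOut.clear();
      return false;
    }
    if (A.Dealloc)
      DeallocOut.push_back(A.Dealloc);
  }
  Actions.clear(); // all finalized; caller keeps DeallocOut for teardown
  return true;
}
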
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
index 02044e4af29a..5eae33121eb9 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
@@ -36,10 +36,10 @@ const char *MemoryWriteUInt64sWrapperName =
"__llvm_orc_bootstrap_mem_write_uint64s_wrapper";
const char *MemoryWriteBuffersWrapperName =
"__llvm_orc_bootstrap_mem_write_buffers_wrapper";
-const char *RegisterEHFrameSectionCustomDirectWrapperName =
- "__llvm_orc_bootstrap_register_ehframe_section_custom_direct_wrapper";
-const char *DeregisterEHFrameSectionCustomDirectWrapperName =
- "__llvm_orc_bootstrap_deregister_ehframe_section_custom_direct_wrapper";
+const char *RegisterEHFrameSectionWrapperName =
+ "__llvm_orc_bootstrap_register_ehframe_section_wrapper";
+const char *DeregisterEHFrameSectionWrapperName =
+ "__llvm_orc_bootstrap_deregister_ehframe_section_wrapper";
const char *RunAsMainWrapperName = "__llvm_orc_bootstrap_run_as_main_wrapper";
} // end namespace rt
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp
index 4c15e25b1d89..ffa2969536e7 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.cpp
@@ -105,23 +105,25 @@ static void registerJITLoaderGDBImpl(const char *ObjAddr, size_t Size) {
extern "C" orc::shared::CWrapperFunctionResult
llvm_orc_registerJITLoaderGDBAllocAction(const char *Data, size_t Size) {
using namespace orc::shared;
- return WrapperFunction<SPSError()>::handle(nullptr, 0,
- [=]() -> Error {
- registerJITLoaderGDBImpl(Data,
- Size);
- return Error::success();
- })
+ return WrapperFunction<SPSError(SPSExecutorAddrRange)>::handle(
+ Data, Size,
+ [](ExecutorAddrRange R) {
+ registerJITLoaderGDBImpl(R.Start.toPtr<const char *>(),
+ R.size());
+ return Error::success();
+ })
.release();
}
extern "C" orc::shared::CWrapperFunctionResult
llvm_orc_registerJITLoaderGDBWrapper(const char *Data, uint64_t Size) {
using namespace orc::shared;
- return WrapperFunction<void(SPSExecutorAddrRange)>::handle(
+ return WrapperFunction<SPSError(SPSExecutorAddrRange)>::handle(
Data, Size,
[](ExecutorAddrRange R) {
- registerJITLoaderGDBImpl(R.Start.toPtr<char *>(),
- R.size().getValue());
+ registerJITLoaderGDBImpl(R.Start.toPtr<const char *>(),
+ R.size());
+ return Error::success();
})
.release();
}
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp
index 82aa62a0c0d9..909d47deef59 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.cpp
@@ -71,11 +71,10 @@ void addTo(StringMap<ExecutorAddr> &M) {
shared::SPSMemoryAccessUInt64Write>);
M[rt::MemoryWriteBuffersWrapperName] =
ExecutorAddr::fromPtr(&writeBuffersWrapper);
- M[rt::RegisterEHFrameSectionCustomDirectWrapperName] = ExecutorAddr::fromPtr(
- &llvm_orc_registerEHFrameSectionCustomDirectWrapper);
- M[rt::DeregisterEHFrameSectionCustomDirectWrapperName] =
- ExecutorAddr::fromPtr(
- &llvm_orc_deregisterEHFrameSectionCustomDirectWrapper);
+ M[rt::RegisterEHFrameSectionWrapperName] =
+ ExecutorAddr::fromPtr(&llvm_orc_registerEHFrameSectionWrapper);
+ M[rt::DeregisterEHFrameSectionWrapperName] =
+ ExecutorAddr::fromPtr(&llvm_orc_deregisterEHFrameSectionWrapper);
M[rt::RunAsMainWrapperName] = ExecutorAddr::fromPtr(&runAsMainWrapper);
}
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h
index 6b7ff79a3efc..92b513d0bb53 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/OrcRTBootstrap.h
@@ -33,4 +33,4 @@ void addTo(StringMap<ExecutorAddr> &M);
} // end namespace orc
} // end namespace llvm
-#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_ORCRTBOOTSTRAP_H
+#endif // LIB_EXECUTIONENGINE_ORC_TARGETPROCESS_ORCRTBOOTSTRAP_H
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp
index e331bad84200..fdae0e45da65 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/RegisterEHFrames.cpp
@@ -158,42 +158,26 @@ Error deregisterEHFrameSection(const void *EHFrameSectionAddr,
} // end namespace orc
} // end namespace llvm
-extern "C" llvm::orc::shared::CWrapperFunctionResult
-llvm_orc_registerEHFrameSectionCustomDirectWrapper(
- const char *EHFrameSectionAddr, uint64_t Size) {
- if (auto Err = registerEHFrameSection(EHFrameSectionAddr, Size))
- return WrapperFunctionResult::createOutOfBandError(toString(std::move(Err)))
- .release();
- return llvm::orc::shared::CWrapperFunctionResult();
+static Error registerEHFrameWrapper(ExecutorAddrRange EHFrame) {
+ return llvm::orc::registerEHFrameSection(EHFrame.Start.toPtr<const void *>(),
+ EHFrame.size());
}
-extern "C" llvm::orc::shared::CWrapperFunctionResult
-llvm_orc_deregisterEHFrameSectionCustomDirectWrapper(
- const char *EHFrameSectionAddr, uint64_t Size) {
- if (auto Err = deregisterEHFrameSection(EHFrameSectionAddr, Size))
- return WrapperFunctionResult::createOutOfBandError(toString(std::move(Err)))
- .release();
- return llvm::orc::shared::CWrapperFunctionResult();
-}
-
-static Error registerEHFrameWrapper(ExecutorAddr Addr, uint64_t Size) {
- return llvm::orc::registerEHFrameSection(Addr.toPtr<const void *>(), Size);
-}
-
-static Error deregisterEHFrameWrapper(ExecutorAddr Addr, uint64_t Size) {
- return llvm::orc::deregisterEHFrameSection(Addr.toPtr<const void *>(), Size);
+static Error deregisterEHFrameWrapper(ExecutorAddrRange EHFrame) {
+ return llvm::orc::deregisterEHFrameSection(
+ EHFrame.Start.toPtr<const void *>(), EHFrame.size());
}
extern "C" orc::shared::CWrapperFunctionResult
llvm_orc_registerEHFrameSectionWrapper(const char *Data, uint64_t Size) {
- return WrapperFunction<SPSError(SPSExecutorAddr, uint64_t)>::handle(
+ return WrapperFunction<SPSError(SPSExecutorAddrRange)>::handle(
Data, Size, registerEHFrameWrapper)
.release();
}
extern "C" orc::shared::CWrapperFunctionResult
llvm_orc_deregisterEHFrameSectionWrapper(const char *Data, uint64_t Size) {
- return WrapperFunction<SPSError(SPSExecutorAddr, uint64_t)>::handle(
+ return WrapperFunction<SPSError(SPSExecutorAddrRange)>::handle(
Data, Size, deregisterEHFrameWrapper)
.release();
}
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
index 232340c22a32..7cadf3bb51a7 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
@@ -24,7 +24,7 @@ SimpleExecutorMemoryManager::~SimpleExecutorMemoryManager() {
Expected<ExecutorAddr> SimpleExecutorMemoryManager::allocate(uint64_t Size) {
std::error_code EC;
auto MB = sys::Memory::allocateMappedMemory(
- Size, 0, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
+ Size, nullptr, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
if (EC)
return errorCodeToError(EC);
std::lock_guard<std::mutex> Lock(M);
@@ -35,7 +35,7 @@ Expected<ExecutorAddr> SimpleExecutorMemoryManager::allocate(uint64_t Size) {
Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) {
ExecutorAddr Base(~0ULL);
- std::vector<tpctypes::WrapperFunctionCall> DeallocationActions;
+ std::vector<shared::WrapperFunctionCall> DeallocationActions;
size_t SuccessfulFinalizationActions = 0;
if (FR.Segments.empty()) {
@@ -52,8 +52,8 @@ Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) {
Base = std::min(Base, Seg.Addr);
for (auto &ActPair : FR.Actions)
- if (ActPair.Deallocate.Func)
- DeallocationActions.push_back(ActPair.Deallocate);
+ if (ActPair.Dealloc)
+ DeallocationActions.push_back(ActPair.Dealloc);
// Get the Allocation for this finalization.
size_t AllocSize = 0;
@@ -96,7 +96,7 @@ Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) {
while (SuccessfulFinalizationActions)
Err =
joinErrors(std::move(Err), FR.Actions[--SuccessfulFinalizationActions]
- .Deallocate.runWithSPSRet());
+ .Dealloc.runWithSPSRetErrorMerged());
// Deallocate memory.
sys::MemoryBlock MB(AllocToDestroy.first, AllocToDestroy.second.Size);
@@ -139,7 +139,7 @@ Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) {
// Run finalization actions.
for (auto &ActPair : FR.Actions) {
- if (auto Err = ActPair.Finalize.runWithSPSRet())
+ if (auto Err = ActPair.Finalize.runWithSPSRetErrorMerged())
return BailOut(std::move(Err));
++SuccessfulFinalizationActions;
}
@@ -212,7 +212,7 @@ Error SimpleExecutorMemoryManager::deallocateImpl(void *Base, Allocation &A) {
while (!A.DeallocationActions.empty()) {
Err = joinErrors(std::move(Err),
- A.DeallocationActions.back().runWithSPSRet());
+ A.DeallocationActions.back().runWithSPSRetErrorMerged());
A.DeallocationActions.pop_back();
}
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
index 2b88c481dab0..33db23408cf2 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
@@ -97,8 +97,8 @@ private:
class EvalResult {
public:
- EvalResult() : Value(0), ErrorMsg("") {}
- EvalResult(uint64_t Value) : Value(Value), ErrorMsg("") {}
+ EvalResult() : Value(0) {}
+ EvalResult(uint64_t Value) : Value(Value) {}
EvalResult(std::string ErrorMsg)
: Value(0), ErrorMsg(std::move(ErrorMsg)) {}
uint64_t getValue() const { return Value; }
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 0de76ab78e0f..f92618afdff6 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -422,6 +422,8 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
default:
report_fatal_error("Relocation type not implemented yet!");
break;
+ case ELF::R_AARCH64_NONE:
+ break;
case ELF::R_AARCH64_ABS16: {
uint64_t Result = Value + Addend;
assert(static_cast<int64_t>(Result) >= INT16_MIN && Result < UINT16_MAX);
diff --git a/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp b/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
index 6690dd07d99b..56b232b9dbcd 100644
--- a/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/SectionMemoryManager.cpp
@@ -114,11 +114,11 @@ uint8_t *SectionMemoryManager::allocateSection(
// Copy the address to all the other groups, if they have not
// been initialized.
- if (CodeMem.Near.base() == 0)
+ if (CodeMem.Near.base() == nullptr)
CodeMem.Near = MB;
- if (RODataMem.Near.base() == 0)
+ if (RODataMem.Near.base() == nullptr)
RODataMem.Near = MB;
- if (RWDataMem.Near.base() == 0)
+ if (RWDataMem.Near.base() == nullptr)
RWDataMem.Near = MB;
// Remember that we allocated this memory
diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp
index c962231cbdc1..6186af444e73 100644
--- a/llvm/lib/FileCheck/FileCheck.cpp
+++ b/llvm/lib/FileCheck/FileCheck.cpp
@@ -1007,8 +1007,9 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
// brackets. They also accept a combined form which sets a numeric variable
// to the evaluation of an expression. Both string and numeric variable
// names must satisfy the regular expression "[a-zA-Z_][0-9a-zA-Z_]*" to be
- // valid, as this helps catch some common errors.
- if (PatternStr.startswith("[[")) {
+ // valid, as this helps catch some common errors. If there are extra '['s
+ // before the "[[", treat them literally.
+ if (PatternStr.startswith("[[") && !PatternStr.startswith("[[[")) {
StringRef UnparsedPatternStr = PatternStr.substr(2);
// Find the closing bracket pair ending the match. End is going to be an
// offset relative to the beginning of the match string.
@@ -1183,12 +1184,14 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
Substitutions.push_back(Substitution);
}
}
+
+ continue;
}
// Handle fixed string matches.
// Find the end, which is the start of the next regex.
- size_t FixedMatchEnd = PatternStr.find("{{");
- FixedMatchEnd = std::min(FixedMatchEnd, PatternStr.find("[["));
+ size_t FixedMatchEnd =
+ std::min(PatternStr.find("{{", 1), PatternStr.find("[[", 1));
RegExStr += Regex::escape(PatternStr.substr(0, FixedMatchEnd));
PatternStr = PatternStr.substr(FixedMatchEnd);
}
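
Note: starting the fixed-string scan at offset 1 guarantees forward progress when the pattern begins with a '[[' or '{{' that the variable parser deliberately skipped (the "[[[" case above); otherwise the fixed match would be empty and parsing would not advance. A small standalone illustration of the offset trick, using std::string rather than FileCheck's own helpers:

#include <algorithm>
#include <cassert>
#include <string>

// Find the end of the leading fixed (literal) part of a pattern: the next
// "{{" or "[[", but never at offset 0, so at least one character is consumed.
static size_t fixedMatchEnd(const std::string &Pattern) {
  return std::min(Pattern.find("{{", 1), Pattern.find("[[", 1));
}

int main() {
  // "[[[" is treated literally: the first '[' joins the fixed match, and
  // parsing resumes at the following "[[".
  assert(fixedMatchEnd("[[[foo]]") == 1);
  assert(fixedMatchEnd("abc{{re}}") == 3);
  return 0;
}
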
@@ -2215,7 +2218,7 @@ static Error reportMatchResult(bool ExpectedMatch, const SourceMgr &SM,
static unsigned CountNumNewlinesBetween(StringRef Range,
const char *&FirstNewLine) {
unsigned NumNewLines = 0;
- while (1) {
+ while (true) {
// Scan for newline.
Range = Range.substr(Range.find_first_of("\n\r"));
if (Range.empty())
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 5157d51fd18c..3b8d80c4eeec 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -21,7 +22,9 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
@@ -37,6 +40,7 @@
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <cstdint>
#include <sstream>
#define DEBUG_TYPE "openmp-ir-builder"
@@ -56,6 +60,20 @@ static cl::opt<double> UnrollThresholdFactor(
"simplifications still taking place"),
cl::init(1.5));
+#ifndef NDEBUG
+/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
+/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
+/// an InsertPoint stores the instruction before something is inserted. For
+/// instance, if both point to the same instruction, two IRBuilders alternately
+/// creating instructions will cause the instructions to be interleaved.
+static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
+ IRBuilder<>::InsertPoint IP2) {
+ if (!IP1.isSet() || !IP2.isSet())
+ return false;
+ return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
+}
+#endif
+
void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
LLVMContext &Ctx = Fn.getContext();
@@ -156,7 +174,7 @@ Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
void OpenMPIRBuilder::initialize() { initializeTypes(M); }
-void OpenMPIRBuilder::finalize(Function *Fn, bool AllowExtractorSinking) {
+void OpenMPIRBuilder::finalize(Function *Fn) {
SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
SmallVector<BasicBlock *, 32> Blocks;
SmallVector<OutlineInfo, 16> DeferredOutlines;
@@ -175,7 +193,7 @@ void OpenMPIRBuilder::finalize(Function *Fn, bool AllowExtractorSinking) {
Function *OuterFn = OI.getFunction();
CodeExtractorAnalysisCache CEAC(*OuterFn);
CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
- /* AggregateArgs */ false,
+ /* AggregateArgs */ true,
/* BlockFrequencyInfo */ nullptr,
/* BranchProbabilityInfo */ nullptr,
/* AssumptionCache */ nullptr,
@@ -189,6 +207,9 @@ void OpenMPIRBuilder::finalize(Function *Fn, bool AllowExtractorSinking) {
assert(Extractor.isEligible() &&
"Expected OpenMP outlining to be possible!");
+ for (auto *V : OI.ExcludeArgsFromAggregate)
+ Extractor.excludeArgFromAggregate(V);
+
Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
@@ -207,25 +228,25 @@ void OpenMPIRBuilder::finalize(Function *Fn, bool AllowExtractorSinking) {
BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
- if (AllowExtractorSinking) {
- // Move instructions from the to-be-deleted ArtificialEntry to the entry
- // basic block of the parallel region. CodeExtractor may have sunk
- // allocas/bitcasts for values that are solely used in the outlined
- // region and do not escape.
- assert(!ArtificialEntry.empty() &&
- "Expected instructions to sink in the outlined region");
- for (BasicBlock::iterator It = ArtificialEntry.begin(),
- End = ArtificialEntry.end();
- It != End;) {
- Instruction &I = *It;
- It++;
-
- if (I.isTerminator())
- continue;
-
- I.moveBefore(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
- }
+ // Move instructions from the to-be-deleted ArtificialEntry to the entry
+ // basic block of the parallel region. CodeExtractor generates
+ // instructions to unwrap the aggregate argument and may sink
+ // allocas/bitcasts for values that are solely used in the outlined region
+ // and do not escape.
+ assert(!ArtificialEntry.empty() &&
+ "Expected instructions to add in the outlined region entry");
+ for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
+ End = ArtificialEntry.rend();
+ It != End;) {
+ Instruction &I = *It;
+ It++;
+
+ if (I.isTerminator())
+ continue;
+
+ I.moveBefore(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
}
+
OI.EntryBB->moveBefore(&ArtificialEntry);
ArtificialEntry.eraseFromParent();
}
@@ -251,23 +272,26 @@ GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
new GlobalVariable(M, I32Ty,
/* isConstant = */ true, GlobalValue::WeakODRLinkage,
ConstantInt::get(I32Ty, Value), Name);
+ GV->setVisibility(GlobalValue::HiddenVisibility);
return GV;
}
-Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
- IdentFlag LocFlags,
- unsigned Reserve2Flags) {
+Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
+ uint32_t SrcLocStrSize,
+ IdentFlag LocFlags,
+ unsigned Reserve2Flags) {
// Enable "C-mode".
LocFlags |= OMP_IDENT_FLAG_KMPC;
- Value *&Ident =
+ Constant *&Ident =
IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
if (!Ident) {
Constant *I32Null = ConstantInt::getNullValue(Int32);
- Constant *IdentData[] = {
- I32Null, ConstantInt::get(Int32, uint32_t(LocFlags)),
- ConstantInt::get(Int32, Reserve2Flags), I32Null, SrcLocStr};
+ Constant *IdentData[] = {I32Null,
+ ConstantInt::get(Int32, uint32_t(LocFlags)),
+ ConstantInt::get(Int32, Reserve2Flags),
+ ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
Constant *Initializer =
ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
@@ -290,10 +314,12 @@ Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
}
}
- return Builder.CreatePointerCast(Ident, IdentPtr);
+ return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
-Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr) {
+Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
+ uint32_t &SrcLocStrSize) {
+ SrcLocStrSize = LocStr.size();
Constant *&SrcLocStr = SrcLocStrMap[LocStr];
if (!SrcLocStr) {
Constant *Initializer =
@@ -314,8 +340,8 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr) {
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
StringRef FileName,
- unsigned Line,
- unsigned Column) {
+ unsigned Line, unsigned Column,
+ uint32_t &SrcLocStrSize) {
SmallString<128> Buffer;
Buffer.push_back(';');
Buffer.append(FileName);
@@ -327,17 +353,21 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
Buffer.append(std::to_string(Column));
Buffer.push_back(';');
Buffer.push_back(';');
- return getOrCreateSrcLocStr(Buffer.str());
+ return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
-Constant *OpenMPIRBuilder::getOrCreateDefaultSrcLocStr() {
- return getOrCreateSrcLocStr(";unknown;unknown;0;0;;");
+Constant *
+OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
+ StringRef UnknownLoc = ";unknown;unknown;0;0;;";
+ return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}
-Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL, Function *F) {
+Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
+ uint32_t &SrcLocStrSize,
+ Function *F) {
DILocation *DIL = DL.get();
if (!DIL)
- return getOrCreateDefaultSrcLocStr();
+ return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
StringRef FileName = M.getName();
if (DIFile *DIF = DIL->getFile())
if (Optional<StringRef> Source = DIF->getSource())
@@ -346,12 +376,13 @@ Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL, Function *F) {
if (Function.empty() && F)
Function = F->getName();
return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
- DIL->getColumn());
+ DIL->getColumn(), SrcLocStrSize);
}
-Constant *
-OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc) {
- return getOrCreateSrcLocStr(Loc.DL, Loc.IP.getBlock()->getParent());
+Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
+ uint32_t &SrcLocStrSize) {
+ return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
+ Loc.IP.getBlock()->getParent());
}
Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
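
Note: the SrcLocStrSize out-parameter threads the interned string's length back to getOrCreateIdent so it can be stored in the ident struct. A hedged sketch of the cache-plus-size pattern with hypothetical names (not the OpenMPIRBuilder API):

#include <cstdint>
#include <map>
#include <string>

// Cache of interned location strings; the size is reported alongside the
// cached entry so callers can embed it in a descriptor without re-measuring.
class LocStrCache {
  std::map<std::string, std::string> Cache;
public:
  const std::string *getOrCreate(const std::string &Loc, uint32_t &SizeOut) {
    SizeOut = static_cast<uint32_t>(Loc.size());
    auto It = Cache.try_emplace(Loc, Loc).first;
    return &It->second;
  }
};

int main() {
  LocStrCache C;
  uint32_t Size = 0;
  const std::string *S = C.getOrCreate(";unknown;unknown;0;0;;", Size);
  (void)S;
  return Size == 22 ? 0 : 1; // the size travels with the cached string
}
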
@@ -393,9 +424,11 @@ OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
break;
}
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Args[] = {getOrCreateIdent(SrcLocStr, BarrierLocFlags),
- getOrCreateThreadID(getOrCreateIdent(SrcLocStr))};
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Args[] = {
+ getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
+ getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
// If we are in a cancellable parallel region, barriers are cancellation
// points.
@@ -441,8 +474,9 @@ OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
llvm_unreachable("Unknown cancel kind!");
}
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
Value *Result = Builder.CreateCall(
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
@@ -510,11 +544,14 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
omp::ProcBindKind ProcBind, bool IsCancellable) {
+ assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
+
if (!updateToLocation(Loc))
return Loc.IP;
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadID = getOrCreateThreadID(Ident);
if (NumThreads) {
@@ -777,8 +814,10 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
auto PrivHelper = [&](Value &V) {
- if (&V == TIDAddr || &V == ZeroAddr)
+ if (&V == TIDAddr || &V == ZeroAddr) {
+ OI.ExcludeArgsFromAggregate.push_back(&V);
return;
+ }
SetVector<Use *> Uses;
for (Use &U : V.uses())
@@ -871,8 +910,9 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
// Build call void __kmpc_flush(ident_t *loc)
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Args[] = {getOrCreateIdent(SrcLocStr)};
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
}
@@ -886,8 +926,9 @@ void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
// Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
// global_tid);
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
// Ignore return result until untied tasks are supported.
@@ -903,8 +944,9 @@ void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
// Build call __kmpc_omp_taskyield(loc, thread_id, 0);
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Constant *I32Null = ConstantInt::getNullValue(Int32);
Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
@@ -1114,14 +1156,16 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
Module *Module = Func->getParent();
Value *RedArrayPtr =
Builder.CreateBitCast(RedArray, Builder.getInt8PtrTy(), "red.array.ptr");
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
bool CanGenerateAtomic =
llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
return RI.AtomicReductionGen;
});
- Value *Ident = getOrCreateIdent(
- SrcLocStr, CanGenerateAtomic ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
- : IdentFlag(0));
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
+ CanGenerateAtomic
+ ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
+ : IdentFlag(0));
Value *ThreadId = getOrCreateThreadID(Ident);
Constant *NumVariables = Builder.getInt32(NumReductions);
const DataLayout &DL = Module->getDataLayout();
@@ -1235,8 +1279,9 @@ OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
return Loc.IP;
Directive OMPD = Directive::OMPD_master;
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadId = getOrCreateThreadID(Ident);
Value *Args[] = {Ident, ThreadId};
@@ -1258,8 +1303,9 @@ OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
return Loc.IP;
Directive OMPD = Directive::OMPD_masked;
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadId = getOrCreateThreadID(Ident);
Value *Args[] = {Ident, ThreadId, Filter};
Value *ArgsEnd[] = {Ident, ThreadId};
@@ -1475,13 +1521,16 @@ OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
bool NeedsBarrier, Value *Chunk) {
assert(CLI->isValid() && "Requires a valid canonical loop");
+ assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
+ "Require dedicated allocate IP");
// Set up the source location value for OpenMP runtime.
Builder.restoreIP(CLI->getPreheaderIP());
Builder.SetCurrentDebugLocation(DL);
- Constant *SrcLocStr = getOrCreateSrcLocStr(DL);
- Value *SrcLoc = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
+ Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
// Declare useful OpenMP runtime functions.
Value *IV = CLI->getIndVar();
@@ -1604,12 +1653,15 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
assert(CLI->isValid() && "Requires a valid canonical loop");
+ assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
+ "Require dedicated allocate IP");
// Set up the source location value for OpenMP runtime.
Builder.SetCurrentDebugLocation(DL);
- Constant *SrcLocStr = getOrCreateSrcLocStr(DL);
- Value *SrcLoc = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
+ Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
// Declare useful OpenMP runtime functions.
Value *IV = CLI->getIndVar();
@@ -2119,6 +2171,19 @@ static void addLoopMetadata(CanonicalLoopInfo *Loop,
Latch->getTerminator()->setMetadata(LLVMContext::MD_loop, LoopID);
}
+/// Attach llvm.access.group metadata to the memref instructions of \p Block
+static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
+ LoopInfo &LI) {
+ for (Instruction &I : *Block) {
+ if (I.mayReadOrWriteMemory()) {
+      // TODO: This instruction may already have an access group from
+      // other pragmas, e.g. #pragma clang loop vectorize. Append
+      // so that the existing metadata is not overwritten.
+ I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
+ }
+ }
+}
+
void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
LLVMContext &Ctx = Builder.getContext();
addLoopMetadata(
@@ -2134,6 +2199,53 @@ void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
});
}
+void OpenMPIRBuilder::applySimd(DebugLoc, CanonicalLoopInfo *CanonicalLoop) {
+ LLVMContext &Ctx = Builder.getContext();
+
+ Function *F = CanonicalLoop->getFunction();
+
+ FunctionAnalysisManager FAM;
+ FAM.registerPass([]() { return DominatorTreeAnalysis(); });
+ FAM.registerPass([]() { return LoopAnalysis(); });
+ FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
+
+ LoopAnalysis LIA;
+ LoopInfo &&LI = LIA.run(*F, FAM);
+
+ Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
+
+ SmallSet<BasicBlock *, 8> Reachable;
+
+ // Get the basic blocks from the loop in which memref instructions
+ // can be found.
+  // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
+ // preferably without running any passes.
+ for (BasicBlock *Block : L->getBlocks()) {
+ if (Block == CanonicalLoop->getCond() ||
+ Block == CanonicalLoop->getHeader())
+ continue;
+ Reachable.insert(Block);
+ }
+
+ // Add access group metadata to memory-access instructions.
+ MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
+ for (BasicBlock *BB : Reachable)
+ addSimdMetadata(BB, AccessGroup, LI);
+
+ // Use the above access group metadata to create loop level
+ // metadata, which should be distinct for each loop.
+ ConstantAsMetadata *BoolConst =
+ ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
+ // TODO: If the loop has existing parallel access metadata, have
+ // to combine two lists.
+ addLoopMetadata(
+ CanonicalLoop,
+ {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"),
+ AccessGroup}),
+ MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
+ BoolConst})});
+}
+
/// Create the TargetMachine object to query the backend for optimization
/// preferences.
///
@@ -2243,7 +2355,7 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
gatherPeelingPreferences(L, SE, TTI,
/*UserAllowPeeling=*/false,
/*UserAllowProfileBasedPeeling=*/false,
- /*UserUnrollingSpecficValues=*/false);
+ /*UnrollingSpecficValues=*/false);
SmallPtrSet<const Value *, 32> EphValues;
CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
@@ -2379,8 +2491,9 @@ OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
if (!updateToLocation(Loc))
return Loc.IP;
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadId = getOrCreateThreadID(Ident);
llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
@@ -2407,8 +2520,9 @@ OpenMPIRBuilder::createSingle(const LocationDescription &Loc,
}
Directive OMPD = Directive::OMPD_single;
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadId = getOrCreateThreadID(Ident);
Value *Args[] = {Ident, ThreadId};
@@ -2436,8 +2550,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical(
return Loc.IP;
Directive OMPD = Directive::OMPD_critical;
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadId = getOrCreateThreadID(Ident);
Value *LockVar = getOMPCriticalRegionLock(CriticalName);
Value *Args[] = {Ident, ThreadId, LockVar};
@@ -2466,6 +2581,10 @@ OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
InsertPointTy AllocaIP, unsigned NumLoops,
ArrayRef<llvm::Value *> StoreValues,
const Twine &Name, bool IsDependSource) {
+ for (size_t I = 0; I < StoreValues.size(); I++)
+ assert(StoreValues[I]->getType()->isIntegerTy(64) &&
+ "OpenMP runtime requires depend vec with i64 type");
+
if (!updateToLocation(Loc))
return Loc.IP;
@@ -2480,14 +2599,16 @@ OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
for (unsigned I = 0; I < NumLoops; ++I) {
Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
- Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
+ StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
+ STInst->setAlignment(Align(8));
}
Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadId = getOrCreateThreadID(Ident);
Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
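The first of these two hunks asserts that every depend-vector element handed to createOrderedDepend is an i64, and the second gives the stores into the scratch array an explicit 8-byte alignment so the i64 elements keep their natural alignment. A hedged caller-side sketch; OMPBuilder, Builder, Loc, AllocaIP and the induction value IV are assumed to already exist:

  SmallVector<Value *, 2> StoreValues;
  StoreValues.push_back(Builder.CreateIntCast(IV, Builder.getInt64Ty(),
                                              /*isSigned=*/true));
  StoreValues.push_back(Builder.getInt64(0));
  // Every element must be i64, and NumLoops must equal StoreValues.size().
  OMPBuilder.createOrderedDepend(Loc, AllocaIP, /*NumLoops=*/2, StoreValues,
                                 ".cnt.addr", /*IsDependSource=*/true);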
@@ -2512,8 +2633,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd(
Instruction *ExitCall = nullptr;
if (IsThreads) {
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadId = getOrCreateThreadID(Ident);
Value *Args[] = {Ident, ThreadId};
@@ -2718,8 +2840,9 @@ CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
IRBuilder<>::InsertPointGuard IPG(Builder);
Builder.restoreIP(Loc.IP);
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadId = getOrCreateThreadID(Ident);
Value *Args[] = {ThreadId, Size, Allocator};
@@ -2734,8 +2857,9 @@ CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
IRBuilder<>::InsertPointGuard IPG(Builder);
Builder.restoreIP(Loc.IP);
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadId = getOrCreateThreadID(Ident);
Value *Args[] = {ThreadId, Addr, Allocator};
Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
@@ -2748,8 +2872,9 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
IRBuilder<>::InsertPointGuard IPG(Builder);
Builder.restoreIP(Loc.IP);
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadId = getOrCreateThreadID(Ident);
Constant *ThreadPrivateCache =
getOrCreateOMPInternalVariable(Int8PtrPtr, Name);
@@ -2767,8 +2892,9 @@ OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
if (!updateToLocation(Loc))
return Loc.IP;
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
ConstantInt *IsSPMDVal = ConstantInt::getSigned(
IntegerType::getInt8Ty(Int8->getContext()),
IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
@@ -2820,8 +2946,9 @@ void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
if (!updateToLocation(Loc))
return;
- Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
- Value *Ident = getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
ConstantInt *IsSPMDVal = ConstantInt::getSigned(
IntegerType::getInt8Ty(Int8->getContext()),
IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
@@ -2860,7 +2987,8 @@ Constant *OpenMPIRBuilder::getOrCreateOMPInternalVariable(
StringRef RuntimeName = Out.str();
auto &Elem = *InternalVars.try_emplace(RuntimeName, nullptr).first;
if (Elem.second) {
- assert(Elem.second->getType()->getPointerElementType() == Ty &&
+ assert(cast<PointerType>(Elem.second->getType())
+ ->isOpaqueOrPointeeTypeMatches(Ty) &&
"OMP internal variable has different type than requested");
} else {
// TODO: investigate the appropriate linkage type used for the global
diff --git a/llvm/lib/FuzzMutate/Operations.cpp b/llvm/lib/FuzzMutate/Operations.cpp
index a37fd5454dd4..221a3a84b49b 100644
--- a/llvm/lib/FuzzMutate/Operations.cpp
+++ b/llvm/lib/FuzzMutate/Operations.cpp
@@ -169,7 +169,7 @@ OpDescriptor llvm::fuzzerop::splitBlockDescriptor(unsigned Weight) {
OpDescriptor llvm::fuzzerop::gepDescriptor(unsigned Weight) {
auto buildGEP = [](ArrayRef<Value *> Srcs, Instruction *Inst) {
- Type *Ty = cast<PointerType>(Srcs[0]->getType())->getElementType();
+ Type *Ty = Srcs[0]->getType()->getPointerElementType();
auto Indices = makeArrayRef(Srcs).drop_front(1);
return GetElementPtrInst::Create(Ty, Srcs[0], Indices, "G", Inst);
};
diff --git a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp
index 1295714839e8..27c3bdfb22a8 100644
--- a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp
+++ b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp
@@ -53,8 +53,8 @@ Value *RandomIRBuilder::newSource(BasicBlock &BB, ArrayRef<Instruction *> Insts,
IP = ++I->getIterator();
assert(IP != BB.end() && "guaranteed by the findPointer");
}
- auto *NewLoad = new LoadInst(
- cast<PointerType>(Ptr->getType())->getElementType(), Ptr, "L", &*IP);
+ auto *NewLoad =
+ new LoadInst(Ptr->getType()->getPointerElementType(), Ptr, "L", &*IP);
// Only sample this load if it really matches the descriptor
if (Pred.matches(Srcs, NewLoad))
@@ -141,12 +141,12 @@ Value *RandomIRBuilder::findPointer(BasicBlock &BB,
if (auto PtrTy = dyn_cast<PointerType>(Inst->getType())) {
// We can never generate loads from non first class or non sized types
- if (!PtrTy->getElementType()->isSized() ||
- !PtrTy->getElementType()->isFirstClassType())
+ Type *ElemTy = PtrTy->getPointerElementType();
+ if (!ElemTy->isSized() || !ElemTy->isFirstClassType())
return false;
// TODO: Check if this is horribly expensive.
- return Pred.matches(Srcs, UndefValue::get(PtrTy->getElementType()));
+ return Pred.matches(Srcs, UndefValue::get(ElemTy));
}
return false;
};
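These FuzzMutate changes are part of the opaque-pointer migration: rather than casting to PointerType and reading its element type, the code asks the pointer type for its (still available) pointee via Type::getPointerElementType. A minimal sketch of the before/after pattern; Ptr (any pointer-typed Value) and InsertBefore are assumptions for illustration:

  // Old, typed-pointer style:
  //   Type *ElemTy = cast<PointerType>(Ptr->getType())->getElementType();
  // Transitional style used in the hunks above:
  Type *ElemTy = Ptr->getType()->getPointerElementType();
  LoadInst *L = new LoadInst(ElemTy, Ptr, "L", InsertBefore);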
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index bbe0c97e60a2..179754e275b0 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -587,7 +587,7 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) {
OS << " addrspace(" << AddressSpace << ')';
return;
}
- print(PTy->getElementType(), OS);
+ print(PTy->getNonOpaquePointerElementType(), OS);
if (unsigned AddressSpace = PTy->getAddressSpace())
OS << " addrspace(" << AddressSpace << ')';
OS << '*';
@@ -1986,6 +1986,8 @@ static void writeDIStringType(raw_ostream &Out, const DIStringType *N,
Printer.printString("name", N->getName());
Printer.printMetadata("stringLength", N->getRawStringLength());
Printer.printMetadata("stringLengthExpression", N->getRawStringLengthExp());
+ Printer.printMetadata("stringLocationExpression",
+ N->getRawStringLocationExp());
Printer.printInt("size", N->getSizeInBits());
Printer.printInt("align", N->getAlignInBits());
Printer.printDwarfEnum("encoding", N->getEncoding(),
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index c899afae6cce..c92bacaee36d 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -607,14 +607,14 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<Attribute> Attrs) {
AttributeSet AttributeSet::addAttribute(LLVMContext &C,
Attribute::AttrKind Kind) const {
if (hasAttribute(Kind)) return *this;
- AttrBuilder B;
+ AttrBuilder B(C);
B.addAttribute(Kind);
return addAttributes(C, AttributeSet::get(C, B));
}
AttributeSet AttributeSet::addAttribute(LLVMContext &C, StringRef Kind,
StringRef Value) const {
- AttrBuilder B;
+ AttrBuilder B(C);
B.addAttribute(Kind, Value);
return addAttributes(C, AttributeSet::get(C, B));
}
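Starting here, every AttrBuilder is constructed with the LLVMContext it creates attributes in; the default constructor disappears from these call sites, and building from an existing AttributeSet goes through AttrBuilder(C, AS). A short usage sketch under the new API, assuming Ctx is an existing LLVMContext:

  AttrBuilder B(Ctx);
  B.addAttribute(Attribute::NoUnwind);
  B.addAttribute("frame-pointer", "all");
  B.addAlignmentAttr(Align(16));
  AttributeSet AS = AttributeSet::get(Ctx, B);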
@@ -627,17 +627,15 @@ AttributeSet AttributeSet::addAttributes(LLVMContext &C,
if (!AS.hasAttributes())
return *this;
- AttrBuilder B(AS);
- for (const auto &I : *this)
- B.addAttribute(I);
-
- return get(C, B);
+ AttrBuilder B(C, *this);
+ B.merge(AttrBuilder(C, AS));
+ return get(C, B);
}
AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
Attribute::AttrKind Kind) const {
if (!hasAttribute(Kind)) return *this;
- AttrBuilder B(*this);
+ AttrBuilder B(C, *this);
B.removeAttribute(Kind);
return get(C, B);
}
@@ -645,14 +643,14 @@ AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
StringRef Kind) const {
if (!hasAttribute(Kind)) return *this;
- AttrBuilder B(*this);
+ AttrBuilder B(C, *this);
B.removeAttribute(Kind);
return get(C, B);
}
AttributeSet AttributeSet::removeAttributes(LLVMContext &C,
- const AttrBuilder &Attrs) const {
- AttrBuilder B(*this);
+ const AttributeMask &Attrs) const {
+ AttrBuilder B(C, *this);
// If there is nothing to remove, directly return the original set.
if (!B.overlaps(Attrs))
return *this;
@@ -817,28 +815,7 @@ AttributeSetNode *AttributeSetNode::getSorted(LLVMContext &C,
}
AttributeSetNode *AttributeSetNode::get(LLVMContext &C, const AttrBuilder &B) {
- // Add target-independent attributes.
- SmallVector<Attribute, 8> Attrs;
- for (Attribute::AttrKind Kind = Attribute::None;
- Kind != Attribute::EndAttrKinds; Kind = Attribute::AttrKind(Kind + 1)) {
- if (!B.contains(Kind))
- continue;
-
- Attribute Attr;
- if (Attribute::isTypeAttrKind(Kind))
- Attr = Attribute::get(C, Kind, B.getTypeAttr(Kind));
- else if (Attribute::isIntAttrKind(Kind))
- Attr = Attribute::get(C, Kind, B.getRawIntAttr(Kind));
- else
- Attr = Attribute::get(C, Kind);
- Attrs.push_back(Attr);
- }
-
- // Add target-dependent (string) attributes.
- for (const auto &TDA : B.td_attrs())
- Attrs.emplace_back(Attribute::get(C, TDA.first, TDA.second));
-
- return getSorted(C, Attrs);
+ return getSorted(C, B.attrs());
}
bool AttributeSetNode::hasAttribute(StringRef Kind) const {
@@ -1194,9 +1171,9 @@ AttributeList AttributeList::get(LLVMContext &C,
SmallVector<AttributeSet, 8> NewAttrSets(MaxSize);
for (unsigned I = 0; I < MaxSize; ++I) {
- AttrBuilder CurBuilder;
+ AttrBuilder CurBuilder(C);
for (const auto &List : Attrs)
- CurBuilder.merge(List.getAttributes(I - 1));
+ CurBuilder.merge(AttrBuilder(C, List.getAttributes(I - 1)));
NewAttrSets[I] = AttributeSet::get(C, CurBuilder);
}
@@ -1218,14 +1195,14 @@ AttributeList::addAttributeAtIndex(LLVMContext &C, unsigned Index,
AttributeList AttributeList::addAttributeAtIndex(LLVMContext &C, unsigned Index,
StringRef Kind,
StringRef Value) const {
- AttrBuilder B;
+ AttrBuilder B(C);
B.addAttribute(Kind, Value);
return addAttributesAtIndex(C, Index, B);
}
AttributeList AttributeList::addAttributeAtIndex(LLVMContext &C, unsigned Index,
Attribute A) const {
- AttrBuilder B;
+ AttrBuilder B(C);
B.addAttribute(A);
return addAttributesAtIndex(C, Index, B);
}
@@ -1250,16 +1227,7 @@ AttributeList AttributeList::addAttributesAtIndex(LLVMContext &C,
if (!pImpl)
return AttributeList::get(C, {{Index, AttributeSet::get(C, B)}});
-#ifndef NDEBUG
- // FIXME it is not obvious how this should work for alignment. For now, say
- // we can't change a known alignment.
- const MaybeAlign OldAlign = getAttributes(Index).getAlignment();
- const MaybeAlign NewAlign = B.getAlignment();
- assert((!OldAlign || !NewAlign || OldAlign == NewAlign) &&
- "Attempt to change alignment!");
-#endif
-
- AttrBuilder Merged(getAttributes(Index));
+ AttrBuilder Merged(C, getAttributes(Index));
Merged.merge(B);
return setAttributesAtIndex(C, Index, AttributeSet::get(C, Merged));
}
@@ -1276,7 +1244,7 @@ AttributeList AttributeList::addParamAttribute(LLVMContext &C,
for (unsigned ArgNo : ArgNos) {
unsigned Index = attrIdxToArrayIdx(ArgNo + FirstArgIndex);
- AttrBuilder B(AttrSets[Index]);
+ AttrBuilder B(C, AttrSets[Index]);
B.addAttribute(A);
AttrSets[Index] = AttributeSet::get(C, B);
}
@@ -1314,9 +1282,8 @@ AttributeList AttributeList::removeAttributeAtIndex(LLVMContext &C,
return getImpl(C, AttrSets);
}
-AttributeList
-AttributeList::removeAttributesAtIndex(LLVMContext &C, unsigned Index,
- const AttrBuilder &AttrsToRemove) const {
+AttributeList AttributeList::removeAttributesAtIndex(
+ LLVMContext &C, unsigned Index, const AttributeMask &AttrsToRemove) const {
AttributeSet Attrs = getAttributes(Index);
AttributeSet NewAttrs = Attrs.removeAttributes(C, AttrsToRemove);
// If nothing was removed, return the original list.
@@ -1340,7 +1307,7 @@ AttributeList::removeAttributesAtIndex(LLVMContext &C,
AttributeList AttributeList::addDereferenceableRetAttr(LLVMContext &C,
uint64_t Bytes) const {
- AttrBuilder B;
+ AttrBuilder B(C);
B.addDereferenceableAttr(Bytes);
return addRetAttributes(C, B);
}
@@ -1348,7 +1315,7 @@ AttributeList AttributeList::addDereferenceableRetAttr(LLVMContext &C,
AttributeList AttributeList::addDereferenceableParamAttr(LLVMContext &C,
unsigned Index,
uint64_t Bytes) const {
- AttrBuilder B;
+ AttrBuilder B(C);
B.addDereferenceableAttr(Bytes);
return addParamAttributes(C, Index, B);
}
@@ -1356,7 +1323,7 @@ AttributeList AttributeList::addDereferenceableParamAttr(LLVMContext &C,
AttributeList
AttributeList::addDereferenceableOrNullParamAttr(LLVMContext &C, unsigned Index,
uint64_t Bytes) const {
- AttrBuilder B;
+ AttrBuilder B(C);
B.addDereferenceableOrNullAttr(Bytes);
return addParamAttributes(C, Index, B);
}
@@ -1365,7 +1332,7 @@ AttributeList
AttributeList::addAllocSizeParamAttr(LLVMContext &C, unsigned Index,
unsigned ElemSizeArg,
const Optional<unsigned> &NumElemsArg) {
- AttrBuilder B;
+ AttrBuilder B(C);
B.addAllocSizeAttr(ElemSizeArg, NumElemsArg);
return addParamAttributes(C, Index, B);
}
@@ -1549,97 +1516,93 @@ LLVM_DUMP_METHOD void AttributeList::dump() const { print(dbgs()); }
// AttrBuilder Method Implementations
//===----------------------------------------------------------------------===//
-// FIXME: Remove this ctor, use AttributeSet.
-AttrBuilder::AttrBuilder(AttributeList AL, unsigned Index) {
- AttributeSet AS = AL.getAttributes(Index);
- for (const auto &A : AS)
- addAttribute(A);
-}
-
-AttrBuilder::AttrBuilder(AttributeSet AS) {
- for (const auto &A : AS)
- addAttribute(A);
+AttrBuilder::AttrBuilder(LLVMContext &Ctx, AttributeSet AS) : Ctx(Ctx) {
+ append_range(Attrs, AS);
+ assert(is_sorted(Attrs) && "AttributeSet should be sorted");
}
-void AttrBuilder::clear() {
- Attrs.reset();
- TargetDepAttrs.clear();
- IntAttrs = {};
- TypeAttrs = {};
-}
+void AttrBuilder::clear() { Attrs.clear(); }
-Optional<unsigned>
-AttrBuilder::kindToIntIndex(Attribute::AttrKind Kind) const {
- if (Attribute::isIntAttrKind(Kind))
- return Kind - Attribute::FirstIntAttr;
- return None;
-}
+/// Attribute comparator that only compares attribute keys. Enum attributes are
+/// sorted before string attributes.
+struct AttributeComparator {
+ bool operator()(Attribute A0, Attribute A1) const {
+ bool A0IsString = A0.isStringAttribute();
+ bool A1IsString = A1.isStringAttribute();
+ if (A0IsString) {
+ if (A1IsString)
+ return A0.getKindAsString() < A1.getKindAsString();
+ else
+ return false;
+ }
+ if (A1IsString)
+ return true;
+ return A0.getKindAsEnum() < A1.getKindAsEnum();
+ }
+ bool operator()(Attribute A0, Attribute::AttrKind Kind) const {
+ if (A0.isStringAttribute())
+ return false;
+ return A0.getKindAsEnum() < Kind;
+ }
+ bool operator()(Attribute A0, StringRef Kind) const {
+ if (A0.isStringAttribute())
+ return A0.getKindAsString() < Kind;
+ return true;
+ }
+};
-Optional<unsigned>
-AttrBuilder::kindToTypeIndex(Attribute::AttrKind Kind) const {
- if (Attribute::isTypeAttrKind(Kind))
- return Kind - Attribute::FirstTypeAttr;
- return None;
+template <typename K>
+static void addAttributeImpl(SmallVectorImpl<Attribute> &Attrs, K Kind,
+ Attribute Attr) {
+ auto It = lower_bound(Attrs, Kind, AttributeComparator());
+ if (It != Attrs.end() && It->hasAttribute(Kind))
+ std::swap(*It, Attr);
+ else
+ Attrs.insert(It, Attr);
}
AttrBuilder &AttrBuilder::addAttribute(Attribute Attr) {
- if (Attr.isStringAttribute()) {
- addAttribute(Attr.getKindAsString(), Attr.getValueAsString());
- return *this;
- }
-
- Attribute::AttrKind Kind = Attr.getKindAsEnum();
- Attrs[Kind] = true;
-
- if (Optional<unsigned> TypeIndex = kindToTypeIndex(Kind))
- TypeAttrs[*TypeIndex] = Attr.getValueAsType();
- else if (Optional<unsigned> IntIndex = kindToIntIndex(Kind))
- IntAttrs[*IntIndex] = Attr.getValueAsInt();
+ if (Attr.isStringAttribute())
+ addAttributeImpl(Attrs, Attr.getKindAsString(), Attr);
+ else
+ addAttributeImpl(Attrs, Attr.getKindAsEnum(), Attr);
+ return *this;
+}
+AttrBuilder &AttrBuilder::addAttribute(Attribute::AttrKind Kind) {
+ addAttributeImpl(Attrs, Kind, Attribute::get(Ctx, Kind));
return *this;
}
AttrBuilder &AttrBuilder::addAttribute(StringRef A, StringRef V) {
- TargetDepAttrs[A] = V;
+ addAttributeImpl(Attrs, A, Attribute::get(Ctx, A, V));
return *this;
}
AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) {
assert((unsigned)Val < Attribute::EndAttrKinds && "Attribute out of range!");
- Attrs[Val] = false;
-
- if (Optional<unsigned> TypeIndex = kindToTypeIndex(Val))
- TypeAttrs[*TypeIndex] = nullptr;
- else if (Optional<unsigned> IntIndex = kindToIntIndex(Val))
- IntAttrs[*IntIndex] = 0;
-
- return *this;
-}
-
-AttrBuilder &AttrBuilder::removeAttributes(AttributeList A, uint64_t Index) {
- remove(A.getAttributes(Index));
+ auto It = lower_bound(Attrs, Val, AttributeComparator());
+ if (It != Attrs.end() && It->hasAttribute(Val))
+ Attrs.erase(It);
return *this;
}
AttrBuilder &AttrBuilder::removeAttribute(StringRef A) {
- TargetDepAttrs.erase(A);
+ auto It = lower_bound(Attrs, A, AttributeComparator());
+ if (It != Attrs.end() && It->hasAttribute(A))
+ Attrs.erase(It);
return *this;
}
uint64_t AttrBuilder::getRawIntAttr(Attribute::AttrKind Kind) const {
- Optional<unsigned> IntIndex = kindToIntIndex(Kind);
- assert(IntIndex && "Not an int attribute");
- return IntAttrs[*IntIndex];
+ assert(Attribute::isIntAttrKind(Kind) && "Not an int attribute");
+ Attribute A = getAttribute(Kind);
+ return A.isValid() ? A.getValueAsInt() : 0;
}
AttrBuilder &AttrBuilder::addRawIntAttr(Attribute::AttrKind Kind,
uint64_t Value) {
- Optional<unsigned> IntIndex = kindToIntIndex(Kind);
- assert(IntIndex && "Not an int attribute");
- assert(Value && "Value cannot be zero");
- Attrs[Kind] = true;
- IntAttrs[*IntIndex] = Value;
- return *this;
+ return addAttribute(Attribute::get(Ctx, Kind, Value));
}
std::pair<unsigned, Optional<unsigned>> AttrBuilder::getAllocSizeArgs() const {
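The rewrite above drops AttrBuilder's bitset plus int/type side tables in favor of a single SmallVector of Attribute objects kept sorted by key; insertion, lookup and removal all go through lower_bound with AttributeComparator, which orders enum kinds before string kinds. The same insert-or-replace idiom, reduced to standard C++ as a self-contained illustration (the Entry type and insertOrReplace name are invented for the sketch):

  #include <algorithm>
  #include <string>
  #include <utility>
  #include <vector>

  using Entry = std::pair<int, std::string>; // key -> payload, sorted by key

  void insertOrReplace(std::vector<Entry> &Entries, Entry E) {
    auto It = std::lower_bound(
        Entries.begin(), Entries.end(), E.first,
        [](const Entry &LHS, int Key) { return LHS.first < Key; });
    if (It != Entries.end() && It->first == E.first)
      *It = std::move(E);               // existing key: overwrite in place
    else
      Entries.insert(It, std::move(E)); // new key: insert at the sorted spot
  }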
@@ -1709,17 +1672,13 @@ AttrBuilder &AttrBuilder::addVScaleRangeAttrFromRawRepr(uint64_t RawArgs) {
}
Type *AttrBuilder::getTypeAttr(Attribute::AttrKind Kind) const {
- Optional<unsigned> TypeIndex = kindToTypeIndex(Kind);
- assert(TypeIndex && "Not a type attribute");
- return TypeAttrs[*TypeIndex];
+ assert(Attribute::isTypeAttrKind(Kind) && "Not a type attribute");
+ Attribute A = getAttribute(Kind);
+ return A.isValid() ? A.getValueAsType() : nullptr;
}
AttrBuilder &AttrBuilder::addTypeAttr(Attribute::AttrKind Kind, Type *Ty) {
- Optional<unsigned> TypeIndex = kindToTypeIndex(Kind);
- assert(TypeIndex && "Not a type attribute");
- Attrs[Kind] = true;
- TypeAttrs[*TypeIndex] = Ty;
- return *this;
+ return addAttribute(Attribute::get(Ctx, Kind, Ty));
}
AttrBuilder &AttrBuilder::addByValAttr(Type *Ty) {
@@ -1743,76 +1702,43 @@ AttrBuilder &AttrBuilder::addInAllocaAttr(Type *Ty) {
}
AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) {
- // FIXME: What if both have an int/type attribute, but they don't match?!
- for (unsigned Index = 0; Index < Attribute::NumIntAttrKinds; ++Index)
- if (!IntAttrs[Index])
- IntAttrs[Index] = B.IntAttrs[Index];
-
- for (unsigned Index = 0; Index < Attribute::NumTypeAttrKinds; ++Index)
- if (!TypeAttrs[Index])
- TypeAttrs[Index] = B.TypeAttrs[Index];
-
- Attrs |= B.Attrs;
-
- for (const auto &I : B.td_attrs())
- TargetDepAttrs[I.first] = I.second;
+ // TODO: Could make this O(n) as we're merging two sorted lists.
+ for (const auto &I : B.attrs())
+ addAttribute(I);
return *this;
}
-AttrBuilder &AttrBuilder::remove(const AttrBuilder &B) {
- // FIXME: What if both have an int/type attribute, but they don't match?!
- for (unsigned Index = 0; Index < Attribute::NumIntAttrKinds; ++Index)
- if (B.IntAttrs[Index])
- IntAttrs[Index] = 0;
-
- for (unsigned Index = 0; Index < Attribute::NumTypeAttrKinds; ++Index)
- if (B.TypeAttrs[Index])
- TypeAttrs[Index] = nullptr;
-
- Attrs &= ~B.Attrs;
-
- for (const auto &I : B.td_attrs())
- TargetDepAttrs.erase(I.first);
-
+AttrBuilder &AttrBuilder::remove(const AttributeMask &AM) {
+ erase_if(Attrs, [&](Attribute A) { return AM.contains(A); });
return *this;
}
-bool AttrBuilder::overlaps(const AttrBuilder &B) const {
- // First check if any of the target independent attributes overlap.
- if ((Attrs & B.Attrs).any())
- return true;
-
- // Then check if any target dependent ones do.
- for (const auto &I : td_attrs())
- if (B.contains(I.first))
- return true;
-
- return false;
+bool AttrBuilder::overlaps(const AttributeMask &AM) const {
+ return any_of(Attrs, [&](Attribute A) { return AM.contains(A); });
}
-bool AttrBuilder::contains(StringRef A) const {
- return TargetDepAttrs.find(A) != TargetDepAttrs.end();
+Attribute AttrBuilder::getAttribute(Attribute::AttrKind A) const {
+ assert((unsigned)A < Attribute::EndAttrKinds && "Attribute out of range!");
+ auto It = lower_bound(Attrs, A, AttributeComparator());
+ if (It != Attrs.end() && It->hasAttribute(A))
+ return *It;
+ return {};
}
-bool AttrBuilder::hasAttributes() const {
- return !Attrs.none() || !TargetDepAttrs.empty();
+Attribute AttrBuilder::getAttribute(StringRef A) const {
+ auto It = lower_bound(Attrs, A, AttributeComparator());
+ if (It != Attrs.end() && It->hasAttribute(A))
+ return *It;
+ return {};
}
-bool AttrBuilder::hasAttributes(AttributeList AL, uint64_t Index) const {
- AttributeSet AS = AL.getAttributes(Index);
-
- for (const auto &Attr : AS) {
- if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
- if (contains(Attr.getKindAsEnum()))
- return true;
- } else {
- assert(Attr.isStringAttribute() && "Invalid attribute kind!");
- return contains(Attr.getKindAsString());
- }
- }
+bool AttrBuilder::contains(Attribute::AttrKind A) const {
+ return getAttribute(A).isValid();
+}
- return false;
+bool AttrBuilder::contains(StringRef A) const {
+ return getAttribute(A).isValid();
}
bool AttrBuilder::hasAlignmentAttr() const {
@@ -1820,14 +1746,7 @@ bool AttrBuilder::hasAlignmentAttr() const {
}
bool AttrBuilder::operator==(const AttrBuilder &B) const {
- if (Attrs != B.Attrs)
- return false;
-
- for (const auto &TDA : TargetDepAttrs)
- if (B.TargetDepAttrs.find(TDA.first) == B.TargetDepAttrs.end())
- return false;
-
- return IntAttrs == B.IntAttrs && TypeAttrs == B.TypeAttrs;
+ return Attrs == B.Attrs;
}
//===----------------------------------------------------------------------===//
@@ -1835,16 +1754,16 @@ bool AttrBuilder::operator==(const AttrBuilder &B) const {
//===----------------------------------------------------------------------===//
/// Which attributes cannot be applied to a type.
-AttrBuilder AttributeFuncs::typeIncompatible(Type *Ty) {
- AttrBuilder Incompatible;
+AttributeMask AttributeFuncs::typeIncompatible(Type *Ty) {
+ AttributeMask Incompatible;
if (!Ty->isIntegerTy())
- // Attribute that only apply to integers.
+ // Attributes that only apply to integers.
Incompatible.addAttribute(Attribute::SExt)
.addAttribute(Attribute::ZExt);
if (!Ty->isPointerTy())
- // Attribute that only apply to pointers.
+ // Attributes that only apply to pointers.
Incompatible.addAttribute(Attribute::Nest)
.addAttribute(Attribute::NoAlias)
.addAttribute(Attribute::NoCapture)
@@ -1852,15 +1771,18 @@ AttrBuilder AttributeFuncs::typeIncompatible(Type *Ty) {
.addAttribute(Attribute::ReadNone)
.addAttribute(Attribute::ReadOnly)
.addAttribute(Attribute::SwiftError)
- .addAlignmentAttr(1) // the int here is ignored
- .addDereferenceableAttr(1) // the int here is ignored
- .addDereferenceableOrNullAttr(1) // the int here is ignored
- .addPreallocatedAttr(Ty)
- .addInAllocaAttr(Ty)
- .addByValAttr(Ty)
- .addStructRetAttr(Ty)
- .addByRefAttr(Ty)
- .addTypeAttr(Attribute::ElementType, Ty);
+ .addAttribute(Attribute::Dereferenceable)
+ .addAttribute(Attribute::DereferenceableOrNull)
+ .addAttribute(Attribute::Preallocated)
+ .addAttribute(Attribute::InAlloca)
+ .addAttribute(Attribute::ByVal)
+ .addAttribute(Attribute::StructRet)
+ .addAttribute(Attribute::ByRef)
+ .addAttribute(Attribute::ElementType);
+
+ if (!Ty->isPtrOrPtrVectorTy())
+ // Attributes that only apply to pointers or vectors of pointers.
+ Incompatible.addAttribute(Attribute::Alignment);
// Some attributes can apply to all "values" but there are no `void` values.
if (Ty->isVoidTy())
@@ -1869,12 +1791,12 @@ AttrBuilder AttributeFuncs::typeIncompatible(Type *Ty) {
return Incompatible;
}
-AttrBuilder AttributeFuncs::getUBImplyingAttributes() {
- AttrBuilder B;
- B.addAttribute(Attribute::NoUndef);
- B.addDereferenceableAttr(1);
- B.addDereferenceableOrNullAttr(1);
- return B;
+AttributeMask AttributeFuncs::getUBImplyingAttributes() {
+ AttributeMask AM;
+ AM.addAttribute(Attribute::NoUndef);
+ AM.addAttribute(Attribute::Dereferenceable);
+ AM.addAttribute(Attribute::DereferenceableOrNull);
+ return AM;
}
template<typename AttrClass>
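From this point the set of attributes to strip is an AttributeMask rather than a full AttrBuilder: a mask only records attribute kinds, so typeIncompatible and getUBImplyingAttributes no longer need dummy payloads such as addDereferenceableAttr(1). A hedged sketch of applying a mask to an existing AttributeSet; Ctx and AS are assumed to exist:

  AttributeMask Mask;
  Mask.addAttribute(Attribute::NoUndef);
  Mask.addAttribute(Attribute::Dereferenceable);
  Mask.addAttribute(Attribute::DereferenceableOrNull);
  AttributeSet Cleaned = AS.removeAttributes(Ctx, Mask);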
@@ -1910,10 +1832,16 @@ static void setOR(Function &Caller, const Function &Callee) {
/// If the inlined function had a higher stack protection level than the
/// calling function, then bump up the caller's stack protection level.
static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) {
+ // If the calling function has *no* stack protection level (e.g. it was built
+ // with Clang's -fno-stack-protector or no_stack_protector attribute), don't
+ // change it as that could change the program's semantics.
+ if (!Caller.hasStackProtectorFnAttr())
+ return;
+
// If upgrading the SSP attribute, clear out the old SSP Attributes first.
// Having multiple SSP attributes doesn't actually hurt, but it adds useless
// clutter to the IR.
- AttrBuilder OldSSPAttr;
+ AttributeMask OldSSPAttr;
OldSSPAttr.addAttribute(Attribute::StackProtect)
.addAttribute(Attribute::StackProtectStrong)
.addAttribute(Attribute::StackProtectReq);
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index b8ad2b294b87..45459e200b3d 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -727,6 +727,13 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
Name == "arm.cde.vcx3qa.predicated.v2i64.v4i1")
return true;
+ if (Name == "amdgcn.alignbit") {
+ // This target-specific intrinsic became redundant with the generic fshr.
+ NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::fshr,
+ {F->getReturnType()});
+ return true;
+ }
+
break;
}
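The new upgrade path retires llvm.amdgcn.alignbit in favor of the generic funnel-shift-right intrinsic, which computes the same result on i32 operands. A hedged sketch of emitting the replacement by hand with IRBuilder; Builder and the i32 operands A, B and C are assumptions:

  // alignbit(a, b, c) produces the same bits as fshr(a, b, c) for i32.
  Value *Replacement =
      Builder.CreateIntrinsic(Intrinsic::fshr, {A->getType()}, {A, B, C});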
@@ -4488,7 +4495,7 @@ void llvm::UpgradeFunctionAttributes(Function &F) {
if (F.getCallingConv() == CallingConv::X86_INTR &&
!F.arg_empty() && !F.hasParamAttribute(0, Attribute::ByVal)) {
- Type *ByValTy = cast<PointerType>(F.getArg(0)->getType())->getElementType();
+ Type *ByValTy = F.getArg(0)->getType()->getPointerElementType();
Attribute NewAttr = Attribute::getWithByValType(F.getContext(), ByValTy);
F.addParamAttr(0, NewAttr);
}
@@ -4569,27 +4576,39 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
return DL.empty() ? std::string("G1") : (DL + "-G1").str();
}
+ std::string Res = DL.str();
+ if (!T.isX86())
+ return Res;
+
+ // If the datalayout matches the expected format, add pointer size address
+ // spaces to the datalayout.
std::string AddrSpaces = "-p270:32:32-p271:32:32-p272:64:64";
- // If X86, and the datalayout matches the expected format, add pointer size
- // address spaces to the datalayout.
- if (!T.isX86() || DL.contains(AddrSpaces))
- return std::string(DL);
+ if (!DL.contains(AddrSpaces)) {
+ SmallVector<StringRef, 4> Groups;
+ Regex R("(e-m:[a-z](-p:32:32)?)(-[if]64:.*$)");
+ if (R.match(DL, &Groups))
+ Res = (Groups[1] + AddrSpaces + Groups[3]).str();
+ }
- SmallVector<StringRef, 4> Groups;
- Regex R("(e-m:[a-z](-p:32:32)?)(-[if]64:.*$)");
- if (!R.match(DL, &Groups))
- return std::string(DL);
+ // For 32-bit MSVC targets, raise the alignment of f80 values to 16 bytes.
+ // Raising the alignment is safe because Clang did not produce f80 values in
+ // the MSVC environment before this upgrade was added.
+ if (T.isWindowsMSVCEnvironment() && !T.isArch64Bit()) {
+ StringRef Ref = Res;
+ auto I = Ref.find("-f80:32-");
+ if (I != StringRef::npos)
+ Res = (Ref.take_front(I) + "-f80:128-" + Ref.drop_front(I + 8)).str();
+ }
- return (Groups[1] + AddrSpaces + Groups[3]).str();
+ return Res;
}
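UpgradeDataLayoutString now applies both X86 fix-ups in one pass: it splices the p270/p271/p272 pointer address spaces into layouts that predate them, and for 32-bit MSVC targets it raises f80 alignment from 32 to 128 bits. A worked example of the transformation; the input string is an assumed pre-upgrade i686 MSVC layout used purely for illustration:

  // In:  e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32
  // Out: e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32-a:0:32-S32
  std::string Upgraded = llvm::UpgradeDataLayoutString(
      "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32",
      "i686-pc-windows-msvc");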
void llvm::UpgradeAttributes(AttrBuilder &B) {
StringRef FramePointer;
- if (B.contains("no-frame-pointer-elim")) {
+ Attribute A = B.getAttribute("no-frame-pointer-elim");
+ if (A.isValid()) {
// The value can be "true" or "false".
- for (const auto &I : B.td_attrs())
- if (I.first == "no-frame-pointer-elim")
- FramePointer = I.second == "true" ? "all" : "none";
+ FramePointer = A.getValueAsString() == "true" ? "all" : "none";
B.removeAttribute("no-frame-pointer-elim");
}
if (B.contains("no-frame-pointer-elim-non-leaf")) {
@@ -4601,12 +4620,10 @@ void llvm::UpgradeAttributes(AttrBuilder &B) {
if (!FramePointer.empty())
B.addAttribute("frame-pointer", FramePointer);
- if (B.contains("null-pointer-is-valid")) {
+ A = B.getAttribute("null-pointer-is-valid");
+ if (A.isValid()) {
// The value can be "true" or "false".
- bool NullPointerIsValid = false;
- for (const auto &I : B.td_attrs())
- if (I.first == "null-pointer-is-valid")
- NullPointerIsValid = I.second == "true";
+ bool NullPointerIsValid = A.getValueAsString() == "true";
B.removeAttribute("null-pointer-is-valid");
if (NullPointerIsValid)
B.addAttribute(Attribute::NullPointerIsValid);
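UpgradeAttributes still rewrites the legacy string attributes, but the new AttrBuilder::getAttribute accessor lets it read a value directly instead of scanning td_attrs(). A hedged before/after sketch of the frame-pointer upgrade, assuming Ctx is an existing LLVMContext:

  AttrBuilder B(Ctx);
  B.addAttribute("no-frame-pointer-elim", "true");
  llvm::UpgradeAttributes(B);
  // The legacy key is gone and "frame-pointer"="all" has taken its place.
  assert(B.getAttribute("frame-pointer").getValueAsString() == "all");
  assert(!B.contains("no-frame-pointer-elim"));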
diff --git a/llvm/lib/IR/Comdat.cpp b/llvm/lib/IR/Comdat.cpp
index 1a5d38d17bc0..90d5c6e82e5c 100644
--- a/llvm/lib/IR/Comdat.cpp
+++ b/llvm/lib/IR/Comdat.cpp
@@ -25,6 +25,10 @@ Comdat::Comdat() = default;
StringRef Comdat::getName() const { return Name->first(); }
+void Comdat::addUser(GlobalObject *GO) { Users.insert(GO); }
+
+void Comdat::removeUser(GlobalObject *GO) { Users.erase(GO); }
+
LLVMComdatRef LLVMGetOrInsertComdat(LLVMModuleRef M, const char *Name) {
return wrap(unwrap(M)->getOrInsertComdat(Name));
}
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 8668fe82601c..622a984be22c 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -119,21 +119,21 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) {
if (PointerType *DPTy = dyn_cast<PointerType>(DestTy))
if (PTy->getAddressSpace() == DPTy->getAddressSpace() &&
!PTy->isOpaque() && !DPTy->isOpaque() &&
- PTy->getElementType()->isSized()) {
+ PTy->getNonOpaquePointerElementType()->isSized()) {
SmallVector<Value*, 8> IdxList;
Value *Zero =
Constant::getNullValue(Type::getInt32Ty(DPTy->getContext()));
IdxList.push_back(Zero);
- Type *ElTy = PTy->getElementType();
- while (ElTy && ElTy != DPTy->getElementType()) {
+ Type *ElTy = PTy->getNonOpaquePointerElementType();
+ while (ElTy && ElTy != DPTy->getNonOpaquePointerElementType()) {
ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, (uint64_t)0);
IdxList.push_back(Zero);
}
- if (ElTy == DPTy->getElementType())
+ if (ElTy == DPTy->getNonOpaquePointerElementType())
// This GEP is inbounds because all indices are zero.
- return ConstantExpr::getInBoundsGetElementPtr(PTy->getElementType(),
- V, IdxList);
+ return ConstantExpr::getInBoundsGetElementPtr(
+ PTy->getNonOpaquePointerElementType(), V, IdxList);
}
// Handle casts from one vector constant to another. We know that the src
@@ -1299,63 +1299,6 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
return nullptr;
}
-/// This type is zero-sized if it's an array or structure of zero-sized types.
-/// The only leaf zero-sized type is an empty structure.
-static bool isMaybeZeroSizedType(Type *Ty) {
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- if (STy->isOpaque()) return true; // Can't say.
-
- // If all of elements have zero size, this does too.
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- if (!isMaybeZeroSizedType(STy->getElementType(i))) return false;
- return true;
-
- } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- return isMaybeZeroSizedType(ATy->getElementType());
- }
- return false;
-}
-
-/// Compare the two constants as though they were getelementptr indices.
-/// This allows coercion of the types to be the same thing.
-///
-/// If the two constants are the "same" (after coercion), return 0. If the
-/// first is less than the second, return -1, if the second is less than the
-/// first, return 1. If the constants are not integral, return -2.
-///
-static int IdxCompare(Constant *C1, Constant *C2, Type *ElTy) {
- if (C1 == C2) return 0;
-
- // Ok, we found a different index. If they are not ConstantInt, we can't do
- // anything with them.
- if (!isa<ConstantInt>(C1) || !isa<ConstantInt>(C2))
- return -2; // don't know!
-
- // We cannot compare the indices if they don't fit in an int64_t.
- if (cast<ConstantInt>(C1)->getValue().getActiveBits() > 64 ||
- cast<ConstantInt>(C2)->getValue().getActiveBits() > 64)
- return -2; // don't know!
-
- // Ok, we have two differing integer indices. Sign extend them to be the same
- // type.
- int64_t C1Val = cast<ConstantInt>(C1)->getSExtValue();
- int64_t C2Val = cast<ConstantInt>(C2)->getSExtValue();
-
- if (C1Val == C2Val) return 0; // They are equal
-
- // If the type being indexed over is really just a zero sized type, there is
- // no pointer difference being made here.
- if (isMaybeZeroSizedType(ElTy))
- return -2; // dunno.
-
- // If they are really different, now that they are the same type, then we
- // found a difference!
- if (C1Val < C2Val)
- return -1;
- else
- return 1;
-}
-
/// This function determines if there is anything we can decide about the two
/// constants provided. This doesn't need to handle simple things like
/// ConstantFP comparisons, but should instead handle ConstantExprs.
@@ -1594,103 +1537,28 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0)) {
// If its not weak linkage, the GVal must have a non-zero address
// so the result is greater-than
- if (!GV->hasExternalWeakLinkage())
+ if (!GV->hasExternalWeakLinkage() && CE1GEP->isInBounds())
return ICmpInst::ICMP_UGT;
- } else if (isa<ConstantPointerNull>(CE1Op0)) {
- // If we are indexing from a null pointer, check to see if we have any
- // non-zero indices.
- for (unsigned i = 1, e = CE1->getNumOperands(); i != e; ++i)
- if (!CE1->getOperand(i)->isNullValue())
- // Offsetting from null, must not be equal.
- return ICmpInst::ICMP_UGT;
- // Only zero indexes from null, must still be zero.
- return ICmpInst::ICMP_EQ;
}
- // Otherwise, we can't really say if the first operand is null or not.
} else if (const GlobalValue *GV2 = dyn_cast<GlobalValue>(V2)) {
- if (isa<ConstantPointerNull>(CE1Op0)) {
- // If its not weak linkage, the GVal must have a non-zero address
- // so the result is less-than
- if (!GV2->hasExternalWeakLinkage())
- return ICmpInst::ICMP_ULT;
- } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0)) {
- if (GV == GV2) {
- // If this is a getelementptr of the same global, then it must be
- // different. Because the types must match, the getelementptr could
- // only have at most one index, and because we fold getelementptr's
- // with a single zero index, it must be nonzero.
- assert(CE1->getNumOperands() == 2 &&
- !CE1->getOperand(1)->isNullValue() &&
- "Surprising getelementptr!");
- return ICmpInst::ICMP_UGT;
- } else {
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0)) {
+ if (GV != GV2) {
if (CE1GEP->hasAllZeroIndices())
return areGlobalsPotentiallyEqual(GV, GV2);
return ICmpInst::BAD_ICMP_PREDICATE;
}
}
- } else {
- ConstantExpr *CE2 = cast<ConstantExpr>(V2);
- Constant *CE2Op0 = CE2->getOperand(0);
-
- // There are MANY other foldings that we could perform here. They will
- // probably be added on demand, as they seem needed.
- switch (CE2->getOpcode()) {
- default: break;
- case Instruction::GetElementPtr:
- // By far the most common case to handle is when the base pointers are
- // obviously to the same global.
- if (isa<GlobalValue>(CE1Op0) && isa<GlobalValue>(CE2Op0)) {
- // Don't know relative ordering, but check for inequality.
- if (CE1Op0 != CE2Op0) {
- GEPOperator *CE2GEP = cast<GEPOperator>(CE2);
- if (CE1GEP->hasAllZeroIndices() && CE2GEP->hasAllZeroIndices())
- return areGlobalsPotentiallyEqual(cast<GlobalValue>(CE1Op0),
- cast<GlobalValue>(CE2Op0));
- return ICmpInst::BAD_ICMP_PREDICATE;
- }
- // Ok, we know that both getelementptr instructions are based on the
- // same global. From this, we can precisely determine the relative
- // ordering of the resultant pointers.
- unsigned i = 1;
-
- // The logic below assumes that the result of the comparison
- // can be determined by finding the first index that differs.
- // This doesn't work if there is over-indexing in any
- // subsequent indices, so check for that case first.
- if (!CE1->isGEPWithNoNotionalOverIndexing() ||
- !CE2->isGEPWithNoNotionalOverIndexing())
- return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal.
-
- // Compare all of the operands the GEP's have in common.
- gep_type_iterator GTI = gep_type_begin(CE1);
- for (;i != CE1->getNumOperands() && i != CE2->getNumOperands();
- ++i, ++GTI)
- switch (IdxCompare(CE1->getOperand(i),
- CE2->getOperand(i), GTI.getIndexedType())) {
- case -1: return isSigned ? ICmpInst::ICMP_SLT:ICmpInst::ICMP_ULT;
- case 1: return isSigned ? ICmpInst::ICMP_SGT:ICmpInst::ICMP_UGT;
- case -2: return ICmpInst::BAD_ICMP_PREDICATE;
- }
-
- // Ok, we ran out of things they have in common. If any leftovers
- // are non-zero then we have a difference, otherwise we are equal.
- for (; i < CE1->getNumOperands(); ++i)
- if (!CE1->getOperand(i)->isNullValue()) {
- if (isa<ConstantInt>(CE1->getOperand(i)))
- return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
- else
- return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal.
- }
-
- for (; i < CE2->getNumOperands(); ++i)
- if (!CE2->getOperand(i)->isNullValue()) {
- if (isa<ConstantInt>(CE2->getOperand(i)))
- return isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
- else
- return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal.
- }
- return ICmpInst::ICMP_EQ;
+ } else if (const auto *CE2GEP = dyn_cast<GEPOperator>(V2)) {
+ // By far the most common case to handle is when the base pointers are
+ // obviously to the same global.
+ const Constant *CE2Op0 = cast<Constant>(CE2GEP->getPointerOperand());
+ if (isa<GlobalValue>(CE1Op0) && isa<GlobalValue>(CE2Op0)) {
+ // Don't know relative ordering, but check for inequality.
+ if (CE1Op0 != CE2Op0) {
+ if (CE1GEP->hasAllZeroIndices() && CE2GEP->hasAllZeroIndices())
+ return areGlobalsPotentiallyEqual(cast<GlobalValue>(CE1Op0),
+ cast<GlobalValue>(CE2Op0));
+ return ICmpInst::BAD_ICMP_PREDICATE;
}
}
}
@@ -1704,7 +1572,7 @@ static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
return ICmpInst::BAD_ICMP_PREDICATE;
}
-Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
+Constant *llvm::ConstantFoldCompareInstruction(CmpInst::Predicate Predicate,
Constant *C1, Constant *C2) {
Type *ResultTy;
if (VectorType *VT = dyn_cast<VectorType>(C1->getType()))
@@ -1714,10 +1582,10 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
ResultTy = Type::getInt1Ty(C1->getContext());
// Fold FCMP_FALSE/FCMP_TRUE unconditionally.
- if (pred == FCmpInst::FCMP_FALSE)
+ if (Predicate == FCmpInst::FCMP_FALSE)
return Constant::getNullValue(ResultTy);
- if (pred == FCmpInst::FCMP_TRUE)
+ if (Predicate == FCmpInst::FCMP_TRUE)
return Constant::getAllOnesValue(ResultTy);
// Handle some degenerate cases first
@@ -1725,7 +1593,6 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
return PoisonValue::get(ResultTy);
if (isa<UndefValue>(C1) || isa<UndefValue>(C2)) {
- CmpInst::Predicate Predicate = CmpInst::Predicate(pred);
bool isIntegerPredicate = ICmpInst::isIntPredicate(Predicate);
// For EQ and NE, we can always pick a value for the undef to make the
// predicate pass or fail, so we can return undef.
@@ -1750,9 +1617,9 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage() &&
!NullPointerIsDefined(nullptr /* F */,
GV->getType()->getAddressSpace())) {
- if (pred == ICmpInst::ICMP_EQ)
+ if (Predicate == ICmpInst::ICMP_EQ)
return ConstantInt::getFalse(C1->getContext());
- else if (pred == ICmpInst::ICMP_NE)
+ else if (Predicate == ICmpInst::ICMP_NE)
return ConstantInt::getTrue(C1->getContext());
}
// icmp eq/ne(GV,null) -> false/true
@@ -1762,9 +1629,9 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage() &&
!NullPointerIsDefined(nullptr /* F */,
GV->getType()->getAddressSpace())) {
- if (pred == ICmpInst::ICMP_EQ)
+ if (Predicate == ICmpInst::ICMP_EQ)
return ConstantInt::getFalse(C1->getContext());
- else if (pred == ICmpInst::ICMP_NE)
+ else if (Predicate == ICmpInst::ICMP_NE)
return ConstantInt::getTrue(C1->getContext());
}
}
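The predicate plumbing above is a type change only; the null-comparison folds still fire, so an icmp of a non-weak, non-alias global against null in an address space where null is not a valid object address still folds to a constant. A small hedged sketch, assuming M already holds a non-weak global named g in address space 0:

  GlobalVariable *G = M.getNamedGlobal("g");
  Constant *Null = Constant::getNullValue(G->getType());
  // Folds to i1 true: the global is known to have a non-null address.
  Constant *NE = ConstantExpr::getICmp(ICmpInst::ICMP_NE, G, Null);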
@@ -1772,16 +1639,16 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
// The caller is expected to commute the operands if the constant expression
// is C2.
// C1 >= 0 --> true
- if (pred == ICmpInst::ICMP_UGE)
+ if (Predicate == ICmpInst::ICMP_UGE)
return Constant::getAllOnesValue(ResultTy);
// C1 < 0 --> false
- if (pred == ICmpInst::ICMP_ULT)
+ if (Predicate == ICmpInst::ICMP_ULT)
return Constant::getNullValue(ResultTy);
}
// If the comparison is a comparison between two i1's, simplify it.
if (C1->getType()->isIntegerTy(1)) {
- switch(pred) {
+ switch (Predicate) {
case ICmpInst::ICMP_EQ:
if (isa<ConstantInt>(C2))
return ConstantExpr::getXor(C1, ConstantExpr::getNot(C2));
@@ -1796,12 +1663,10 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
if (isa<ConstantInt>(C1) && isa<ConstantInt>(C2)) {
const APInt &V1 = cast<ConstantInt>(C1)->getValue();
const APInt &V2 = cast<ConstantInt>(C2)->getValue();
- return ConstantInt::get(
- ResultTy, ICmpInst::compare(V1, V2, (ICmpInst::Predicate)pred));
+ return ConstantInt::get(ResultTy, ICmpInst::compare(V1, V2, Predicate));
} else if (isa<ConstantFP>(C1) && isa<ConstantFP>(C2)) {
const APFloat &C1V = cast<ConstantFP>(C1)->getValueAPF();
const APFloat &C2V = cast<ConstantFP>(C2)->getValueAPF();
- CmpInst::Predicate Predicate = CmpInst::Predicate(pred);
return ConstantInt::get(ResultTy, FCmpInst::compare(C1V, C2V, Predicate));
} else if (auto *C1VTy = dyn_cast<VectorType>(C1->getType())) {
@@ -1810,7 +1675,7 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
if (Constant *C2Splat = C2->getSplatValue())
return ConstantVector::getSplat(
C1VTy->getElementCount(),
- ConstantExpr::getCompare(pred, C1Splat, C2Splat));
+ ConstantExpr::getCompare(Predicate, C1Splat, C2Splat));
// Do not iterate on scalable vector. The number of elements is unknown at
// compile-time.
@@ -1829,7 +1694,7 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
Constant *C2E =
ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, I));
- ResElts.push_back(ConstantExpr::getCompare(pred, C1E, C2E));
+ ResElts.push_back(ConstantExpr::getCompare(Predicate, C1E, C2E));
}
return ConstantVector::get(ResElts);
@@ -1854,46 +1719,52 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
case FCmpInst::BAD_FCMP_PREDICATE:
break; // Couldn't determine anything about these constants.
case FCmpInst::FCMP_OEQ: // We know that C1 == C2
- Result = (pred == FCmpInst::FCMP_UEQ || pred == FCmpInst::FCMP_OEQ ||
- pred == FCmpInst::FCMP_ULE || pred == FCmpInst::FCMP_OLE ||
- pred == FCmpInst::FCMP_UGE || pred == FCmpInst::FCMP_OGE);
+ Result =
+ (Predicate == FCmpInst::FCMP_UEQ || Predicate == FCmpInst::FCMP_OEQ ||
+ Predicate == FCmpInst::FCMP_ULE || Predicate == FCmpInst::FCMP_OLE ||
+ Predicate == FCmpInst::FCMP_UGE || Predicate == FCmpInst::FCMP_OGE);
break;
case FCmpInst::FCMP_OLT: // We know that C1 < C2
- Result = (pred == FCmpInst::FCMP_UNE || pred == FCmpInst::FCMP_ONE ||
- pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT ||
- pred == FCmpInst::FCMP_ULE || pred == FCmpInst::FCMP_OLE);
+ Result =
+ (Predicate == FCmpInst::FCMP_UNE || Predicate == FCmpInst::FCMP_ONE ||
+ Predicate == FCmpInst::FCMP_ULT || Predicate == FCmpInst::FCMP_OLT ||
+ Predicate == FCmpInst::FCMP_ULE || Predicate == FCmpInst::FCMP_OLE);
break;
case FCmpInst::FCMP_OGT: // We know that C1 > C2
- Result = (pred == FCmpInst::FCMP_UNE || pred == FCmpInst::FCMP_ONE ||
- pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT ||
- pred == FCmpInst::FCMP_UGE || pred == FCmpInst::FCMP_OGE);
+ Result =
+ (Predicate == FCmpInst::FCMP_UNE || Predicate == FCmpInst::FCMP_ONE ||
+ Predicate == FCmpInst::FCMP_UGT || Predicate == FCmpInst::FCMP_OGT ||
+ Predicate == FCmpInst::FCMP_UGE || Predicate == FCmpInst::FCMP_OGE);
break;
case FCmpInst::FCMP_OLE: // We know that C1 <= C2
// We can only partially decide this relation.
- if (pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT)
+ if (Predicate == FCmpInst::FCMP_UGT || Predicate == FCmpInst::FCMP_OGT)
Result = 0;
- else if (pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT)
+ else if (Predicate == FCmpInst::FCMP_ULT ||
+ Predicate == FCmpInst::FCMP_OLT)
Result = 1;
break;
case FCmpInst::FCMP_OGE: // We known that C1 >= C2
// We can only partially decide this relation.
- if (pred == FCmpInst::FCMP_ULT || pred == FCmpInst::FCMP_OLT)
+ if (Predicate == FCmpInst::FCMP_ULT || Predicate == FCmpInst::FCMP_OLT)
Result = 0;
- else if (pred == FCmpInst::FCMP_UGT || pred == FCmpInst::FCMP_OGT)
+ else if (Predicate == FCmpInst::FCMP_UGT ||
+ Predicate == FCmpInst::FCMP_OGT)
Result = 1;
break;
case FCmpInst::FCMP_ONE: // We know that C1 != C2
// We can only partially decide this relation.
- if (pred == FCmpInst::FCMP_OEQ || pred == FCmpInst::FCMP_UEQ)
+ if (Predicate == FCmpInst::FCMP_OEQ || Predicate == FCmpInst::FCMP_UEQ)
Result = 0;
- else if (pred == FCmpInst::FCMP_ONE || pred == FCmpInst::FCMP_UNE)
+ else if (Predicate == FCmpInst::FCMP_ONE ||
+ Predicate == FCmpInst::FCMP_UNE)
Result = 1;
break;
case FCmpInst::FCMP_UEQ: // We know that C1 == C2 || isUnordered(C1, C2).
// We can only partially decide this relation.
- if (pred == FCmpInst::FCMP_ONE)
+ if (Predicate == FCmpInst::FCMP_ONE)
Result = 0;
- else if (pred == FCmpInst::FCMP_UEQ)
+ else if (Predicate == FCmpInst::FCMP_UEQ)
Result = 1;
break;
}
@@ -1905,67 +1776,84 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
} else {
// Evaluate the relation between the two constants, per the predicate.
int Result = -1; // -1 = unknown, 0 = known false, 1 = known true.
- switch (evaluateICmpRelation(C1, C2,
- CmpInst::isSigned((CmpInst::Predicate)pred))) {
+ switch (evaluateICmpRelation(C1, C2, CmpInst::isSigned(Predicate))) {
default: llvm_unreachable("Unknown relational!");
case ICmpInst::BAD_ICMP_PREDICATE:
break; // Couldn't determine anything about these constants.
case ICmpInst::ICMP_EQ: // We know the constants are equal!
// If we know the constants are equal, we can decide the result of this
// computation precisely.
- Result = ICmpInst::isTrueWhenEqual((ICmpInst::Predicate)pred);
+ Result = ICmpInst::isTrueWhenEqual(Predicate);
break;
case ICmpInst::ICMP_ULT:
- switch (pred) {
+ switch (Predicate) {
case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULE:
Result = 1; break;
case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_UGE:
Result = 0; break;
+ default:
+ break;
}
break;
case ICmpInst::ICMP_SLT:
- switch (pred) {
+ switch (Predicate) {
case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_SLE:
Result = 1; break;
case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_SGE:
Result = 0; break;
+ default:
+ break;
}
break;
case ICmpInst::ICMP_UGT:
- switch (pred) {
+ switch (Predicate) {
case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGE:
Result = 1; break;
case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULE:
Result = 0; break;
+ default:
+ break;
}
break;
case ICmpInst::ICMP_SGT:
- switch (pred) {
+ switch (Predicate) {
case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_SGE:
Result = 1; break;
case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_SLE:
Result = 0; break;
+ default:
+ break;
}
break;
case ICmpInst::ICMP_ULE:
- if (pred == ICmpInst::ICMP_UGT) Result = 0;
- if (pred == ICmpInst::ICMP_ULT || pred == ICmpInst::ICMP_ULE) Result = 1;
+ if (Predicate == ICmpInst::ICMP_UGT)
+ Result = 0;
+ if (Predicate == ICmpInst::ICMP_ULT || Predicate == ICmpInst::ICMP_ULE)
+ Result = 1;
break;
case ICmpInst::ICMP_SLE:
- if (pred == ICmpInst::ICMP_SGT) Result = 0;
- if (pred == ICmpInst::ICMP_SLT || pred == ICmpInst::ICMP_SLE) Result = 1;
+ if (Predicate == ICmpInst::ICMP_SGT)
+ Result = 0;
+ if (Predicate == ICmpInst::ICMP_SLT || Predicate == ICmpInst::ICMP_SLE)
+ Result = 1;
break;
case ICmpInst::ICMP_UGE:
- if (pred == ICmpInst::ICMP_ULT) Result = 0;
- if (pred == ICmpInst::ICMP_UGT || pred == ICmpInst::ICMP_UGE) Result = 1;
+ if (Predicate == ICmpInst::ICMP_ULT)
+ Result = 0;
+ if (Predicate == ICmpInst::ICMP_UGT || Predicate == ICmpInst::ICMP_UGE)
+ Result = 1;
break;
case ICmpInst::ICMP_SGE:
- if (pred == ICmpInst::ICMP_SLT) Result = 0;
- if (pred == ICmpInst::ICMP_SGT || pred == ICmpInst::ICMP_SGE) Result = 1;
+ if (Predicate == ICmpInst::ICMP_SLT)
+ Result = 0;
+ if (Predicate == ICmpInst::ICMP_SGT || Predicate == ICmpInst::ICMP_SGE)
+ Result = 1;
break;
case ICmpInst::ICMP_NE:
- if (pred == ICmpInst::ICMP_EQ) Result = 0;
- if (pred == ICmpInst::ICMP_NE) Result = 1;
+ if (Predicate == ICmpInst::ICMP_EQ)
+ Result = 0;
+ if (Predicate == ICmpInst::ICMP_NE)
+ Result = 1;
break;
}
@@ -1983,16 +1871,16 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
CE2->getType()->isVectorTy() == CE2Op0->getType()->isVectorTy() &&
!CE2Op0->getType()->isFPOrFPVectorTy()) {
Constant *Inverse = ConstantExpr::getBitCast(C1, CE2Op0->getType());
- return ConstantExpr::getICmp(pred, Inverse, CE2Op0);
+ return ConstantExpr::getICmp(Predicate, Inverse, CE2Op0);
}
}
// If the left hand side is an extension, try eliminating it.
if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
if ((CE1->getOpcode() == Instruction::SExt &&
- ICmpInst::isSigned((ICmpInst::Predicate)pred)) ||
+ ICmpInst::isSigned(Predicate)) ||
(CE1->getOpcode() == Instruction::ZExt &&
- !ICmpInst::isSigned((ICmpInst::Predicate)pred))){
+ !ICmpInst::isSigned(Predicate))) {
Constant *CE1Op0 = CE1->getOperand(0);
Constant *CE1Inverse = ConstantExpr::getTrunc(CE1, CE1Op0->getType());
if (CE1Inverse == CE1Op0) {
@@ -2000,7 +1888,7 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
Constant *C2Inverse = ConstantExpr::getTrunc(C2, CE1Op0->getType());
if (ConstantExpr::getCast(CE1->getOpcode(), C2Inverse,
C2->getType()) == C2)
- return ConstantExpr::getICmp(pred, CE1Inverse, C2Inverse);
+ return ConstantExpr::getICmp(Predicate, CE1Inverse, C2Inverse);
}
}
}
@@ -2010,8 +1898,8 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
// If C2 is a constant expr and C1 isn't, flip them around and fold the
// other way if possible.
// Also, if C1 is null and C2 isn't, flip them around.
- pred = ICmpInst::getSwappedPredicate((ICmpInst::Predicate)pred);
- return ConstantExpr::getICmp(pred, C2, C1);
+ Predicate = ICmpInst::getSwappedPredicate(Predicate);
+ return ConstantExpr::getICmp(Predicate, C2, C1);
}
}
return nullptr;
@@ -2086,32 +1974,14 @@ static Constant *foldGEPOfGEP(GEPOperator *GEP, Type *PointeeTy, bool InBounds,
I != E; ++I)
LastI = I;
- // We cannot combine indices if doing so would take us outside of an
- // array or vector. Doing otherwise could trick us if we evaluated such a
- // GEP as part of a load.
- //
- // e.g. Consider if the original GEP was:
- // i8* getelementptr ({ [2 x i8], i32, i8, [3 x i8] }* @main.c,
- // i32 0, i32 0, i64 0)
- //
- // If we then tried to offset it by '8' to get to the third element,
- // an i8, we should *not* get:
- // i8* getelementptr ({ [2 x i8], i32, i8, [3 x i8] }* @main.c,
- // i32 0, i32 0, i64 8)
- //
- // This GEP tries to index array element '8 which runs out-of-bounds.
- // Subsequent evaluation would get confused and produce erroneous results.
- //
- // The following prohibits such a GEP from being formed by checking to see
- // if the index is in-range with respect to an array.
+ // We can't combine GEPs if the type indexed by the last index is a struct.
if (!LastI.isSequential())
return nullptr;
+ // We could perform the transform with non-constant index, but prefer leaving
+ // it as GEP of GEP rather than GEP of add for now.
ConstantInt *CI = dyn_cast<ConstantInt>(Idx0);
if (!CI)
return nullptr;
- if (LastI.isBoundedSequential() &&
- !isIndexInRangeOfArrayType(LastI.getSequentialNumElements(), CI))
- return nullptr;
// TODO: This code may be extended to handle vectors as well.
auto *LastIdx = cast<Constant>(GEP->getOperand(GEP->getNumOperands()-1));
@@ -2226,11 +2096,12 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
PointerType *SrcPtrTy =
dyn_cast<PointerType>(CE->getOperand(0)->getType());
PointerType *DstPtrTy = dyn_cast<PointerType>(CE->getType());
- if (SrcPtrTy && DstPtrTy) {
+ if (SrcPtrTy && DstPtrTy && !SrcPtrTy->isOpaque() &&
+ !DstPtrTy->isOpaque()) {
ArrayType *SrcArrayTy =
- dyn_cast<ArrayType>(SrcPtrTy->getElementType());
+ dyn_cast<ArrayType>(SrcPtrTy->getNonOpaquePointerElementType());
ArrayType *DstArrayTy =
- dyn_cast<ArrayType>(DstPtrTy->getElementType());
+ dyn_cast<ArrayType>(DstPtrTy->getNonOpaquePointerElementType());
if (SrcArrayTy && DstArrayTy
&& SrcArrayTy->getElementType() == DstArrayTy->getElementType()
&& SrcPtrTy->getAddressSpace() == DstPtrTy->getAddressSpace())
diff --git a/llvm/lib/IR/ConstantFold.h b/llvm/lib/IR/ConstantFold.h
index 0cdd5cf3cbce..1aa44f4d21e5 100644
--- a/llvm/lib/IR/ConstantFold.h
+++ b/llvm/lib/IR/ConstantFold.h
@@ -19,6 +19,7 @@
#define LLVM_LIB_IR_CONSTANTFOLD_H
#include "llvm/ADT/Optional.h"
+#include "llvm/IR/InstrTypes.h"
namespace llvm {
template <typename T> class ArrayRef;
@@ -46,7 +47,7 @@ template <typename T> class ArrayRef;
Constant *ConstantFoldUnaryInstruction(unsigned Opcode, Constant *V);
Constant *ConstantFoldBinaryInstruction(unsigned Opcode, Constant *V1,
Constant *V2);
- Constant *ConstantFoldCompareInstruction(unsigned short predicate,
+ Constant *ConstantFoldCompareInstruction(CmpInst::Predicate Predicate,
Constant *C1, Constant *C2);
Constant *ConstantFoldGetElementPtr(Type *Ty, Constant *C, bool InBounds,
Optional<unsigned> InRangeIndex,
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 837be910f6d8..c13990af360e 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -739,15 +739,8 @@ static bool constantIsDead(const Constant *C, bool RemoveDeadUsers) {
++I;
}
- if (RemoveDeadUsers) {
- // If C is only used by metadata, it should not be preserved but should
- // have its uses replaced.
- if (C->isUsedByMetadata()) {
- const_cast<Constant *>(C)->replaceAllUsesWith(
- UndefValue::get(C->getType()));
- }
+ if (RemoveDeadUsers)
const_cast<Constant *>(C)->destroyConstant();
- }
return true;
}
@@ -779,18 +772,22 @@ void Constant::removeDeadConstantUsers() const {
}
}
-bool Constant::hasOneLiveUse() const {
+bool Constant::hasOneLiveUse() const { return hasNLiveUses(1); }
+
+bool Constant::hasZeroLiveUses() const { return hasNLiveUses(0); }
+
+bool Constant::hasNLiveUses(unsigned N) const {
unsigned NumUses = 0;
- for (const Use &use : uses()) {
- const Constant *User = dyn_cast<Constant>(use.getUser());
+ for (const Use &U : uses()) {
+ const Constant *User = dyn_cast<Constant>(U.getUser());
if (!User || !constantIsDead(User, /* RemoveDeadUsers= */ false)) {
++NumUses;
- if (NumUses > 1)
+ if (NumUses > N)
return false;
}
}
- return NumUses == 1;
+ return NumUses == N;
}
Constant *Constant::replaceUndefsWith(Constant *C, Constant *Replacement) {
@@ -1491,28 +1488,6 @@ bool ConstantExpr::isCompare() const {
return getOpcode() == Instruction::ICmp || getOpcode() == Instruction::FCmp;
}
-bool ConstantExpr::isGEPWithNoNotionalOverIndexing() const {
- if (getOpcode() != Instruction::GetElementPtr) return false;
-
- gep_type_iterator GEPI = gep_type_begin(this), E = gep_type_end(this);
- User::const_op_iterator OI = std::next(this->op_begin());
-
- // The remaining indices may be compile-time known integers within the bounds
- // of the corresponding notional static array types.
- for (; GEPI != E; ++GEPI, ++OI) {
- if (isa<UndefValue>(*OI))
- continue;
- auto *CI = dyn_cast<ConstantInt>(*OI);
- if (!CI || (GEPI.isBoundedSequential() &&
- (CI->getValue().getActiveBits() > 64 ||
- CI->getZExtValue() >= GEPI.getSequentialNumElements())))
- return false;
- }
-
- // All the indices checked out.
- return true;
-}
-
bool ConstantExpr::hasIndices() const {
return getOpcode() == Instruction::ExtractValue ||
getOpcode() == Instruction::InsertValue;
@@ -2546,11 +2521,11 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C,
Constant *ConstantExpr::getICmp(unsigned short pred, Constant *LHS,
Constant *RHS, bool OnlyIfReduced) {
+ auto Predicate = static_cast<CmpInst::Predicate>(pred);
assert(LHS->getType() == RHS->getType());
- assert(CmpInst::isIntPredicate((CmpInst::Predicate)pred) &&
- "Invalid ICmp Predicate");
+ assert(CmpInst::isIntPredicate(Predicate) && "Invalid ICmp Predicate");
- if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS))
+ if (Constant *FC = ConstantFoldCompareInstruction(Predicate, LHS, RHS))
return FC; // Fold a few common cases...
if (OnlyIfReduced)
@@ -2559,7 +2534,7 @@ Constant *ConstantExpr::getICmp(unsigned short pred, Constant *LHS,
// Look up the constant in the table first to ensure uniqueness
Constant *ArgVec[] = { LHS, RHS };
// Get the key type with both the opcode and predicate
- const ConstantExprKeyType Key(Instruction::ICmp, ArgVec, pred);
+ const ConstantExprKeyType Key(Instruction::ICmp, ArgVec, Predicate);
Type *ResultTy = Type::getInt1Ty(LHS->getContext());
if (VectorType *VT = dyn_cast<VectorType>(LHS->getType()))
@@ -2571,11 +2546,11 @@ Constant *ConstantExpr::getICmp(unsigned short pred, Constant *LHS,
Constant *ConstantExpr::getFCmp(unsigned short pred, Constant *LHS,
Constant *RHS, bool OnlyIfReduced) {
+ auto Predicate = static_cast<CmpInst::Predicate>(pred);
assert(LHS->getType() == RHS->getType());
- assert(CmpInst::isFPPredicate((CmpInst::Predicate)pred) &&
- "Invalid FCmp Predicate");
+ assert(CmpInst::isFPPredicate(Predicate) && "Invalid FCmp Predicate");
- if (Constant *FC = ConstantFoldCompareInstruction(pred, LHS, RHS))
+ if (Constant *FC = ConstantFoldCompareInstruction(Predicate, LHS, RHS))
return FC; // Fold a few common cases...
if (OnlyIfReduced)
@@ -2584,7 +2559,7 @@ Constant *ConstantExpr::getFCmp(unsigned short pred, Constant *LHS,
// Look up the constant in the table first to ensure uniqueness
Constant *ArgVec[] = { LHS, RHS };
// Get the key type with both the opcode and predicate
- const ConstantExprKeyType Key(Instruction::FCmp, ArgVec, pred);
+ const ConstantExprKeyType Key(Instruction::FCmp, ArgVec, Predicate);
Type *ResultTy = Type::getInt1Ty(LHS->getContext());
if (VectorType *VT = dyn_cast<VectorType>(LHS->getType()))
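The hasNLiveUses() refactor above lets the zero-use and one-use checks share a single counting loop; a small sketch of how the public wrappers are meant to be used (GV assumed to be a Constant*):

  if (GV->hasZeroLiveUses()) {
    // Every remaining user is itself a dead constant.
  } else if (GV->hasOneLiveUse()) {
    // Exactly one user would survive removeDeadConstantUsers().
  }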
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index a263d2536541..43df15e4d932 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -142,12 +142,12 @@ LLVMAttributeRef LLVMCreateEnumAttribute(LLVMContextRef C, unsigned KindID,
if (AttrKind == Attribute::AttrKind::ByVal) {
// After r362128, byval attributes need to have a type attribute. Provide a
// NULL one until a proper API is added for this.
- return wrap(Attribute::getWithByValType(Ctx, NULL));
+ return wrap(Attribute::getWithByValType(Ctx, nullptr));
}
if (AttrKind == Attribute::AttrKind::StructRet) {
// Same as byval.
- return wrap(Attribute::getWithStructRetType(Ctx, NULL));
+ return wrap(Attribute::getWithStructRetType(Ctx, nullptr));
}
return wrap(Attribute::get(Ctx, AttrKind, Val));
@@ -796,7 +796,7 @@ LLVMTypeRef LLVMScalableVectorType(LLVMTypeRef ElementType,
LLVMTypeRef LLVMGetElementType(LLVMTypeRef WrappedTy) {
auto *Ty = unwrap<Type>(WrappedTy);
if (auto *PTy = dyn_cast<PointerType>(Ty))
- return wrap(PTy->getElementType());
+ return wrap(PTy->getPointerElementType());
if (auto *ATy = dyn_cast<ArrayType>(Ty))
return wrap(ATy->getElementType());
return wrap(cast<VectorType>(Ty)->getElementType());
@@ -1691,8 +1691,7 @@ LLVMValueRef LLVMConstGEP(LLVMValueRef ConstantVal,
ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices),
NumIndices);
Constant *Val = unwrap<Constant>(ConstantVal);
- Type *Ty =
- cast<PointerType>(Val->getType()->getScalarType())->getElementType();
+ Type *Ty = Val->getType()->getScalarType()->getNonOpaquePointerElementType();
return wrap(ConstantExpr::getGetElementPtr(Ty, Val, IdxList));
}
@@ -1710,8 +1709,7 @@ LLVMValueRef LLVMConstInBoundsGEP(LLVMValueRef ConstantVal,
ArrayRef<Constant *> IdxList(unwrap<Constant>(ConstantIndices, NumIndices),
NumIndices);
Constant *Val = unwrap<Constant>(ConstantVal);
- Type *Ty =
- cast<PointerType>(Val->getType()->getScalarType())->getElementType();
+ Type *Ty = Val->getType()->getScalarType()->getNonOpaquePointerElementType();
return wrap(ConstantExpr::getInBoundsGetElementPtr(Ty, Val, IdxList));
}
@@ -2278,7 +2276,8 @@ void LLVMSetExternallyInitialized(LLVMValueRef GlobalVar, LLVMBool IsExtInit) {
LLVMValueRef LLVMAddAlias(LLVMModuleRef M, LLVMTypeRef Ty, LLVMValueRef Aliasee,
const char *Name) {
auto *PTy = cast<PointerType>(unwrap(Ty));
- return wrap(GlobalAlias::create(PTy->getElementType(), PTy->getAddressSpace(),
+ return wrap(GlobalAlias::create(PTy->getNonOpaquePointerElementType(),
+ PTy->getAddressSpace(),
GlobalValue::ExternalLinkage, Name,
unwrap<Constant>(Aliasee), unwrap(M)));
}
@@ -2293,7 +2292,7 @@ LLVMValueRef LLVMAddAlias2(LLVMModuleRef M, LLVMTypeRef ValueTy,
LLVMValueRef LLVMGetNamedGlobalAlias(LLVMModuleRef M,
const char *Name, size_t NameLen) {
- return wrap(unwrap(M)->getNamedAlias(Name));
+ return wrap(unwrap(M)->getNamedAlias(StringRef(Name, NameLen)));
}
LLVMValueRef LLVMGetFirstGlobalAlias(LLVMModuleRef M) {
@@ -3218,7 +3217,7 @@ LLVMValueRef LLVMBuildInvoke(LLVMBuilderRef B, LLVMValueRef Fn,
const char *Name) {
Value *V = unwrap(Fn);
FunctionType *FnT =
- cast<FunctionType>(cast<PointerType>(V->getType())->getElementType());
+ cast<FunctionType>(V->getType()->getNonOpaquePointerElementType());
return wrap(
unwrap(B)->CreateInvoke(FnT, unwrap(Fn), unwrap(Then), unwrap(Catch),
@@ -3590,7 +3589,8 @@ LLVMValueRef LLVMBuildLoad(LLVMBuilderRef B, LLVMValueRef PointerVal,
Value *V = unwrap(PointerVal);
PointerType *Ty = cast<PointerType>(V->getType());
- return wrap(unwrap(B)->CreateLoad(Ty->getElementType(), V, Name));
+ return wrap(
+ unwrap(B)->CreateLoad(Ty->getNonOpaquePointerElementType(), V, Name));
}
LLVMValueRef LLVMBuildLoad2(LLVMBuilderRef B, LLVMTypeRef Ty,
@@ -3692,8 +3692,7 @@ LLVMValueRef LLVMBuildGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
const char *Name) {
ArrayRef<Value *> IdxList(unwrap(Indices), NumIndices);
Value *Val = unwrap(Pointer);
- Type *Ty =
- cast<PointerType>(Val->getType()->getScalarType())->getElementType();
+ Type *Ty = Val->getType()->getScalarType()->getNonOpaquePointerElementType();
return wrap(unwrap(B)->CreateGEP(Ty, Val, IdxList, Name));
}
@@ -3709,8 +3708,7 @@ LLVMValueRef LLVMBuildInBoundsGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
const char *Name) {
ArrayRef<Value *> IdxList(unwrap(Indices), NumIndices);
Value *Val = unwrap(Pointer);
- Type *Ty =
- cast<PointerType>(Val->getType()->getScalarType())->getElementType();
+ Type *Ty = Val->getType()->getScalarType()->getNonOpaquePointerElementType();
return wrap(unwrap(B)->CreateInBoundsGEP(Ty, Val, IdxList, Name));
}
@@ -3725,8 +3723,7 @@ LLVMValueRef LLVMBuildInBoundsGEP2(LLVMBuilderRef B, LLVMTypeRef Ty,
LLVMValueRef LLVMBuildStructGEP(LLVMBuilderRef B, LLVMValueRef Pointer,
unsigned Idx, const char *Name) {
Value *Val = unwrap(Pointer);
- Type *Ty =
- cast<PointerType>(Val->getType()->getScalarType())->getElementType();
+ Type *Ty = Val->getType()->getScalarType()->getNonOpaquePointerElementType();
return wrap(unwrap(B)->CreateStructGEP(Ty, Val, Idx, Name));
}
@@ -3947,7 +3944,7 @@ LLVMValueRef LLVMBuildCall(LLVMBuilderRef B, LLVMValueRef Fn,
const char *Name) {
Value *V = unwrap(Fn);
FunctionType *FnT =
- cast<FunctionType>(cast<PointerType>(V->getType())->getElementType());
+ cast<FunctionType>(V->getType()->getNonOpaquePointerElementType());
return wrap(unwrap(B)->CreateCall(FnT, unwrap(Fn),
makeArrayRef(unwrap(Args), NumArgs), Name));
@@ -4022,7 +4019,16 @@ LLVMValueRef LLVMBuildIsNotNull(LLVMBuilderRef B, LLVMValueRef Val,
LLVMValueRef LLVMBuildPtrDiff(LLVMBuilderRef B, LLVMValueRef LHS,
LLVMValueRef RHS, const char *Name) {
- return wrap(unwrap(B)->CreatePtrDiff(unwrap(LHS), unwrap(RHS), Name));
+ Value *L = unwrap(LHS);
+ Type *ElemTy = L->getType()->getNonOpaquePointerElementType();
+ return wrap(unwrap(B)->CreatePtrDiff(ElemTy, L, unwrap(RHS), Name));
+}
+
+LLVMValueRef LLVMBuildPtrDiff2(LLVMBuilderRef B, LLVMTypeRef ElemTy,
+ LLVMValueRef LHS, LLVMValueRef RHS,
+ const char *Name) {
+ return wrap(unwrap(B)->CreatePtrDiff(unwrap(ElemTy), unwrap(LHS),
+ unwrap(RHS), Name));
}
LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op,
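The new C API entry point takes the element type explicitly, so bindings keep working once pointers become opaque; a hedged sketch (Builder, ElemTy, LHS and RHS assumed to exist):

  // Preferred, opaque-pointer-safe form.
  LLVMValueRef Diff = LLVMBuildPtrDiff2(Builder, ElemTy, LHS, RHS, "diff");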
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index 35af22034a12..a6e84dfbe1dd 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -33,7 +33,7 @@ static cl::opt<bool>
DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes, DICompileUnit *CU)
: M(m), VMContext(M.getContext()), CUNode(CU), DeclareFn(nullptr),
- ValueFn(nullptr), LabelFn(nullptr),
+ ValueFn(nullptr), LabelFn(nullptr), AddrFn(nullptr),
AllowUnresolvedNodes(AllowUnresolvedNodes) {
if (CUNode) {
if (const auto &ETs = CUNode->getEnumTypes())
@@ -821,12 +821,6 @@ DIExpression *DIBuilder::createExpression(ArrayRef<uint64_t> Addr) {
return DIExpression::get(VMContext, Addr);
}
-DIExpression *DIBuilder::createExpression(ArrayRef<int64_t> Signed) {
- // TODO: Remove the callers of this signed version and delete.
- SmallVector<uint64_t, 8> Addr(Signed.begin(), Signed.end());
- return createExpression(Addr);
-}
-
template <class... Ts>
static DISubprogram *getSubprogram(bool IsDistinct, Ts &&...Args) {
if (IsDistinct)
@@ -980,6 +974,24 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V,
return insertDbgValueIntrinsic(V, VarInfo, Expr, DL, InsertAtEnd, nullptr);
}
+Instruction *DIBuilder::insertDbgAddrIntrinsic(Value *V,
+ DILocalVariable *VarInfo,
+ DIExpression *Expr,
+ const DILocation *DL,
+ Instruction *InsertBefore) {
+ return insertDbgAddrIntrinsic(
+ V, VarInfo, Expr, DL, InsertBefore ? InsertBefore->getParent() : nullptr,
+ InsertBefore);
+}
+
+Instruction *DIBuilder::insertDbgAddrIntrinsic(Value *V,
+ DILocalVariable *VarInfo,
+ DIExpression *Expr,
+ const DILocation *DL,
+ BasicBlock *InsertAtEnd) {
+ return insertDbgAddrIntrinsic(V, VarInfo, Expr, DL, InsertAtEnd, nullptr);
+}
+
/// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics.
/// This abstracts over the various ways to specify an insert position.
static void initIRBuilder(IRBuilder<> &Builder, const DILocation *DL,
@@ -1001,6 +1013,24 @@ static Function *getDeclareIntrin(Module &M) {
: Intrinsic::dbg_declare);
}
+Instruction *DIBuilder::insertDbgValueIntrinsic(
+ llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr,
+ const DILocation *DL, BasicBlock *InsertBB, Instruction *InsertBefore) {
+ if (!ValueFn)
+ ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
+ return insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertBB,
+ InsertBefore);
+}
+
+Instruction *DIBuilder::insertDbgAddrIntrinsic(
+ llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr,
+ const DILocation *DL, BasicBlock *InsertBB, Instruction *InsertBefore) {
+ if (!AddrFn)
+ AddrFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_addr);
+ return insertDbgIntrinsic(AddrFn, Val, VarInfo, Expr, DL, InsertBB,
+ InsertBefore);
+}
+
Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
DIExpression *Expr, const DILocation *DL,
BasicBlock *InsertBB,
@@ -1024,17 +1054,20 @@ Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
return B.CreateCall(DeclareFn, Args);
}
-Instruction *DIBuilder::insertDbgValueIntrinsic(
- Value *V, DILocalVariable *VarInfo, DIExpression *Expr,
- const DILocation *DL, BasicBlock *InsertBB, Instruction *InsertBefore) {
- assert(V && "no value passed to dbg.value");
- assert(VarInfo && "empty or invalid DILocalVariable* passed to dbg.value");
+Instruction *DIBuilder::insertDbgIntrinsic(llvm::Function *IntrinsicFn,
+ Value *V, DILocalVariable *VarInfo,
+ DIExpression *Expr,
+ const DILocation *DL,
+ BasicBlock *InsertBB,
+ Instruction *InsertBefore) {
+ assert(IntrinsicFn && "must pass a non-null intrinsic function");
+ assert(V && "must pass a value to a dbg intrinsic");
+ assert(VarInfo &&
+ "empty or invalid DILocalVariable* passed to debug intrinsic");
assert(DL && "Expected debug loc");
assert(DL->getScope()->getSubprogram() ==
VarInfo->getScope()->getSubprogram() &&
"Expected matching subprograms");
- if (!ValueFn)
- ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
trackIfUnresolved(VarInfo);
trackIfUnresolved(Expr);
@@ -1044,7 +1077,7 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(
IRBuilder<> B(DL->getContext());
initIRBuilder(B, DL, InsertBB, InsertBefore);
- return B.CreateCall(ValueFn, Args);
+ return B.CreateCall(IntrinsicFn, Args);
}
Instruction *DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL,
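A minimal usage sketch of the new dbg.addr support (DIB, Alloca, Var, DL and InsertPt are assumed to exist); it goes through the same insertDbgIntrinsic() path as dbg.value:

  DIB.insertDbgAddrIntrinsic(Alloca, Var, DIB.createExpression(), DL,
                             /*InsertBefore=*/InsertPt);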
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index 7c69fbf7085d..98f25b035157 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -1436,14 +1436,14 @@ LLVMDIBuilderCreateSubroutineType(LLVMDIBuilderRef Builder,
}
LLVMMetadataRef LLVMDIBuilderCreateExpression(LLVMDIBuilderRef Builder,
- int64_t *Addr, size_t Length) {
- return wrap(unwrap(Builder)->createExpression(ArrayRef<int64_t>(Addr,
- Length)));
+ uint64_t *Addr, size_t Length) {
+ return wrap(
+ unwrap(Builder)->createExpression(ArrayRef<uint64_t>(Addr, Length)));
}
LLVMMetadataRef
LLVMDIBuilderCreateConstantValueExpression(LLVMDIBuilderRef Builder,
- int64_t Value) {
+ uint64_t Value) {
return wrap(unwrap(Builder)->createConstantValueExpression(Value));
}
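With the signature change above, C API callers hand over unsigned DWARF operands; a hedged sketch (DIB assumed, and the DW_OP_deref encoding of 0x06 taken on trust):

  uint64_t Ops[] = {0x06 /* DW_OP_deref, value assumed */};
  LLVMMetadataRef Expr = LLVMDIBuilderCreateExpression(DIB, Ops, 1);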
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index b20e581d283a..59afb844eb89 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -567,13 +567,16 @@ Optional<DIBasicType::Signedness> DIBasicType::getSignedness() const {
DIStringType *DIStringType::getImpl(LLVMContext &Context, unsigned Tag,
MDString *Name, Metadata *StringLength,
Metadata *StringLengthExp,
+ Metadata *StringLocationExp,
uint64_t SizeInBits, uint32_t AlignInBits,
unsigned Encoding, StorageType Storage,
bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DIStringType, (Tag, Name, StringLength, StringLengthExp,
- SizeInBits, AlignInBits, Encoding));
- Metadata *Ops[] = {nullptr, nullptr, Name, StringLength, StringLengthExp};
+ DEFINE_GETIMPL_LOOKUP(DIStringType,
+ (Tag, Name, StringLength, StringLengthExp,
+ StringLocationExp, SizeInBits, AlignInBits, Encoding));
+ Metadata *Ops[] = {nullptr, nullptr, Name,
+ StringLength, StringLengthExp, StringLocationExp};
DEFINE_GETIMPL_STORE(DIStringType, (Tag, SizeInBits, AlignInBits, Encoding),
Ops);
}
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index f1a6402fb11b..1e874d7afa79 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -300,9 +300,9 @@ void Argument::removeAttr(Attribute::AttrKind Kind) {
getParent()->removeParamAttr(getArgNo(), Kind);
}
-void Argument::removeAttrs(const AttrBuilder &B) {
+void Argument::removeAttrs(const AttributeMask &AM) {
AttributeList AL = getParent()->getAttributes();
- AL = AL.removeParamAttributes(Parent->getContext(), getArgNo(), B);
+ AL = AL.removeParamAttributes(Parent->getContext(), getArgNo(), AM);
getParent()->setAttributes(AL);
}
@@ -340,7 +340,7 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty,
unsigned AddrSpace, const Twine &N,
Module *M) {
auto *F = new Function(Ty, Linkage, AddrSpace, N, M);
- AttrBuilder B;
+ AttrBuilder B(F->getContext());
if (M->getUwtable())
B.addAttribute(Attribute::UWTable);
switch (M->getFramePointer()) {
@@ -589,8 +589,8 @@ void Function::removeFnAttr(StringRef Kind) {
AttributeSets = AttributeSets.removeFnAttribute(getContext(), Kind);
}
-void Function::removeFnAttrs(const AttrBuilder &Attrs) {
- AttributeSets = AttributeSets.removeFnAttributes(getContext(), Attrs);
+void Function::removeFnAttrs(const AttributeMask &AM) {
+ AttributeSets = AttributeSets.removeFnAttributes(getContext(), AM);
}
void Function::removeRetAttr(Attribute::AttrKind Kind) {
@@ -601,7 +601,7 @@ void Function::removeRetAttr(StringRef Kind) {
AttributeSets = AttributeSets.removeRetAttribute(getContext(), Kind);
}
-void Function::removeRetAttrs(const AttrBuilder &Attrs) {
+void Function::removeRetAttrs(const AttributeMask &Attrs) {
AttributeSets = AttributeSets.removeRetAttributes(getContext(), Attrs);
}
@@ -613,7 +613,7 @@ void Function::removeParamAttr(unsigned ArgNo, StringRef Kind) {
AttributeSets = AttributeSets.removeParamAttribute(getContext(), ArgNo, Kind);
}
-void Function::removeParamAttrs(unsigned ArgNo, const AttrBuilder &Attrs) {
+void Function::removeParamAttrs(unsigned ArgNo, const AttributeMask &Attrs) {
AttributeSets =
AttributeSets.removeParamAttributes(getContext(), ArgNo, Attrs);
}
@@ -817,7 +817,8 @@ static std::string getMangledTypeStr(Type *Ty, bool &HasUnnamedType) {
// Opaque pointers don't carry pointee type information, so we just mangle
// the address space for them.
if (!PTyp->isOpaque())
- Result += getMangledTypeStr(PTyp->getElementType(), HasUnnamedType);
+ Result += getMangledTypeStr(PTyp->getNonOpaquePointerElementType(),
+ HasUnnamedType);
} else if (ArrayType *ATyp = dyn_cast<ArrayType>(Ty)) {
Result += "a" + utostr(ATyp->getNumElements()) +
getMangledTypeStr(ATyp->getElementType(), HasUnnamedType);
@@ -1465,8 +1466,8 @@ static bool matchIntrinsicType(
if (!PT || PT->getAddressSpace() != D.Pointer_AddressSpace)
return true;
if (!PT->isOpaque())
- return matchIntrinsicType(PT->getElementType(), Infos, ArgTys,
- DeferredChecks, IsDeferredCheck);
+ return matchIntrinsicType(PT->getNonOpaquePointerElementType(), Infos,
+ ArgTys, DeferredChecks, IsDeferredCheck);
// Consume IIT descriptors relating to the pointer element type.
while (Infos.front().Kind == IITDescriptor::Pointer)
Infos = Infos.slice(1);
@@ -1573,7 +1574,8 @@ static bool matchIntrinsicType(
return IsDeferredCheck || DeferCheck(Ty);
Type * ReferenceType = ArgTys[D.getArgumentNumber()];
PointerType *ThisArgType = dyn_cast<PointerType>(Ty);
- return (!ThisArgType || ThisArgType->getElementType() != ReferenceType);
+ return (!ThisArgType ||
+ !ThisArgType->isOpaqueOrPointeeTypeMatches(ReferenceType));
}
case IITDescriptor::PtrToElt: {
if (D.getArgumentNumber() >= ArgTys.size())
diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp
index b6bd25aa1234..c832499dde06 100644
--- a/llvm/lib/IR/Globals.cpp
+++ b/llvm/lib/IR/Globals.cpp
@@ -95,6 +95,8 @@ void GlobalValue::eraseFromParent() {
llvm_unreachable("not a global");
}
+GlobalObject::~GlobalObject() { setComdat(nullptr); }
+
bool GlobalValue::isInterposable() const {
if (isInterposableLinkage(getLinkage()))
return true;
@@ -103,10 +105,15 @@ bool GlobalValue::isInterposable() const {
}
bool GlobalValue::canBenefitFromLocalAlias() const {
- // See AsmPrinter::getSymbolPreferLocal().
+ // See AsmPrinter::getSymbolPreferLocal(). For a deduplicate comdat kind,
+ // references to a discarded local symbol from outside the group are not
+ // allowed, so avoid the local alias.
+ auto isDeduplicateComdat = [](const Comdat *C) {
+ return C && C->getSelectionKind() != Comdat::NoDeduplicate;
+ };
return hasDefaultVisibility() &&
GlobalObject::isExternalLinkage(getLinkage()) && !isDeclaration() &&
- !isa<GlobalIFunc>(this) && !hasComdat();
+ !isa<GlobalIFunc>(this) && !isDeduplicateComdat(getComdat());
}
unsigned GlobalValue::getAddressSpace() const {
@@ -182,6 +189,14 @@ const Comdat *GlobalValue::getComdat() const {
return cast<GlobalObject>(this)->getComdat();
}
+void GlobalObject::setComdat(Comdat *C) {
+ if (ObjComdat)
+ ObjComdat->removeUser(this);
+ ObjComdat = C;
+ if (C)
+ C->addUser(this);
+}
+
StringRef GlobalValue::getPartition() const {
if (!hasPartition())
return "";
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 98f6ccf81973..27528a69be21 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -679,7 +679,7 @@ static CallInst *CreateGCStatepointCallCommon(
const Twine &Name) {
// Extract out the type of the callee.
auto *FuncPtrType = cast<PointerType>(ActualCallee->getType());
- assert(isa<FunctionType>(FuncPtrType->getElementType()) &&
+ assert(isa<FunctionType>(FuncPtrType->getPointerElementType()) &&
"actual callee must be a callable value");
Module *M = Builder->GetInsertBlock()->getParent()->getParent();
@@ -736,7 +736,7 @@ static InvokeInst *CreateGCStatepointInvokeCommon(
ArrayRef<T3> GCArgs, const Twine &Name) {
// Extract out the type of the callee.
auto *FuncPtrType = cast<PointerType>(ActualInvokee->getType());
- assert(isa<FunctionType>(FuncPtrType->getElementType()) &&
+ assert(isa<FunctionType>(FuncPtrType->getPointerElementType()) &&
"actual callee must be a callable value");
Module *M = Builder->GetInsertBlock()->getParent()->getParent();
@@ -984,10 +984,8 @@ CallInst *IRBuilderBase::CreateConstrainedFPCall(
Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False,
const Twine &Name, Instruction *MDFrom) {
- if (auto *CC = dyn_cast<Constant>(C))
- if (auto *TC = dyn_cast<Constant>(True))
- if (auto *FC = dyn_cast<Constant>(False))
- return Insert(Folder.CreateSelect(CC, TC, FC), Name);
+ if (auto *V = Folder.FoldSelect(C, True, False))
+ return V;
SelectInst *Sel = SelectInst::Create(C, True, False);
if (MDFrom) {
@@ -1000,16 +998,17 @@ Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False,
return Insert(Sel, Name);
}
-Value *IRBuilderBase::CreatePtrDiff(Value *LHS, Value *RHS,
+Value *IRBuilderBase::CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS,
const Twine &Name) {
assert(LHS->getType() == RHS->getType() &&
"Pointer subtraction operand types must match!");
- auto *ArgType = cast<PointerType>(LHS->getType());
+ assert(cast<PointerType>(LHS->getType())
+ ->isOpaqueOrPointeeTypeMatches(ElemTy) &&
+ "Pointer type must match element type");
Value *LHS_int = CreatePtrToInt(LHS, Type::getInt64Ty(Context));
Value *RHS_int = CreatePtrToInt(RHS, Type::getInt64Ty(Context));
Value *Difference = CreateSub(LHS_int, RHS_int);
- return CreateExactSDiv(Difference,
- ConstantExpr::getSizeOf(ArgType->getElementType()),
+ return CreateExactSDiv(Difference, ConstantExpr::getSizeOf(ElemTy),
Name);
}
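Usage sketch for the updated builder API (B, I8Ty, P and Q assumed): the caller now supplies the element type instead of the builder reading it off the pointee type.

  Value *Count = B.CreatePtrDiff(I8Ty, P, Q, "count");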
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 4480ec799c35..59b7221d1fa2 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -186,7 +186,8 @@ void Instruction::dropUndefImplyingAttrsAndUnknownMetadata(
AttributeList AL = CB->getAttributes();
if (AL.isEmpty())
return;
- AttrBuilder UBImplyingAttributes = AttributeFuncs::getUBImplyingAttributes();
+ AttributeMask UBImplyingAttributes =
+ AttributeFuncs::getUBImplyingAttributes();
for (unsigned ArgNo = 0; ArgNo < CB->arg_size(); ArgNo++)
CB->removeParamAttrs(ArgNo, UBImplyingAttributes);
CB->removeRetAttrs(UBImplyingAttributes);
@@ -584,7 +585,7 @@ bool Instruction::mayReadFromMemory() const {
case Instruction::Call:
case Instruction::Invoke:
case Instruction::CallBr:
- return !cast<CallBase>(this)->doesNotReadMemory();
+ return !cast<CallBase>(this)->onlyWritesMemory();
case Instruction::Store:
return !cast<StoreInst>(this)->isUnordered();
}
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 8f7318665cfb..adea7abb75cf 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -178,6 +178,18 @@ int llvm::Intrinsic::lookupLLVMIntrinsicByName(ArrayRef<const char *> NameTable,
return -1;
}
+ConstantInt *InstrProfInstBase::getNumCounters() const {
+ if (InstrProfValueProfileInst::classof(this))
+ llvm_unreachable("InstrProfValueProfileInst does not have counters!");
+ return cast<ConstantInt>(const_cast<Value *>(getArgOperand(2)));
+}
+
+ConstantInt *InstrProfInstBase::getIndex() const {
+ if (InstrProfValueProfileInst::classof(this))
+ llvm_unreachable("Please use InstrProfValueProfileInst::getIndex()");
+ return cast<ConstantInt>(const_cast<Value *>(getArgOperand(3)));
+}
+
Value *InstrProfIncrementInst::getStep() const {
if (InstrProfIncrementInstStep::classof(this)) {
return const_cast<Value *>(getArgOperand(4));
@@ -482,6 +494,7 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID,
VPFunc = Intrinsic::getDeclaration(M, VPID, OverloadTy);
break;
}
+ case Intrinsic::vp_merge:
case Intrinsic::vp_select:
VPFunc = Intrinsic::getDeclaration(M, VPID, {Params[1]->getType()});
break;
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 24c4a348f4da..0b5f928165e8 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -428,20 +428,22 @@ template <> struct MDNodeKeyImpl<DIStringType> {
MDString *Name;
Metadata *StringLength;
Metadata *StringLengthExp;
+ Metadata *StringLocationExp;
uint64_t SizeInBits;
uint32_t AlignInBits;
unsigned Encoding;
MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *StringLength,
- Metadata *StringLengthExp, uint64_t SizeInBits,
- uint32_t AlignInBits, unsigned Encoding)
+ Metadata *StringLengthExp, Metadata *StringLocationExp,
+ uint64_t SizeInBits, uint32_t AlignInBits, unsigned Encoding)
: Tag(Tag), Name(Name), StringLength(StringLength),
- StringLengthExp(StringLengthExp), SizeInBits(SizeInBits),
- AlignInBits(AlignInBits), Encoding(Encoding) {}
+ StringLengthExp(StringLengthExp), StringLocationExp(StringLocationExp),
+ SizeInBits(SizeInBits), AlignInBits(AlignInBits), Encoding(Encoding) {}
MDNodeKeyImpl(const DIStringType *N)
: Tag(N->getTag()), Name(N->getRawName()),
StringLength(N->getRawStringLength()),
StringLengthExp(N->getRawStringLengthExp()),
+ StringLocationExp(N->getRawStringLocationExp()),
SizeInBits(N->getSizeInBits()), AlignInBits(N->getAlignInBits()),
Encoding(N->getEncoding()) {}
diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp
index bb72bec93066..4357c95aa9f6 100644
--- a/llvm/lib/IR/LegacyPassManager.cpp
+++ b/llvm/lib/IR/LegacyPassManager.cpp
@@ -256,9 +256,9 @@ private:
bool wasRun;
public:
static char ID;
- explicit FunctionPassManagerImpl() :
- Pass(PT_PassManager, ID), PMDataManager(),
- PMTopLevelManager(new FPPassManager()), wasRun(false) {}
+ explicit FunctionPassManagerImpl()
+ : Pass(PT_PassManager, ID), PMTopLevelManager(new FPPassManager()),
+ wasRun(false) {}
/// \copydoc FunctionPassManager::add()
void add(Pass *P) {
@@ -387,8 +387,7 @@ namespace {
class MPPassManager : public Pass, public PMDataManager {
public:
static char ID;
- explicit MPPassManager() :
- Pass(PT_PassManager, ID), PMDataManager() { }
+ explicit MPPassManager() : Pass(PT_PassManager, ID) {}
// Delete on the fly managers.
~MPPassManager() override {
@@ -478,9 +477,8 @@ class PassManagerImpl : public Pass,
public:
static char ID;
- explicit PassManagerImpl() :
- Pass(PT_PassManager, ID), PMDataManager(),
- PMTopLevelManager(new MPPassManager()) {}
+ explicit PassManagerImpl()
+ : Pass(PT_PassManager, ID), PMTopLevelManager(new MPPassManager()) {}
/// \copydoc PassManager::add()
void add(Pass *P) {
diff --git a/llvm/lib/IR/TypeFinder.cpp b/llvm/lib/IR/TypeFinder.cpp
index 1f757d7dbf4e..904af7e737cc 100644
--- a/llvm/lib/IR/TypeFinder.cpp
+++ b/llvm/lib/IR/TypeFinder.cpp
@@ -18,8 +18,10 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
@@ -34,22 +36,27 @@ void TypeFinder::run(const Module &M, bool onlyNamed) {
// Get types from global variables.
for (const auto &G : M.globals()) {
- incorporateType(G.getType());
+ incorporateType(G.getValueType());
if (G.hasInitializer())
incorporateValue(G.getInitializer());
}
// Get types from aliases.
for (const auto &A : M.aliases()) {
- incorporateType(A.getType());
+ incorporateType(A.getValueType());
if (const Value *Aliasee = A.getAliasee())
incorporateValue(Aliasee);
}
+ // Get types from ifuncs.
+ for (const auto &GI : M.ifuncs())
+ incorporateType(GI.getValueType());
+
// Get types from functions.
SmallVector<std::pair<unsigned, MDNode *>, 4> MDForInst;
for (const Function &FI : M) {
- incorporateType(FI.getType());
+ incorporateType(FI.getFunctionType());
+ incorporateAttributes(FI.getAttributes());
for (const Use &U : FI.operands())
incorporateValue(U.get());
@@ -69,6 +76,13 @@ void TypeFinder::run(const Module &M, bool onlyNamed) {
if (&*O && !isa<Instruction>(&*O))
incorporateValue(&*O);
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I))
+ incorporateType(GEP->getSourceElementType());
+ if (auto *AI = dyn_cast<AllocaInst>(&I))
+ incorporateType(AI->getAllocatedType());
+ if (const auto *CB = dyn_cast<CallBase>(&I))
+ incorporateAttributes(CB->getAttributes());
+
// Incorporate types hiding in metadata.
I.getAllMetadataOtherThanDebugLoc(MDForInst);
for (const auto &MD : MDForInst)
@@ -138,6 +152,9 @@ void TypeFinder::incorporateValue(const Value *V) {
if (isa<Instruction>(V))
return;
+ if (auto *GEP = dyn_cast<GEPOperator>(V))
+ incorporateType(GEP->getSourceElementType());
+
// Look in operands for types.
const User *U = cast<User>(V);
for (const auto &I : U->operands())
@@ -173,3 +190,13 @@ void TypeFinder::incorporateMDNode(const MDNode *V) {
}
}
}
+
+void TypeFinder::incorporateAttributes(AttributeList AL) {
+ if (!VisitedAttributes.insert(AL).second)
+ return;
+
+ for (AttributeSet AS : AL)
+ for (Attribute A : AS)
+ if (A.isTypeAttribute())
+ incorporateType(A.getValueAsType());
+}
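Driving the finder is unchanged; a hedged sketch (M assumed to be a Module reference) that now also surfaces struct types reachable only through attributes, GEP source types or alloca types:

  TypeFinder Finder;
  Finder.run(M, /*onlyNamed=*/true);
  for (StructType *ST : Finder)
    ST->print(llvm::errs());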
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index fb7c423e54e2..b84edb789405 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -551,11 +551,12 @@ private:
void checkUnsignedBaseTenFuncAttr(AttributeList Attrs, StringRef Attr,
const Value *V);
void verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
- const Value *V, bool IsIntrinsic);
+ const Value *V, bool IsIntrinsic, bool IsInlineAsm);
void verifyFunctionMetadata(ArrayRef<std::pair<unsigned, MDNode *>> MDs);
void visitConstantExprsRecursively(const Constant *EntryC);
void visitConstantExpr(const ConstantExpr *CE);
+ void verifyInlineAsmCall(const CallBase &Call);
void verifyStatepoint(const CallBase &Call);
void verifyFrameRecoverIndices();
void verifySiblingFuncletUnwinds();
@@ -1058,6 +1059,7 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) {
N.getTag() == dwarf::DW_TAG_reference_type ||
N.getTag() == dwarf::DW_TAG_rvalue_reference_type ||
N.getTag() == dwarf::DW_TAG_const_type ||
+ N.getTag() == dwarf::DW_TAG_immutable_type ||
N.getTag() == dwarf::DW_TAG_volatile_type ||
N.getTag() == dwarf::DW_TAG_restrict_type ||
N.getTag() == dwarf::DW_TAG_atomic_type ||
@@ -1792,7 +1794,7 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
"'noinline and alwaysinline' are incompatible!",
V);
- AttrBuilder IncompatibleAttrs = AttributeFuncs::typeIncompatible(Ty);
+ AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible(Ty);
for (Attribute Attr : Attrs) {
if (!Attr.isStringAttribute() &&
IncompatibleAttrs.contains(Attr.getKindAsEnum())) {
@@ -1824,33 +1826,34 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
"Attribute 'preallocated' does not support unsized types!", V);
}
if (!PTy->isOpaque()) {
- if (!isa<PointerType>(PTy->getElementType()))
+ if (!isa<PointerType>(PTy->getNonOpaquePointerElementType()))
Assert(!Attrs.hasAttribute(Attribute::SwiftError),
"Attribute 'swifterror' only applies to parameters "
"with pointer to pointer type!",
V);
if (Attrs.hasAttribute(Attribute::ByRef)) {
- Assert(Attrs.getByRefType() == PTy->getElementType(),
+ Assert(Attrs.getByRefType() == PTy->getNonOpaquePointerElementType(),
"Attribute 'byref' type does not match parameter!", V);
}
if (Attrs.hasAttribute(Attribute::ByVal) && Attrs.getByValType()) {
- Assert(Attrs.getByValType() == PTy->getElementType(),
+ Assert(Attrs.getByValType() == PTy->getNonOpaquePointerElementType(),
"Attribute 'byval' type does not match parameter!", V);
}
if (Attrs.hasAttribute(Attribute::Preallocated)) {
- Assert(Attrs.getPreallocatedType() == PTy->getElementType(),
+ Assert(Attrs.getPreallocatedType() ==
+ PTy->getNonOpaquePointerElementType(),
"Attribute 'preallocated' type does not match parameter!", V);
}
if (Attrs.hasAttribute(Attribute::InAlloca)) {
- Assert(Attrs.getInAllocaType() == PTy->getElementType(),
+ Assert(Attrs.getInAllocaType() == PTy->getNonOpaquePointerElementType(),
"Attribute 'inalloca' type does not match parameter!", V);
}
if (Attrs.hasAttribute(Attribute::ElementType)) {
- Assert(Attrs.getElementType() == PTy->getElementType(),
+ Assert(Attrs.getElementType() == PTy->getNonOpaquePointerElementType(),
"Attribute 'elementtype' type does not match parameter!", V);
}
}
@@ -1870,7 +1873,8 @@ void Verifier::checkUnsignedBaseTenFuncAttr(AttributeList Attrs, StringRef Attr,
// Check parameter attributes against a function type.
// The value V is printed in error messages.
void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
- const Value *V, bool IsIntrinsic) {
+ const Value *V, bool IsIntrinsic,
+ bool IsInlineAsm) {
if (Attrs.isEmpty())
return;
@@ -1913,8 +1917,10 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
if (!IsIntrinsic) {
Assert(!ArgAttrs.hasAttribute(Attribute::ImmArg),
"immarg attribute only applies to intrinsics",V);
- Assert(!ArgAttrs.hasAttribute(Attribute::ElementType),
- "Attribute 'elementtype' can only be applied to intrinsics.", V);
+ if (!IsInlineAsm)
+ Assert(!ArgAttrs.hasAttribute(Attribute::ElementType),
+ "Attribute 'elementtype' can only be applied to intrinsics"
+ " and inline asm.", V);
}
verifyParameterAttrs(ArgAttrs, Ty, V);
@@ -2141,6 +2147,33 @@ bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) {
return Attrs.getNumAttrSets() <= Params + 2;
}
+void Verifier::verifyInlineAsmCall(const CallBase &Call) {
+ const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand());
+ unsigned ArgNo = 0;
+ for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+ // Only deal with constraints that correspond to call arguments.
+ if (!CI.hasArg())
+ continue;
+
+ if (CI.isIndirect) {
+ const Value *Arg = Call.getArgOperand(ArgNo);
+ Assert(Arg->getType()->isPointerTy(),
+ "Operand for indirect constraint must have pointer type",
+ &Call);
+
+ Assert(Call.getAttributes().getParamElementType(ArgNo),
+ "Operand for indirect constraint must have elementtype attribute",
+ &Call);
+ } else {
+ Assert(!Call.paramHasAttr(ArgNo, Attribute::ElementType),
+ "Elementtype attribute can only be applied for indirect "
+ "constraints", &Call);
+ }
+
+ ArgNo++;
+ }
+}
+
/// Verify that statepoint intrinsic is well formed.
void Verifier::verifyStatepoint(const CallBase &Call) {
assert(Call.getCalledFunction() &&
@@ -2163,9 +2196,10 @@ void Verifier::verifyStatepoint(const CallBase &Call) {
const Value *Target = Call.getArgOperand(2);
auto *PT = dyn_cast<PointerType>(Target->getType());
- Assert(PT && PT->getElementType()->isFunctionTy(),
+ Assert(PT && PT->getPointerElementType()->isFunctionTy(),
"gc.statepoint callee must be of function pointer type", Call, Target);
- FunctionType *TargetFuncType = cast<FunctionType>(PT->getElementType());
+ FunctionType *TargetFuncType =
+ cast<FunctionType>(PT->getPointerElementType());
const int NumCallArgs = cast<ConstantInt>(Call.getArgOperand(3))->getZExtValue();
Assert(NumCallArgs >= 0,
@@ -2364,7 +2398,7 @@ void Verifier::visitFunction(const Function &F) {
bool IsIntrinsic = F.isIntrinsic();
// Check function attributes.
- verifyFunctionAttrs(FT, Attrs, &F, IsIntrinsic);
+ verifyFunctionAttrs(FT, Attrs, &F, IsIntrinsic, /* IsInlineAsm */ false);
// On function declarations/definitions, we do not support the builtin
// attribute. We do not check this in VerifyFunctionAttrs since that is
@@ -2779,6 +2813,7 @@ void Verifier::visitCallBrInst(CallBrInst &CBI) {
Assert(ArgBBs.count(BB), "Indirect label missing from arglist.", &CBI);
}
+ verifyInlineAsmCall(CBI);
visitTerminator(CBI);
}
@@ -3123,7 +3158,7 @@ void Verifier::visitCallBase(CallBase &Call) {
}
// Verify call attributes.
- verifyFunctionAttrs(FTy, Attrs, &Call, IsIntrinsic);
+ verifyFunctionAttrs(FTy, Attrs, &Call, IsIntrinsic, Call.isInlineAsm());
// Conservatively check the inalloca argument.
// We have a bug if we can find that there is an underlying alloca without
@@ -3316,6 +3351,9 @@ void Verifier::visitCallBase(CallBase &Call) {
"debug info must have a !dbg location",
Call);
+ if (Call.isInlineAsm())
+ verifyInlineAsmCall(Call);
+
visitInstruction(Call);
}
@@ -3345,13 +3383,13 @@ static bool isTypeCongruent(Type *L, Type *R) {
return PL->getAddressSpace() == PR->getAddressSpace();
}
-static AttrBuilder getParameterABIAttributes(unsigned I, AttributeList Attrs) {
+static AttrBuilder getParameterABIAttributes(LLVMContext &C, unsigned I, AttributeList Attrs) {
static const Attribute::AttrKind ABIAttrs[] = {
Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca,
Attribute::InReg, Attribute::StackAlignment, Attribute::SwiftSelf,
Attribute::SwiftAsync, Attribute::SwiftError, Attribute::Preallocated,
Attribute::ByRef};
- AttrBuilder Copy;
+ AttrBuilder Copy(C);
for (auto AK : ABIAttrs) {
Attribute Attr = Attrs.getParamAttrs(I).getAttribute(AK);
if (Attr.isValid())
@@ -3414,12 +3452,12 @@ void Verifier::verifyMustTailCall(CallInst &CI) {
// - Only sret, byval, swiftself, and swiftasync ABI-impacting attributes
// are allowed in swifttailcc call
for (unsigned I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
- AttrBuilder ABIAttrs = getParameterABIAttributes(I, CallerAttrs);
+ AttrBuilder ABIAttrs = getParameterABIAttributes(F->getContext(), I, CallerAttrs);
SmallString<32> Context{CCName, StringRef(" musttail caller")};
verifyTailCCMustTailAttrs(ABIAttrs, Context);
}
for (unsigned I = 0, E = CalleeTy->getNumParams(); I != E; ++I) {
- AttrBuilder ABIAttrs = getParameterABIAttributes(I, CalleeAttrs);
+ AttrBuilder ABIAttrs = getParameterABIAttributes(F->getContext(), I, CalleeAttrs);
SmallString<32> Context{CCName, StringRef(" musttail callee")};
verifyTailCCMustTailAttrs(ABIAttrs, Context);
}
@@ -3446,8 +3484,8 @@ void Verifier::verifyMustTailCall(CallInst &CI) {
// - All ABI-impacting function attributes, such as sret, byval, inreg,
// returned, preallocated, and inalloca, must match.
for (unsigned I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
- AttrBuilder CallerABIAttrs = getParameterABIAttributes(I, CallerAttrs);
- AttrBuilder CalleeABIAttrs = getParameterABIAttributes(I, CalleeAttrs);
+ AttrBuilder CallerABIAttrs = getParameterABIAttributes(F->getContext(), I, CallerAttrs);
+ AttrBuilder CalleeABIAttrs = getParameterABIAttributes(F->getContext(), I, CalleeAttrs);
Assert(CallerABIAttrs == CalleeABIAttrs,
"cannot guarantee tail call due to mismatched ABI impacting "
"function attributes",
@@ -3963,6 +4001,11 @@ void Verifier::visitEHPadPredecessors(Instruction &I) {
"A single unwind edge may only enter one EH pad", TI);
Assert(Seen.insert(FromPad).second,
"EH pad jumps through a cycle of pads", FromPad);
+
+ // This will be diagnosed on the corresponding instruction already. We
+ // need the extra check here to make sure getParentPad() works.
+ Assert(isa<FuncletPadInst>(FromPad) || isa<CatchSwitchInst>(FromPad),
+ "Parent pad must be catchpad/cleanuppad/catchswitch", TI);
}
}
}
@@ -4964,7 +5007,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
// Assert that result type matches wrapped callee.
const Value *Target = StatepointCall->getArgOperand(2);
auto *PT = cast<PointerType>(Target->getType());
- auto *TargetFuncType = cast<FunctionType>(PT->getElementType());
+ auto *TargetFuncType = cast<FunctionType>(PT->getPointerElementType());
Assert(Call.getType() == TargetFuncType->getReturnType(),
"gc.result result type does not match wrapped callee", Call);
break;
@@ -5271,7 +5314,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
PointerType *Op0PtrTy =
cast<PointerType>(Call.getArgOperand(0)->getType());
if (!Op0PtrTy->isOpaque())
- Op0ElemTy = Op0PtrTy->getElementType();
+ Op0ElemTy = Op0PtrTy->getNonOpaquePointerElementType();
break;
}
case Intrinsic::matrix_column_major_store: {
@@ -5285,7 +5328,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
PointerType *Op1PtrTy =
cast<PointerType>(Call.getArgOperand(1)->getType());
if (!Op1PtrTy->isOpaque())
- Op1ElemTy = Op1PtrTy->getElementType();
+ Op1ElemTy = Op1PtrTy->getNonOpaquePointerElementType();
break;
}
default:
@@ -5316,6 +5359,24 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
break;
}
+ case Intrinsic::experimental_vector_splice: {
+ VectorType *VecTy = cast<VectorType>(Call.getType());
+ int64_t Idx = cast<ConstantInt>(Call.getArgOperand(2))->getSExtValue();
+ int64_t KnownMinNumElements = VecTy->getElementCount().getKnownMinValue();
+ if (Call.getParent() && Call.getParent()->getParent()) {
+ AttributeList Attrs = Call.getParent()->getParent()->getAttributes();
+ if (Attrs.hasFnAttr(Attribute::VScaleRange))
+ KnownMinNumElements *= Attrs.getFnAttrs().getVScaleRangeMin();
+ }
+ Assert((Idx < 0 && std::abs(Idx) <= KnownMinNumElements) ||
+ (Idx >= 0 && Idx < KnownMinNumElements),
+ "The splice index exceeds the range [-VL, VL-1] where VL is the "
+ "known minimum number of elements in the vector. For scalable "
+ "vectors the minimum number of elements is determined from "
+ "vscale_range.",
+ &Call);
+ break;
+ }
case Intrinsic::experimental_stepvector: {
VectorType *VecTy = dyn_cast<VectorType>(Call.getType());
Assert(VecTy && VecTy->getScalarType()->isIntegerTy() &&
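To satisfy the new verifyInlineAsmCall() check, frontends attach an elementtype attribute to each indirect asm operand; a hedged C++ sketch (Call, Ctx and Int32Ty assumed to exist):

  // Argument 0 is bound to an indirect ("*m"-style) constraint.
  Call->addParamAttr(0, Attribute::get(Ctx, Attribute::ElementType, Int32Ty));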
diff --git a/llvm/lib/InterfaceStub/ELFObjHandler.cpp b/llvm/lib/InterfaceStub/ELFObjHandler.cpp
index 0d1a864f31ac..cb72f57f7bde 100644
--- a/llvm/lib/InterfaceStub/ELFObjHandler.cpp
+++ b/llvm/lib/InterfaceStub/ELFObjHandler.cpp
@@ -19,7 +19,6 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Process.h"
-using llvm::MemoryBufferRef;
using llvm::object::ELFObjectFile;
using namespace llvm;
diff --git a/llvm/lib/InterfaceStub/IFSHandler.cpp b/llvm/lib/InterfaceStub/IFSHandler.cpp
index e6bf09232ce2..4ccbb18ca04a 100644
--- a/llvm/lib/InterfaceStub/IFSHandler.cpp
+++ b/llvm/lib/InterfaceStub/IFSHandler.cpp
@@ -195,7 +195,7 @@ Expected<std::unique_ptr<IFSStub>> ifs::readIFSFromBuffer(StringRef Buf) {
}
Error ifs::writeIFSToOutputStream(raw_ostream &OS, const IFSStub &Stub) {
- yaml::Output YamlOut(OS, NULL, /*WrapColumn =*/0);
+ yaml::Output YamlOut(OS, nullptr, /*WrapColumn =*/0);
std::unique_ptr<IFSStubTriple> CopyStub(new IFSStubTriple(Stub));
if (Stub.Target.Arch) {
CopyStub->Target.ArchString = std::string(
diff --git a/llvm/lib/InterfaceStub/IFSStub.cpp b/llvm/lib/InterfaceStub/IFSStub.cpp
index 008263f8db9f..1ce7a66869b8 100644
--- a/llvm/lib/InterfaceStub/IFSStub.cpp
+++ b/llvm/lib/InterfaceStub/IFSStub.cpp
@@ -37,7 +37,7 @@ IFSStubTriple::IFSStubTriple(IFSStubTriple const &Stub) : IFSStub() {
Symbols = Stub.Symbols;
}
-IFSStubTriple::IFSStubTriple(IFSStub const &Stub) : IFSStub() {
+IFSStubTriple::IFSStubTriple(IFSStub const &Stub) {
IfsVersion = Stub.IfsVersion;
Target = Stub.Target;
SoName = Stub.SoName;
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 855d0fc8a8be..7694c9848384 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -229,8 +229,7 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
PGOOpt = PGOOptions("", "", "", PGOOptions::NoAction,
PGOOptions::NoCSAction, true);
}
- if (TM)
- TM->setPGOOption(PGOOpt);
+ TM->setPGOOption(PGOOpt);
LoopAnalysisManager LAM;
FunctionAnalysisManager FAM;
@@ -415,6 +414,8 @@ static void codegen(const Config &Conf, TargetMachine *TM,
TM->Options.ObjectFilenameForDebug = Stream->ObjectPathName;
legacy::PassManager CodeGenPasses;
+ TargetLibraryInfoImpl TLII(Triple(Mod.getTargetTriple()));
+ CodeGenPasses.add(new TargetLibraryInfoWrapperPass(TLII));
CodeGenPasses.add(
createImmutableModuleSummaryIndexWrapperPass(&CombinedIndex));
if (Conf.PreCodeGenPassesHook)
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 5c2aaddff4d1..119237bb052e 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -620,6 +620,8 @@ void MCAsmStreamer::emitVersionMin(MCVersionMinType Type, unsigned Major,
static const char *getPlatformName(MachO::PlatformType Type) {
switch (Type) {
+ case MachO::PLATFORM_UNKNOWN: /* silence warning */
+ break;
case MachO::PLATFORM_MACOS: return "macos";
case MachO::PLATFORM_IOS: return "ios";
case MachO::PLATFORM_TVOS: return "tvos";
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index aa4051aa2400..7f639e9c408f 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -978,13 +978,3 @@ void MCContext::reportWarning(SMLoc Loc, const Twine &Msg) {
});
}
}
-
-void MCContext::reportFatalError(SMLoc Loc, const Twine &Msg) {
- reportError(Loc, Msg);
-
- // If we reached here, we are failing ungracefully. Run the interrupt handlers
- // to make sure any special cleanups get done, in particular that we remove
- // files registered with RemoveFileOnSignal.
- sys::RunInterruptHandlers();
- exit(1);
-}
diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
index 1c9cfb9042e2..2cb5a000f88a 100644
--- a/llvm/lib/MC/MCDwarf.cpp
+++ b/llvm/lib/MC/MCDwarf.cpp
@@ -561,7 +561,7 @@ Expected<unsigned> MCDwarfLineTable::tryGetFile(StringRef &Directory,
static bool isRootFile(const MCDwarfFile &RootFile, StringRef &Directory,
StringRef &FileName, Optional<MD5::MD5Result> Checksum) {
- if (RootFile.Name.empty() || RootFile.Name != FileName.data())
+ if (RootFile.Name.empty() || StringRef(RootFile.Name) != FileName)
return false;
return RootFile.Checksum == Checksum;
}
@@ -586,7 +586,7 @@ MCDwarfLineTableHeader::tryGetFile(StringRef &Directory,
trackMD5Usage(Checksum.hasValue());
HasSource = (Source != None);
}
- if (isRootFile(RootFile, Directory, FileName, Checksum) && DwarfVersion >= 5)
+ if (DwarfVersion >= 5 && isRootFile(RootFile, Directory, FileName, Checksum))
return 0;
if (FileNumber == 0) {
// File numbers start with 1 and/or after any file numbers
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index 3edf7a3f49e6..88aeeb980738 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -116,8 +116,16 @@ public:
void emitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override {
getAssembler().getLOHContainer().addDirective(Kind, Args);
}
+ void emitCGProfileEntry(const MCSymbolRefExpr *From,
+ const MCSymbolRefExpr *To, uint64_t Count) override {
+ if (!From->getSymbol().isTemporary() && !To->getSymbol().isTemporary())
+ getAssembler().CGProfile.push_back({From, To, Count});
+ }
void finishImpl() override;
+
+ void finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE);
+ void finalizeCGProfile();
};
} // end anonymous namespace.
@@ -145,7 +153,8 @@ static bool canGoAfterDWARF(const MCSectionMachO &MSec) {
if (SegName == "__DATA" && (SecName == "__nl_symbol_ptr" ||
SecName == "__thread_ptr"))
return true;
-
+ if (SegName == "__LLVM" && SecName == "__cg_profile")
+ return true;
return false;
}
@@ -513,9 +522,40 @@ void MCMachOStreamer::finishImpl() {
}
}
+ finalizeCGProfile();
+
this->MCObjectStreamer::finishImpl();
}
+void MCMachOStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE) {
+ const MCSymbol *S = &SRE->getSymbol();
+ bool Created;
+ getAssembler().registerSymbol(*S, &Created);
+ if (Created)
+ S->setExternal(true);
+}
+
+void MCMachOStreamer::finalizeCGProfile() {
+ MCAssembler &Asm = getAssembler();
+ if (Asm.CGProfile.empty())
+ return;
+ for (MCAssembler::CGProfileEntry &E : Asm.CGProfile) {
+ finalizeCGProfileEntry(E.From);
+ finalizeCGProfileEntry(E.To);
+ }
+ // We can't write the section out until symbol indices are finalized, which
+ // doesn't happen until after section layout. We need to create the section
+ // and set its size now so that it's accounted for in layout.
+ MCSection *CGProfileSection = Asm.getContext().getMachOSection(
+ "__LLVM", "__cg_profile", 0, SectionKind::getMetadata());
+ Asm.registerSection(*CGProfileSection);
+ auto *Frag = new MCDataFragment(CGProfileSection);
+ // For each entry, reserve space for 2 32-bit indices and a 64-bit count.
+ size_t SectionBytes =
+ Asm.CGProfile.size() * (2 * sizeof(uint32_t) + sizeof(uint64_t));
+ Frag->getContents().resize(SectionBytes);
+}
+
MCStreamer *llvm::createMachOStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> &&MAB,
std::unique_ptr<MCObjectWriter> &&OW,
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 6604d7988c4c..ebbbd6ad4e16 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -119,8 +119,31 @@ void MCObjectStreamer::resolvePendingFixups() {
continue;
}
flushPendingLabels(PendingFixup.DF, PendingFixup.DF->getContents().size());
- PendingFixup.Fixup.setOffset(PendingFixup.Sym->getOffset());
- PendingFixup.DF->getFixups().push_back(PendingFixup.Fixup);
+ PendingFixup.Fixup.setOffset(PendingFixup.Sym->getOffset() +
+ PendingFixup.Fixup.getOffset());
+
+ // If the location symbol to relocate is in an MCEncodedFragmentWithFixups,
+ // put the fixup into the location symbol's fragment. Otherwise, put it
+ // into PendingFixup.DF.
+ MCFragment *SymFragment = PendingFixup.Sym->getFragment();
+ switch (SymFragment->getKind()) {
+ case MCFragment::FT_Relaxable:
+ case MCFragment::FT_Dwarf:
+ case MCFragment::FT_PseudoProbe:
+ cast<MCEncodedFragmentWithFixups<8, 1>>(SymFragment)
+ ->getFixups()
+ .push_back(PendingFixup.Fixup);
+ break;
+ case MCFragment::FT_Data:
+ case MCFragment::FT_CVDefRange:
+ cast<MCEncodedFragmentWithFixups<32, 4>>(SymFragment)
+ ->getFixups()
+ .push_back(PendingFixup.Fixup);
+ break;
+ default:
+ PendingFixup.DF->getFixups().push_back(PendingFixup.Fixup);
+ break;
+ }
}
PendingFixups.clear();
}
@@ -816,8 +839,9 @@ MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name,
return None;
}
- PendingFixups.emplace_back(&SRE.getSymbol(), DF,
- MCFixup::create(-1, Expr, Kind, Loc));
+ PendingFixups.emplace_back(
+ &SRE.getSymbol(), DF,
+ MCFixup::create(OffsetVal.getConstant(), Expr, Kind, Loc));
return None;
}
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 705f7159d55b..0cea491f227d 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -159,7 +159,7 @@ private:
int64_t LineNumber;
SMLoc Loc;
unsigned Buf;
- CppHashInfoTy() : Filename(), LineNumber(0), Loc(), Buf(0) {}
+ CppHashInfoTy() : LineNumber(0), Buf(0) {}
};
CppHashInfoTy CppHashInfo;
@@ -1121,11 +1121,8 @@ StringRef AsmParser::parseStringToComma() {
bool AsmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
if (parseExpression(Res))
return true;
- if (Lexer.isNot(AsmToken::RParen))
- return TokError("expected ')' in parentheses expression");
EndLoc = Lexer.getTok().getEndLoc();
- Lex();
- return false;
+ return parseRParen();
}
/// Parse a bracket expression and return it.
@@ -1214,9 +1211,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc,
Lex(); // eat '('.
StringRef VName;
parseIdentifier(VName);
- // eat ')'.
- if (parseToken(AsmToken::RParen,
- "unexpected token in variant, expected ')'"))
+ if (parseRParen())
return true;
Split = std::make_pair(Identifier, VName);
}
@@ -1379,9 +1374,8 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc,
Lex(); // Eat the operator.
if (parseExpression(Res, EndLoc))
return true;
- if (Lexer.isNot(AsmToken::RParen))
- return TokError("expected ')'");
- Lex(); // Eat the operator.
+ if (parseRParen())
+ return true;
Res = getTargetParser().createTargetUnaryExpr(Res, FirstTokenKind, Ctx);
return !Res;
}
@@ -1553,8 +1547,7 @@ bool AsmParser::parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res,
// This is the same behavior as parseParenExpression().
if (ParenDepth - 1 > 0) {
EndLoc = getTok().getEndLoc();
- if (parseToken(AsmToken::RParen,
- "expected ')' in parentheses expression"))
+ if (parseRParen())
return true;
}
}
@@ -5047,15 +5040,7 @@ bool AsmParser::parseDirectiveComm(bool IsLocal) {
// NOTE: a size of zero for a .comm should create a undefined symbol
// but a size of .lcomm creates a bss symbol of size zero.
if (Size < 0)
- return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
- "be less than zero");
-
- // NOTE: The alignment in the directive is a power of 2 value, the assembler
- // may internally end up wanting an alignment in bytes.
- // FIXME: Diagnose overflow.
- if (Pow2Alignment < 0)
- return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
- "alignment, can't be less than zero");
+ return Error(SizeLoc, "size must be non-negative");
Sym->redefineIfPossible();
if (!Sym->isUndefined())
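The repeated ')'-handling above is funnelled through parseRParen(); a sketch of the assumed helper (the exact diagnostic wording is a guess):

  bool MCAsmParser::parseRParen() {
    // Consume a ')' or emit a single, uniform diagnostic.
    return parseToken(AsmToken::RParen, "expected ')'");
  }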
diff --git a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
index 3bc13012c019..308b3842c61e 100644
--- a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -195,6 +195,8 @@ public:
addDirectiveHandler<&DarwinAsmParser::parseMacOSXVersionMin>(
".macosx_version_min");
addDirectiveHandler<&DarwinAsmParser::parseBuildVersion>(".build_version");
+ addDirectiveHandler<&DarwinAsmParser::parseDirectiveCGProfile>(
+ ".cg_profile");
LastVersionDirective = SMLoc();
}
@@ -467,6 +469,7 @@ public:
bool parseSDKVersion(VersionTuple &SDKVersion);
void checkVersion(StringRef Directive, StringRef Arg, SMLoc Loc,
Triple::OSType ExpectedOS);
+ bool parseDirectiveCGProfile(StringRef Directive, SMLoc Loc);
};
} // end anonymous namespace
@@ -1142,6 +1145,8 @@ bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc Loc,
static Triple::OSType getOSTypeFromPlatform(MachO::PlatformType Type) {
switch (Type) {
+ case MachO::PLATFORM_UNKNOWN: /* silence warning */
+ break;
case MachO::PLATFORM_MACOS: return Triple::MacOSX;
case MachO::PLATFORM_IOS: return Triple::IOS;
case MachO::PLATFORM_TVOS: return Triple::TvOS;
@@ -1198,6 +1203,11 @@ bool DarwinAsmParser::parseBuildVersion(StringRef Directive, SMLoc Loc) {
return false;
}
+/// parseDirectiveCGProfile
+/// ::= .cg_profile from, to, count
+bool DarwinAsmParser::parseDirectiveCGProfile(StringRef S, SMLoc Loc) {
+ return MCAsmParserExtension::ParseDirectiveCGProfile(S, Loc);
+}
namespace llvm {
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index e95019c12db7..e814cf003656 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -499,7 +499,8 @@ bool ELFAsmParser::maybeParseUniqueID(int64_t &UniqueID) {
}
static bool hasPrefix(StringRef SectionName, StringRef Prefix) {
- return SectionName.startswith(Prefix) || SectionName == Prefix.drop_back();
+ return SectionName.consume_front(Prefix) &&
+ (SectionName.empty() || SectionName[0] == '.');
}
static bool allowSectionTypeMismatch(const Triple &TT, StringRef SectionName,
@@ -514,7 +515,7 @@ static bool allowSectionTypeMismatch(const Triple &TT, StringRef SectionName,
// MIPS .debug_* sections should have SHT_MIPS_DWARF section type to
// distinguish among sections containing DWARF and ECOFF debug formats,
// but in assembly files these sections have SHT_PROGBITS type.
- return hasPrefix(SectionName, ".debug_") && Type == ELF::SHT_PROGBITS;
+ return SectionName.startswith(".debug_") && Type == ELF::SHT_PROGBITS;
}
return false;
}
@@ -537,19 +538,18 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
int64_t UniqueID = ~0;
// Set the defaults first.
- if (hasPrefix(SectionName, ".rodata.") || SectionName == ".rodata1")
+ if (hasPrefix(SectionName, ".rodata") || SectionName == ".rodata1")
Flags |= ELF::SHF_ALLOC;
else if (SectionName == ".fini" || SectionName == ".init" ||
- hasPrefix(SectionName, ".text."))
+ hasPrefix(SectionName, ".text"))
Flags |= ELF::SHF_ALLOC | ELF::SHF_EXECINSTR;
- else if (hasPrefix(SectionName, ".data.") || SectionName == ".data1" ||
- hasPrefix(SectionName, ".bss.") ||
- hasPrefix(SectionName, ".init_array.") ||
- hasPrefix(SectionName, ".fini_array.") ||
- hasPrefix(SectionName, ".preinit_array."))
+ else if (hasPrefix(SectionName, ".data") || SectionName == ".data1" ||
+ hasPrefix(SectionName, ".bss") ||
+ hasPrefix(SectionName, ".init_array") ||
+ hasPrefix(SectionName, ".fini_array") ||
+ hasPrefix(SectionName, ".preinit_array"))
Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE;
- else if (hasPrefix(SectionName, ".tdata.") ||
- hasPrefix(SectionName, ".tbss."))
+ else if (hasPrefix(SectionName, ".tdata") || hasPrefix(SectionName, ".tbss"))
Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_TLS;
if (getLexer().is(AsmToken::Comma)) {
@@ -620,15 +620,15 @@ EndStmt:
if (TypeName.empty()) {
if (SectionName.startswith(".note"))
Type = ELF::SHT_NOTE;
- else if (hasPrefix(SectionName, ".init_array."))
+ else if (hasPrefix(SectionName, ".init_array"))
Type = ELF::SHT_INIT_ARRAY;
- else if (hasPrefix(SectionName, ".bss."))
+ else if (hasPrefix(SectionName, ".bss"))
Type = ELF::SHT_NOBITS;
- else if (hasPrefix(SectionName, ".tbss."))
+ else if (hasPrefix(SectionName, ".tbss"))
Type = ELF::SHT_NOBITS;
- else if (hasPrefix(SectionName, ".fini_array."))
+ else if (hasPrefix(SectionName, ".fini_array"))
Type = ELF::SHT_FINI_ARRAY;
- else if (hasPrefix(SectionName, ".preinit_array."))
+ else if (hasPrefix(SectionName, ".preinit_array"))
Type = ELF::SHT_PREINIT_ARRAY;
} else {
if (TypeName == "init_array")
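
The reworked hasPrefix above now matches a section name that either equals the prefix or continues with a '.'-separated suffix, which is why the call sites drop their trailing dots (".rodata." becomes ".rodata", and so on). A minimal standalone sketch of the same predicate, assuming std::string_view in place of StringRef:

#include <cassert>
#include <string_view>

// Mirrors the new hasPrefix(): the section either equals the prefix or
// continues with a '.'-separated suffix right after it.
static bool hasPrefix(std::string_view SectionName, std::string_view Prefix) {
  if (SectionName.substr(0, Prefix.size()) != Prefix)
    return false;
  SectionName.remove_prefix(Prefix.size()); // consume_front analogue
  return SectionName.empty() || SectionName.front() == '.';
}

int main() {
  assert(hasPrefix(".rodata", ".rodata"));        // exact match
  assert(hasPrefix(".rodata.str1.1", ".rodata")); // dotted suffix
  assert(!hasPrefix(".rodata1", ".rodata"));      // ".rodata1" handled separately
  assert(!hasPrefix(".text", ".rodata"));
  return 0;
}
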
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
index f1704cef46ac..f9433240743d 100644
--- a/llvm/lib/MC/MCParser/MasmParser.cpp
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
@@ -379,7 +380,7 @@ private:
/// time of assembly
struct tm TM;
- std::vector<bool> EndStatementAtEOFStack;
+ BitVector EndStatementAtEOFStack;
AsmCond TheCondState;
std::vector<AsmCond> TheCondStack;
@@ -424,7 +425,7 @@ private:
int64_t LineNumber;
SMLoc Loc;
unsigned Buf;
- CppHashInfoTy() : Filename(), LineNumber(0), Loc(), Buf(0) {}
+ CppHashInfoTy() : LineNumber(0), Buf(0) {}
};
CppHashInfoTy CppHashInfo;
@@ -1516,11 +1517,8 @@ StringRef MasmParser::parseStringToEndOfStatement() {
bool MasmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
if (parseExpression(Res))
return true;
- if (Lexer.isNot(AsmToken::RParen))
- return TokError("expected ')' in parentheses expression");
EndLoc = Lexer.getTok().getEndLoc();
- Lex();
- return false;
+ return parseRParen();
}
/// Parse a bracket expression and return it.
@@ -1838,9 +1836,8 @@ bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc,
Lex(); // Eat the operator.
if (parseExpression(Res, EndLoc))
return true;
- if (Lexer.isNot(AsmToken::RParen))
- return TokError("expected ')'");
- Lex(); // Eat the operator.
+ if (parseRParen())
+ return true;
Res = getTargetParser().createTargetUnaryExpr(Res, FirstTokenKind, Ctx);
return !Res;
}
@@ -1929,8 +1926,7 @@ bool MasmParser::parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res,
// This is the same behavior as parseParenExpression().
if (ParenDepth - 1 > 0) {
EndLoc = getTok().getEndLoc();
- if (parseToken(AsmToken::RParen,
- "expected ')' in parentheses expression"))
+ if (parseRParen())
return true;
}
}
@@ -3358,8 +3354,7 @@ bool MasmParser::handleMacroInvocation(const MCAsmMacro *M, SMLoc NameLoc) {
}
// Consume the right-parenthesis on the other side of the arguments.
- if (parseToken(AsmToken::RParen, "invoking macro function '" + M->Name +
- "' requires arguments in parentheses"))
+ if (parseRParen())
return true;
// Exit values may require lexing, unfortunately. We construct a new buffer to
@@ -3743,8 +3738,7 @@ bool MasmParser::parseScalarInitializer(unsigned Size,
SmallVector<const MCExpr *, 1> DuplicatedValues;
if (parseToken(AsmToken::LParen,
"parentheses required for 'dup' contents") ||
- parseScalarInstList(Size, DuplicatedValues) ||
- parseToken(AsmToken::RParen, "unmatched parentheses"))
+ parseScalarInstList(Size, DuplicatedValues) || parseRParen())
return true;
for (int i = 0; i < Repetitions; ++i)
@@ -3950,8 +3944,7 @@ bool MasmParser::parseRealInstList(const fltSemantics &Semantics,
SmallVector<APInt, 1> DuplicatedValues;
if (parseToken(AsmToken::LParen,
"parentheses required for 'dup' contents") ||
- parseRealInstList(Semantics, DuplicatedValues) ||
- parseToken(AsmToken::RParen, "unmatched parentheses"))
+ parseRealInstList(Semantics, DuplicatedValues) || parseRParen())
return true;
for (int i = 0; i < Repetitions; ++i)
@@ -4316,8 +4309,7 @@ bool MasmParser::parseStructInstList(
std::vector<StructInitializer> DuplicatedValues;
if (parseToken(AsmToken::LParen,
"parentheses required for 'dup' contents") ||
- parseStructInstList(Structure, DuplicatedValues) ||
- parseToken(AsmToken::RParen, "unmatched parentheses"))
+ parseStructInstList(Structure, DuplicatedValues) || parseRParen())
return true;
for (int i = 0; i < Repetitions; ++i)
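
Each of the hunks above replaces an inline "expect ')'" check with the common parseRParen() helper, so the closing-parenthesis diagnostic is produced in one place. A toy sketch of that kind of token-expectation helper (TokenStream and the error string are invented for the example; the real helper works on the MC lexer and reports through TokError):

#include <cstdio>
#include <string>
#include <vector>

// Toy token stream, invented only to show the helper's shape.
struct TokenStream {
  std::vector<char> Toks;
  size_t Pos = 0;
  char peek() const { return Pos < Toks.size() ? Toks[Pos] : '\0'; }
  void lex() { ++Pos; }
};

// Returns true on error, matching the MC parser convention.
static bool parseRParen(TokenStream &TS, std::string &Err) {
  if (TS.peek() != ')') {
    Err = "expected ')' in parentheses expression";
    return true;
  }
  TS.lex(); // consume the ')'
  return false;
}

int main() {
  TokenStream TS;
  TS.Toks = {'(', '1', ')'};
  TS.lex(); // '('
  TS.lex(); // '1'
  std::string Err;
  if (parseRParen(TS, Err))
    std::printf("error: %s\n", Err.c_str());
  else
    std::printf("balanced parenthesized expression parsed\n");
}
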
diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp
index 7f7380bf810d..2ff4839d3706 100644
--- a/llvm/lib/MC/MCSectionXCOFF.cpp
+++ b/llvm/lib/MC/MCSectionXCOFF.cpp
@@ -34,7 +34,8 @@ void MCSectionXCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
}
if (getKind().isReadOnly()) {
- if (getMappingClass() != XCOFF::XMC_RO)
+ if (getMappingClass() != XCOFF::XMC_RO &&
+ getMappingClass() != XCOFF::XMC_TD)
report_fatal_error("Unhandled storage-mapping class for .rodata csect.");
printCsectDirective(OS);
return;
@@ -70,7 +71,8 @@ void MCSectionXCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
}
if (isCsect() && getMappingClass() == XCOFF::XMC_TD) {
- assert((getKind().isBSSExtern() || getKind().isBSSLocal()) &&
+ assert((getKind().isBSSExtern() || getKind().isBSSLocal() ||
+ getKind().isReadOnlyWithRel()) &&
"Unexepected section kind for toc-data");
printCsectDirective(OS);
return;
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index 9c37a7bebe2a..a14f0de65a9d 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -1348,8 +1348,8 @@ void MCStreamer::emitVersionForTarget(
DarwinTargetVariantTriple->isMacOSX()) {
emitVersionForTarget(*DarwinTargetVariantTriple,
DarwinTargetVariantSDKVersion,
- /*TargetVariantTriple=*/nullptr,
- /*TargetVariantSDKVersion=*/VersionTuple());
+ /*DarwinTargetVariantTriple=*/nullptr,
+ /*DarwinTargetVariantSDKVersion=*/VersionTuple());
emitDarwinTargetVariantBuildVersion(
getMachoBuildVersionPlatformType(Target),
LinkedTargetVersion.getMajor(),
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 16941b1cb727..56bb03ad8d42 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -759,6 +759,23 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm,
computeSymbolTable(Asm, LocalSymbolData, ExternalSymbolData,
UndefinedSymbolData);
+ if (!Asm.CGProfile.empty()) {
+ MCSection *CGProfileSection = Asm.getContext().getMachOSection(
+ "__LLVM", "__cg_profile", 0, SectionKind::getMetadata());
+ MCDataFragment *Frag = dyn_cast_or_null<MCDataFragment>(
+ &*CGProfileSection->getFragmentList().begin());
+ assert(Frag && "call graph profile section not reserved");
+ Frag->getContents().clear();
+ raw_svector_ostream OS(Frag->getContents());
+ for (const MCAssembler::CGProfileEntry &CGPE : Asm.CGProfile) {
+ uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
+ uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
+ support::endian::write(OS, FromIndex, W.Endian);
+ support::endian::write(OS, ToIndex, W.Endian);
+ support::endian::write(OS, CGPE.Count, W.Endian);
+ }
+ }
+
unsigned NumSections = Asm.size();
const MCAssembler::VersionInfoType &VersionInfo =
Layout.getAssembler().getVersionInfo();
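
Each record appended to the new __LLVM,__cg_profile section above is a fixed 16-byte triple: two 32-bit symbol-table indices followed by a 64-bit call count, written in the object writer's endianness. A standalone sketch of that layout, assuming little-endian output and plain integers (CGProfileRecord, writeLE and serialize are invented names for this illustration):

#include <cstdint>
#include <cstdio>
#include <vector>

// One call-graph-profile record as laid out in __LLVM,__cg_profile:
// from-index and to-index (32-bit each), then a 64-bit count.
struct CGProfileRecord {
  uint32_t FromIndex;
  uint32_t ToIndex;
  uint64_t Count;
};

// Append Value to Out in little-endian byte order (assumption: the real
// writer uses support::endian::write with the object writer's endianness).
template <typename T> static void writeLE(std::vector<uint8_t> &Out, T Value) {
  for (unsigned I = 0; I < sizeof(T); ++I)
    Out.push_back(static_cast<uint8_t>(Value >> (8 * I)));
}

static std::vector<uint8_t> serialize(const std::vector<CGProfileRecord> &Recs) {
  std::vector<uint8_t> Out;
  for (const CGProfileRecord &R : Recs) {
    writeLE(Out, R.FromIndex);
    writeLE(Out, R.ToIndex);
    writeLE(Out, R.Count);
  }
  return Out;
}

int main() {
  std::vector<uint8_t> Bytes = serialize({{3, 7, 1024}, {7, 9, 16}});
  std::printf("emitted %zu bytes (%zu records of 16 bytes each)\n",
              Bytes.size(), Bytes.size() / 16);
}
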
diff --git a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp
index 07be7b077bc9..121d320f10e6 100644
--- a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp
+++ b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp
@@ -68,7 +68,8 @@ void LSUnitBase::dump() const {
unsigned LSUnit::dispatch(const InstRef &IR) {
const InstrDesc &Desc = IR.getInstruction()->getDesc();
- unsigned IsMemBarrier = Desc.HasSideEffects;
+ bool IsStoreBarrier = IR.getInstruction()->isAStoreBarrier();
+ bool IsLoadBarrier = IR.getInstruction()->isALoadBarrier();
assert((Desc.MayLoad || Desc.MayStore) && "Not a memory operation!");
if (Desc.MayLoad)
@@ -111,12 +112,12 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
CurrentStoreGroupID = NewGID;
- if (IsMemBarrier)
+ if (IsStoreBarrier)
CurrentStoreBarrierGroupID = NewGID;
if (Desc.MayLoad) {
CurrentLoadGroupID = NewGID;
- if (IsMemBarrier)
+ if (IsLoadBarrier)
CurrentLoadBarrierGroupID = NewGID;
}
@@ -141,7 +142,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
// However that group has already started execution, so we cannot add
// this load to it.
bool ShouldCreateANewGroup =
- IsMemBarrier || !ImmediateLoadDominator ||
+ IsLoadBarrier || !ImmediateLoadDominator ||
CurrentLoadBarrierGroupID == ImmediateLoadDominator ||
ImmediateLoadDominator <= CurrentStoreGroupID ||
getGroup(ImmediateLoadDominator).isExecuting();
@@ -161,7 +162,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
}
// A load barrier may not pass a previous load or load barrier.
- if (IsMemBarrier) {
+ if (IsLoadBarrier) {
if (ImmediateLoadDominator) {
MemoryGroup &LoadGroup = getGroup(ImmediateLoadDominator);
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: ("
@@ -181,7 +182,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
}
CurrentLoadGroupID = NewGID;
- if (IsMemBarrier)
+ if (IsLoadBarrier)
CurrentLoadBarrierGroupID = NewGID;
return NewGID;
}
diff --git a/llvm/lib/MCA/Stages/DispatchStage.cpp b/llvm/lib/MCA/Stages/DispatchStage.cpp
index 5385142698e6..66228bd5a862 100644
--- a/llvm/lib/MCA/Stages/DispatchStage.cpp
+++ b/llvm/lib/MCA/Stages/DispatchStage.cpp
@@ -30,7 +30,7 @@ DispatchStage::DispatchStage(const MCSubtargetInfo &Subtarget,
unsigned MaxDispatchWidth, RetireControlUnit &R,
RegisterFile &F)
: DispatchWidth(MaxDispatchWidth), AvailableEntries(MaxDispatchWidth),
- CarryOver(0U), CarriedOver(), STI(Subtarget), RCU(R), PRF(F) {
+ CarryOver(0U), STI(Subtarget), RCU(R), PRF(F) {
if (!DispatchWidth)
DispatchWidth = Subtarget.getSchedModel().IssueWidth;
}
diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
index fa5c0fc66b9e..abfbc80f17c9 100644
--- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
+++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp
@@ -47,7 +47,7 @@ InOrderIssueStage::InOrderIssueStage(const MCSubtargetInfo &STI,
RegisterFile &PRF, CustomBehaviour &CB,
LSUnit &LSU)
: STI(STI), PRF(PRF), RM(STI.getSchedModel()), CB(CB), LSU(LSU),
- NumIssued(), SI(), CarryOver(), Bandwidth(), LastWriteBackCycle() {}
+ NumIssued(), CarryOver(), Bandwidth(), LastWriteBackCycle() {}
unsigned InOrderIssueStage::getIssueWidth() const {
return STI.getSchedModel().IssueWidth;
diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp
index 5492692445e7..9a4ef055faa4 100644
--- a/llvm/lib/Object/Archive.cpp
+++ b/llvm/lib/Object/Archive.cpp
@@ -22,6 +22,7 @@
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
@@ -38,9 +39,6 @@ using namespace llvm;
using namespace object;
using namespace llvm::support::endian;
-const char Magic[] = "!<arch>\n";
-const char ThinMagic[] = "!<thin>\n";
-
void Archive::anchor() {}
static Error malformedError(Twine Msg) {
@@ -49,27 +47,62 @@ static Error malformedError(Twine Msg) {
object_error::parse_failed);
}
+static Error
+createMemberHeaderParseError(const AbstractArchiveMemberHeader *ArMemHeader,
+ const char *RawHeaderPtr, uint64_t Size) {
+ StringRef Msg("remaining size of archive too small for next archive "
+ "member header ");
+
+ Expected<StringRef> NameOrErr = ArMemHeader->getName(Size);
+ if (NameOrErr)
+ return malformedError(Msg + "for " + *NameOrErr);
+
+ consumeError(NameOrErr.takeError());
+ uint64_t Offset = RawHeaderPtr - ArMemHeader->Parent->getData().data();
+ return malformedError(Msg + "at offset " + Twine(Offset));
+}
+
+template <class T, std::size_t N>
+StringRef getFieldRawString(const T (&Field)[N]) {
+ return StringRef(Field, N).rtrim(" ");
+}
+
+template <class T>
+StringRef CommonArchiveMemberHeader<T>::getRawAccessMode() const {
+ return getFieldRawString(ArMemHdr->AccessMode);
+}
+
+template <class T>
+StringRef CommonArchiveMemberHeader<T>::getRawLastModified() const {
+ return getFieldRawString(ArMemHdr->LastModified);
+}
+
+template <class T> StringRef CommonArchiveMemberHeader<T>::getRawUID() const {
+ return getFieldRawString(ArMemHdr->UID);
+}
+
+template <class T> StringRef CommonArchiveMemberHeader<T>::getRawGID() const {
+ return getFieldRawString(ArMemHdr->GID);
+}
+
+template <class T> uint64_t CommonArchiveMemberHeader<T>::getOffset() const {
+ return reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data();
+}
+
+template class object::CommonArchiveMemberHeader<UnixArMemHdrType>;
+template class object::CommonArchiveMemberHeader<BigArMemHdrType>;
+
ArchiveMemberHeader::ArchiveMemberHeader(const Archive *Parent,
const char *RawHeaderPtr,
uint64_t Size, Error *Err)
- : Parent(Parent),
- ArMemHdr(reinterpret_cast<const ArMemHdrType *>(RawHeaderPtr)) {
+ : CommonArchiveMemberHeader<UnixArMemHdrType>(
+ Parent, reinterpret_cast<const UnixArMemHdrType *>(RawHeaderPtr)) {
if (RawHeaderPtr == nullptr)
return;
ErrorAsOutParameter ErrAsOutParam(Err);
- if (Size < sizeof(ArMemHdrType)) {
- if (Err) {
- std::string Msg("remaining size of archive too small for next archive "
- "member header ");
- Expected<StringRef> NameOrErr = getName(Size);
- if (!NameOrErr) {
- consumeError(NameOrErr.takeError());
- uint64_t Offset = RawHeaderPtr - Parent->getData().data();
- *Err = malformedError(Msg + "at offset " + Twine(Offset));
- } else
- *Err = malformedError(Msg + "for " + NameOrErr.get());
- }
+ if (Size < getSizeOf()) {
+ *Err = createMemberHeaderParseError(this, RawHeaderPtr, Size);
return;
}
if (ArMemHdr->Terminator[0] != '`' || ArMemHdr->Terminator[1] != '\n') {
@@ -94,6 +127,19 @@ ArchiveMemberHeader::ArchiveMemberHeader(const Archive *Parent,
}
}
+BigArchiveMemberHeader::BigArchiveMemberHeader(const Archive *Parent,
+ const char *RawHeaderPtr,
+ uint64_t Size, Error *Err)
+ : CommonArchiveMemberHeader<BigArMemHdrType>(
+ Parent, reinterpret_cast<const BigArMemHdrType *>(RawHeaderPtr)) {
+ if (RawHeaderPtr == nullptr)
+ return;
+ ErrorAsOutParameter ErrAsOutParam(Err);
+
+ if (Size < getSizeOf())
+ *Err = createMemberHeaderParseError(this, RawHeaderPtr, Size);
+}
+
// This gets the raw name from the ArMemHdr->Name field and checks that it is
// valid for the kind of archive. If it is not valid it returns an Error.
Expected<StringRef> ArchiveMemberHeader::getRawName() const {
@@ -121,7 +167,69 @@ Expected<StringRef> ArchiveMemberHeader::getRawName() const {
return StringRef(ArMemHdr->Name, end);
}
-// This gets the name looking up long names. Size is the size of the archive
+Expected<uint64_t>
+getArchiveMemberDecField(Twine FieldName, const StringRef RawField,
+ const Archive *Parent,
+ const AbstractArchiveMemberHeader *MemHeader) {
+ uint64_t Value;
+ if (RawField.getAsInteger(10, Value)) {
+ uint64_t Offset = MemHeader->getOffset();
+ return malformedError("characters in " + FieldName +
+ " field in archive member header are not "
+ "all decimal numbers: '" +
+ RawField +
+ "' for the archive "
+ "member header at offset " +
+ Twine(Offset));
+ }
+ return Value;
+}
+
+Expected<uint64_t>
+getArchiveMemberOctField(Twine FieldName, const StringRef RawField,
+ const Archive *Parent,
+ const AbstractArchiveMemberHeader *MemHeader) {
+ uint64_t Value;
+ if (RawField.getAsInteger(8, Value)) {
+ uint64_t Offset = MemHeader->getOffset();
+ return malformedError("characters in " + FieldName +
+ " field in archive member header are not "
+ "all octal numbers: '" +
+ RawField +
+ "' for the archive "
+ "member header at offset " +
+ Twine(Offset));
+ }
+ return Value;
+}
+
+Expected<StringRef> BigArchiveMemberHeader::getRawName() const {
+ Expected<uint64_t> NameLenOrErr = getArchiveMemberDecField(
+ "NameLen", getFieldRawString(ArMemHdr->NameLen), Parent, this);
+ if (!NameLenOrErr)
+ // TODO: Out-of-line.
+ return NameLenOrErr.takeError();
+ uint64_t NameLen = NameLenOrErr.get();
+
+ // If the name length is odd, pad with '\0' to get an even length. After
+ // padding, there is the name terminator "`\n".
+ uint64_t NameLenWithPadding = alignTo(NameLen, 2);
+ StringRef NameTerminator = "`\n";
+ StringRef NameStringWithNameTerminator =
+ StringRef(ArMemHdr->Name, NameLenWithPadding + NameTerminator.size());
+ if (!NameStringWithNameTerminator.endswith(NameTerminator)) {
+ uint64_t Offset =
+ reinterpret_cast<const char *>(ArMemHdr->Name + NameLenWithPadding) -
+ Parent->getData().data();
+ // TODO: Out-of-line.
+ return malformedError(
+        "name does not have name terminator \"`\\n\" for archive member "
+ "header at offset " +
+ Twine(Offset));
+ }
+ return StringRef(ArMemHdr->Name, NameLen);
+}
+
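
In the AIX big-archive layout handled by getRawName above, the member name immediately follows the fixed-size header: NameLen bytes of name, a '\0' pad byte if NameLen is odd, and then the terminator "`\n". A hypothetical standalone check of that layout over a raw buffer (readBigArchiveName and alignTo2 are invented for the sketch):

#include <cstdint>
#include <cstdio>
#include <optional>
#include <string_view>

// Round up to the next even value, like alignTo(N, 2).
static uint64_t alignTo2(uint64_t N) { return (N + 1) & ~uint64_t(1); }

// Extract a big-archive member name from the bytes that follow the
// fixed-length member header: NameLen name bytes, optional pad byte to an
// even length, then the "`\n" terminator.
static std::optional<std::string_view> readBigArchiveName(std::string_view Buf,
                                                          uint64_t NameLen) {
  uint64_t Padded = alignTo2(NameLen);
  std::string_view Terminator = "`\n";
  if (Buf.size() < Padded + Terminator.size())
    return std::nullopt;
  if (Buf.substr(Padded, Terminator.size()) != Terminator)
    return std::nullopt; // missing terminator -> malformed member header
  return Buf.substr(0, NameLen);
}

int main() {
  // "lib.o" has length 5, so one pad byte precedes the terminator.
  std::string_view Raw("lib.o\0`\ncontents...", 19);
  if (auto Name = readBigArchiveName(Raw, 5))
    std::printf("member name: %.*s\n", (int)Name->size(), Name->data());
}
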
// member including the header, so the size of any name following the header
// is checked to make sure it does not overflow.
Expected<StringRef> ArchiveMemberHeader::getName(uint64_t Size) const {
@@ -129,7 +237,7 @@ Expected<StringRef> ArchiveMemberHeader::getName(uint64_t Size) const {
// This can be called from the ArchiveMemberHeader constructor when the
// archive header is truncated to produce an error message with the name.
// Make sure the name field is not truncated.
- if (Size < offsetof(ArMemHdrType, Name) + sizeof(ArMemHdr->Name)) {
+ if (Size < offsetof(UnixArMemHdrType, Name) + sizeof(ArMemHdr->Name)) {
uint64_t ArchiveOffset =
reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data();
return malformedError("archive header truncated before the name field "
@@ -224,126 +332,133 @@ Expected<StringRef> ArchiveMemberHeader::getName(uint64_t Size) const {
return Name.drop_back(1);
}
+Expected<StringRef> BigArchiveMemberHeader::getName(uint64_t Size) const {
+ return getRawName();
+}
+
Expected<uint64_t> ArchiveMemberHeader::getSize() const {
- uint64_t Ret;
- if (StringRef(ArMemHdr->Size, sizeof(ArMemHdr->Size))
- .rtrim(" ")
- .getAsInteger(10, Ret)) {
- std::string Buf;
- raw_string_ostream OS(Buf);
- OS.write_escaped(
- StringRef(ArMemHdr->Size, sizeof(ArMemHdr->Size)).rtrim(" "));
- OS.flush();
- uint64_t Offset =
- reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data();
- return malformedError("characters in size field in archive header are not "
- "all decimal numbers: '" +
- Buf +
- "' for archive "
- "member header at offset " +
- Twine(Offset));
- }
- return Ret;
+ return getArchiveMemberDecField("size", getFieldRawString(ArMemHdr->Size),
+ Parent, this);
}
-Expected<sys::fs::perms> ArchiveMemberHeader::getAccessMode() const {
- unsigned Ret;
- if (StringRef(ArMemHdr->AccessMode, sizeof(ArMemHdr->AccessMode))
- .rtrim(' ')
- .getAsInteger(8, Ret)) {
- std::string Buf;
- raw_string_ostream OS(Buf);
- OS.write_escaped(
- StringRef(ArMemHdr->AccessMode, sizeof(ArMemHdr->AccessMode))
- .rtrim(" "));
- OS.flush();
- uint64_t Offset =
- reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data();
- return malformedError("characters in AccessMode field in archive header "
- "are not all decimal numbers: '" +
- Buf + "' for the archive member header at offset " +
- Twine(Offset));
- }
- return static_cast<sys::fs::perms>(Ret);
+Expected<uint64_t> BigArchiveMemberHeader::getSize() const {
+ Expected<uint64_t> SizeOrErr = getArchiveMemberDecField(
+ "size", getFieldRawString(ArMemHdr->Size), Parent, this);
+ if (!SizeOrErr)
+ return SizeOrErr.takeError();
+
+ Expected<uint64_t> NameLenOrErr = getRawNameSize();
+ if (!NameLenOrErr)
+ return NameLenOrErr.takeError();
+
+ return *SizeOrErr + alignTo(*NameLenOrErr, 2);
+}
+
+Expected<uint64_t> BigArchiveMemberHeader::getRawNameSize() const {
+ return getArchiveMemberDecField(
+ "NameLen", getFieldRawString(ArMemHdr->NameLen), Parent, this);
+}
+
+Expected<uint64_t> BigArchiveMemberHeader::getNextOffset() const {
+ return getArchiveMemberDecField(
+ "NextOffset", getFieldRawString(ArMemHdr->NextOffset), Parent, this);
+}
+
+Expected<sys::fs::perms> AbstractArchiveMemberHeader::getAccessMode() const {
+ Expected<uint64_t> AccessModeOrErr =
+ getArchiveMemberOctField("AccessMode", getRawAccessMode(), Parent, this);
+ if (!AccessModeOrErr)
+ return AccessModeOrErr.takeError();
+ return static_cast<sys::fs::perms>(*AccessModeOrErr);
}
Expected<sys::TimePoint<std::chrono::seconds>>
-ArchiveMemberHeader::getLastModified() const {
- unsigned Seconds;
- if (StringRef(ArMemHdr->LastModified, sizeof(ArMemHdr->LastModified))
- .rtrim(' ')
- .getAsInteger(10, Seconds)) {
- std::string Buf;
- raw_string_ostream OS(Buf);
- OS.write_escaped(
- StringRef(ArMemHdr->LastModified, sizeof(ArMemHdr->LastModified))
- .rtrim(" "));
- OS.flush();
- uint64_t Offset =
- reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data();
- return malformedError("characters in LastModified field in archive header "
- "are not all decimal numbers: '" +
- Buf + "' for the archive member header at offset " +
- Twine(Offset));
- }
+AbstractArchiveMemberHeader::getLastModified() const {
+ Expected<uint64_t> SecondsOrErr = getArchiveMemberDecField(
+ "LastModified", getRawLastModified(), Parent, this);
- return sys::toTimePoint(Seconds);
+ if (!SecondsOrErr)
+ return SecondsOrErr.takeError();
+
+ return sys::toTimePoint(*SecondsOrErr);
}
-Expected<unsigned> ArchiveMemberHeader::getUID() const {
- unsigned Ret;
- StringRef User = StringRef(ArMemHdr->UID, sizeof(ArMemHdr->UID)).rtrim(' ');
+Expected<unsigned> AbstractArchiveMemberHeader::getUID() const {
+ StringRef User = getRawUID();
if (User.empty())
return 0;
- if (User.getAsInteger(10, Ret)) {
- std::string Buf;
- raw_string_ostream OS(Buf);
- OS.write_escaped(User);
- OS.flush();
- uint64_t Offset =
- reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data();
- return malformedError("characters in UID field in archive header "
- "are not all decimal numbers: '" +
- Buf + "' for the archive member header at offset " +
- Twine(Offset));
- }
- return Ret;
+ return getArchiveMemberDecField("UID", User, Parent, this);
}
-Expected<unsigned> ArchiveMemberHeader::getGID() const {
- unsigned Ret;
- StringRef Group = StringRef(ArMemHdr->GID, sizeof(ArMemHdr->GID)).rtrim(' ');
+Expected<unsigned> AbstractArchiveMemberHeader::getGID() const {
+ StringRef Group = getRawGID();
if (Group.empty())
return 0;
- if (Group.getAsInteger(10, Ret)) {
- std::string Buf;
- raw_string_ostream OS(Buf);
- OS.write_escaped(Group);
- OS.flush();
- uint64_t Offset =
- reinterpret_cast<const char *>(ArMemHdr) - Parent->getData().data();
- return malformedError("characters in GID field in archive header "
- "are not all decimal numbers: '" +
- Buf + "' for the archive member header at offset " +
- Twine(Offset));
+ return getArchiveMemberDecField("GID", Group, Parent, this);
+}
+
+Expected<bool> ArchiveMemberHeader::isThin() const {
+ Expected<StringRef> NameOrErr = getRawName();
+ if (!NameOrErr)
+ return NameOrErr.takeError();
+ StringRef Name = NameOrErr.get();
+ return Parent->isThin() && Name != "/" && Name != "//" && Name != "/SYM64/";
+}
+
+Expected<const char *> ArchiveMemberHeader::getNextChildLoc() const {
+ uint64_t Size = getSizeOf();
+ Expected<bool> isThinOrErr = isThin();
+ if (!isThinOrErr)
+ return isThinOrErr.takeError();
+
+ bool isThin = isThinOrErr.get();
+ if (!isThin) {
+ Expected<uint64_t> MemberSize = getSize();
+ if (!MemberSize)
+ return MemberSize.takeError();
+
+ Size += MemberSize.get();
}
- return Ret;
+
+ // If Size is odd, add 1 to make it even.
+ const char *NextLoc =
+ reinterpret_cast<const char *>(ArMemHdr) + alignTo(Size, 2);
+
+ if (NextLoc == Parent->getMemoryBufferRef().getBufferEnd())
+ return nullptr;
+
+ return NextLoc;
+}
+
+Expected<const char *> BigArchiveMemberHeader::getNextChildLoc() const {
+ if (getOffset() ==
+ static_cast<const BigArchive *>(Parent)->getLastChildOffset())
+ return nullptr;
+
+ Expected<uint64_t> NextOffsetOrErr = getNextOffset();
+ if (!NextOffsetOrErr)
+ return NextOffsetOrErr.takeError();
+ return Parent->getData().data() + NextOffsetOrErr.get();
}
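
The two getNextChildLoc() overloads above find the next member differently: classic archives step past the header plus the member size, rounded up to an even offset, while AIX big archives follow the explicit NextOffset field in each member header. A standalone sketch of the classic computation, assuming the 60-byte Unix ar member header and no embedded long name (nextMemberOffset is an invented helper):

#include <cstdint>
#include <cstdio>

// Classic Unix ar layout assumption: a 60-byte member header, MemberSize
// bytes of data, and members always starting at even offsets, so an odd
// size is followed by one '\n' pad byte.
static uint64_t nextMemberOffset(uint64_t HeaderOffset, uint64_t MemberSize) {
  const uint64_t HeaderSize = 60;
  uint64_t End = HeaderOffset + HeaderSize + MemberSize;
  return (End + 1) & ~uint64_t(1); // alignTo(End, 2)
}

int main() {
  // A 7-byte member whose header starts at offset 8 (right after the
  // "!<arch>\n" magic) ends at 75, so the next header begins at 76.
  std::printf("next member header at offset %llu\n",
              (unsigned long long)nextMemberOffset(8, 7));
}
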
Archive::Child::Child(const Archive *Parent, StringRef Data,
uint16_t StartOfFile)
- : Parent(Parent), Header(Parent, Data.data(), Data.size(), nullptr),
- Data(Data), StartOfFile(StartOfFile) {}
+ : Parent(Parent), Data(Data), StartOfFile(StartOfFile) {
+ Header = Parent->createArchiveMemberHeader(Data.data(), Data.size(), nullptr);
+}
Archive::Child::Child(const Archive *Parent, const char *Start, Error *Err)
- : Parent(Parent),
- Header(Parent, Start,
- Parent
- ? Parent->getData().size() - (Start - Parent->getData().data())
- : 0,
- Err) {
- if (!Start)
+ : Parent(Parent) {
+ if (!Start) {
+ Header = nullptr;
return;
+ }
+
+ Header = Parent->createArchiveMemberHeader(
+ Start,
+ Parent ? Parent->getData().size() - (Start - Parent->getData().data())
+ : 0,
+ Err);
// If we are pointed to real data (Start is not a nullptr), then there must be
// a non-null Err pointer available to report malformed data on. Only in
@@ -358,7 +473,7 @@ Archive::Child::Child(const Archive *Parent, const char *Start, Error *Err)
if (*Err)
return;
- uint64_t Size = Header.getSizeOf();
+ uint64_t Size = Header->getSizeOf();
Data = StringRef(Start, Size);
Expected<bool> isThinOrErr = isThinMember();
if (!isThinOrErr) {
@@ -377,7 +492,7 @@ Archive::Child::Child(const Archive *Parent, const char *Start, Error *Err)
}
// Setup StartOfFile and PaddingBytes.
- StartOfFile = Header.getSizeOf();
+ StartOfFile = Header->getSizeOf();
// Don't include attached name.
Expected<StringRef> NameOrErr = getRawName();
if (!NameOrErr) {
@@ -385,17 +500,20 @@ Archive::Child::Child(const Archive *Parent, const char *Start, Error *Err)
return;
}
StringRef Name = NameOrErr.get();
- if (Name.startswith("#1/")) {
+
+ if (Parent->kind() == Archive::K_AIXBIG) {
+ // The actual start of the file is after the name and any necessary
+ // even-alignment padding.
+ StartOfFile += ((Name.size() + 1) >> 1) << 1;
+ } else if (Name.startswith("#1/")) {
uint64_t NameSize;
- if (Name.substr(3).rtrim(' ').getAsInteger(10, NameSize)) {
- std::string Buf;
- raw_string_ostream OS(Buf);
- OS.write_escaped(Name.substr(3).rtrim(' '));
- OS.flush();
+ StringRef RawNameSize = Name.substr(3).rtrim(' ');
+ if (RawNameSize.getAsInteger(10, NameSize)) {
uint64_t Offset = Start - Parent->getData().data();
*Err = malformedError("long name length characters after the #1/ are "
"not all decimal numbers: '" +
- Buf + "' for archive member header at offset " +
+ RawNameSize +
+ "' for archive member header at offset " +
Twine(Offset));
return;
}
@@ -405,21 +523,15 @@ Archive::Child::Child(const Archive *Parent, const char *Start, Error *Err)
Expected<uint64_t> Archive::Child::getSize() const {
if (Parent->IsThin)
- return Header.getSize();
+ return Header->getSize();
return Data.size() - StartOfFile;
}
Expected<uint64_t> Archive::Child::getRawSize() const {
- return Header.getSize();
+ return Header->getSize();
}
-Expected<bool> Archive::Child::isThinMember() const {
- Expected<StringRef> NameOrErr = Header.getRawName();
- if (!NameOrErr)
- return NameOrErr.takeError();
- StringRef Name = NameOrErr.get();
- return Parent->IsThin && Name != "/" && Name != "//" && Name != "/SYM64/";
-}
+Expected<bool> Archive::Child::isThinMember() const { return Header->isThin(); }
Expected<std::string> Archive::Child::getFullName() const {
Expected<bool> isThin = isThinMember();
@@ -462,15 +574,14 @@ Expected<StringRef> Archive::Child::getBuffer() const {
}
Expected<Archive::Child> Archive::Child::getNext() const {
- size_t SpaceToSkip = Data.size();
- // If it's odd, add 1 to make it even.
- if (SpaceToSkip & 1)
- ++SpaceToSkip;
+ Expected<const char *> NextLocOrErr = Header->getNextChildLoc();
+ if (!NextLocOrErr)
+ return NextLocOrErr.takeError();
- const char *NextLoc = Data.data() + SpaceToSkip;
+ const char *NextLoc = *NextLocOrErr;
// Check to see if this is at the end of the archive.
- if (NextLoc == Parent->Data.getBufferEnd())
+ if (NextLoc == nullptr)
return Child(nullptr, nullptr, nullptr);
// Check to see if this is past the end of the archive.
@@ -505,7 +616,8 @@ Expected<StringRef> Archive::Child::getName() const {
if (!RawSizeOrErr)
return RawSizeOrErr.takeError();
uint64_t RawSize = RawSizeOrErr.get();
- Expected<StringRef> NameOrErr = Header.getName(Header.getSizeOf() + RawSize);
+ Expected<StringRef> NameOrErr =
+ Header->getName(Header->getSizeOf() + RawSize);
if (!NameOrErr)
return NameOrErr.takeError();
StringRef Name = NameOrErr.get();
@@ -537,12 +649,39 @@ Archive::Child::getAsBinary(LLVMContext *Context) const {
Expected<std::unique_ptr<Archive>> Archive::create(MemoryBufferRef Source) {
Error Err = Error::success();
- std::unique_ptr<Archive> Ret(new Archive(Source, Err));
+ std::unique_ptr<Archive> Ret;
+ StringRef Buffer = Source.getBuffer();
+
+ if (Buffer.startswith(BigArchiveMagic))
+ Ret = std::make_unique<BigArchive>(Source, Err);
+ else
+ Ret = std::make_unique<Archive>(Source, Err);
+
if (Err)
return std::move(Err);
return std::move(Ret);
}
+std::unique_ptr<AbstractArchiveMemberHeader>
+Archive::createArchiveMemberHeader(const char *RawHeaderPtr, uint64_t Size,
+ Error *Err) const {
+ ErrorAsOutParameter ErrAsOutParam(Err);
+ if (kind() != K_AIXBIG)
+ return std::make_unique<ArchiveMemberHeader>(this, RawHeaderPtr, Size, Err);
+ return std::make_unique<BigArchiveMemberHeader>(this, RawHeaderPtr, Size,
+ Err);
+}
+
+uint64_t Archive::getArchiveMagicLen() const {
+ if (isThin())
+ return sizeof(ThinArchiveMagic) - 1;
+
+  if (kind() == K_AIXBIG)
+ return sizeof(BigArchiveMagic) - 1;
+
+ return sizeof(ArchiveMagic) - 1;
+}
+
void Archive::setFirstRegular(const Child &C) {
FirstRegularData = C.Data;
FirstRegularStartOfFile = C.StartOfFile;
@@ -553,10 +692,14 @@ Archive::Archive(MemoryBufferRef Source, Error &Err)
ErrorAsOutParameter ErrAsOutParam(&Err);
StringRef Buffer = Data.getBuffer();
// Check for sufficient magic.
- if (Buffer.startswith(ThinMagic)) {
+ if (Buffer.startswith(ThinArchiveMagic)) {
IsThin = true;
- } else if (Buffer.startswith(Magic)) {
+ } else if (Buffer.startswith(ArchiveMagic)) {
+ IsThin = false;
+ } else if (Buffer.startswith(BigArchiveMagic)) {
+ Format = K_AIXBIG;
IsThin = false;
+ return;
} else {
Err = make_error<GenericBinaryError>("file too small to be an archive",
object_error::invalid_file_type);
@@ -788,7 +931,7 @@ Archive::child_iterator Archive::child_begin(Error &Err,
return child_iterator::itr(
Child(this, FirstRegularData, FirstRegularStartOfFile), Err);
- const char *Loc = Data.getBufferStart() + strlen(Magic);
+ const char *Loc = Data.getBufferStart() + getFirstChildOffset();
Child C(this, Loc, &Err);
if (Err)
return child_end();
@@ -997,6 +1140,38 @@ Expected<Optional<Archive::Child>> Archive::findSym(StringRef name) const {
}
// Returns true if archive file contains no member file.
-bool Archive::isEmpty() const { return Data.getBufferSize() == 8; }
+bool Archive::isEmpty() const {
+ return Data.getBufferSize() == getArchiveMagicLen();
+}
bool Archive::hasSymbolTable() const { return !SymbolTable.empty(); }
+
+BigArchive::BigArchive(MemoryBufferRef Source, Error &Err)
+ : Archive(Source, Err) {
+ ErrorAsOutParameter ErrAsOutParam(&Err);
+ StringRef Buffer = Data.getBuffer();
+ ArFixLenHdr = reinterpret_cast<const FixLenHdr *>(Buffer.data());
+
+ StringRef RawOffset = getFieldRawString(ArFixLenHdr->FirstChildOffset);
+ if (RawOffset.getAsInteger(10, FirstChildOffset))
+ // TODO: Out-of-line.
+ Err = malformedError("malformed AIX big archive: first member offset \"" +
+ RawOffset + "\" is not a number");
+
+ RawOffset = getFieldRawString(ArFixLenHdr->LastChildOffset);
+ if (RawOffset.getAsInteger(10, LastChildOffset))
+ // TODO: Out-of-line.
+ Err = malformedError("malformed AIX big archive: last member offset \"" +
+ RawOffset + "\" is not a number");
+
+ child_iterator I = child_begin(Err, false);
+ if (Err)
+ return;
+ child_iterator E = child_end();
+ if (I == E) {
+ Err = Error::success();
+ return;
+ }
+ setFirstRegular(*I);
+ Err = Error::success();
+}
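
Every numeric field parsed in this file -- member sizes, UIDs, and the big archive's FirstChildOffset/LastChildOffset read just above -- is fixed-width, space-padded ASCII decimal (octal for the access mode). A standalone sketch of the getFieldRawString + getAsInteger pattern, assuming std::string_view and base-10 fields (rtrimSpaces and parseDecField are invented names):

#include <cstdint>
#include <cstdio>
#include <optional>
#include <string_view>

// Trim trailing spaces from a fixed-width header field, like
// getFieldRawString() does with StringRef::rtrim(" ").
static std::string_view rtrimSpaces(std::string_view Field) {
  while (!Field.empty() && Field.back() == ' ')
    Field.remove_suffix(1);
  return Field;
}

// Parse a space-padded ASCII decimal field; returns nullopt if any
// character is not a digit, mirroring the malformedError paths above.
static std::optional<uint64_t> parseDecField(std::string_view Field) {
  Field = rtrimSpaces(Field);
  if (Field.empty())
    return std::nullopt;
  uint64_t Value = 0;
  for (char C : Field) {
    if (C < '0' || C > '9')
      return std::nullopt;
    Value = Value * 10 + (C - '0');
  }
  return Value;
}

int main() {
  // E.g. a 20-byte "first child offset" field from the fixed-length header.
  if (auto Off = parseDecField("128                 "))
    std::printf("first child member header at offset %llu\n",
                (unsigned long long)*Off);
}
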
diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp
index da8bcec7f3d4..053b3dafed95 100644
--- a/llvm/lib/Object/ArchiveWriter.cpp
+++ b/llvm/lib/Object/ArchiveWriter.cpp
@@ -137,6 +137,7 @@ static bool isBSDLike(object::Archive::Kind Kind) {
case object::Archive::K_DARWIN:
case object::Archive::K_DARWIN64:
return true;
+ case object::Archive::K_AIXBIG:
case object::Archive::K_COFF:
break;
}
@@ -199,6 +200,7 @@ static bool is64BitKind(object::Archive::Kind Kind) {
case object::Archive::K_BSD:
case object::Archive::K_DARWIN:
case object::Archive::K_COFF:
+ case object::Archive::K_AIXBIG:
return false;
case object::Archive::K_DARWIN64:
case object::Archive::K_GNU64:
diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp
index 093ae1bbc267..dea3d90d3560 100644
--- a/llvm/lib/Object/IRSymtab.cpp
+++ b/llvm/lib/Object/IRSymtab.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/DataLayout.h"
@@ -22,13 +23,13 @@
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
-#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Object/IRObjectFile.h"
#include "llvm/Object/ModuleSymbolTable.h"
#include "llvm/Object/SymbolicFile.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/StringSaver.h"
#include "llvm/Support/VCSRevision.h"
@@ -41,6 +42,10 @@
using namespace llvm;
using namespace irsymtab;
+cl::opt<bool> DisableBitcodeVersionUpgrade(
+ "disable-bitcode-version-upgrade", cl::init(false), cl::Hidden,
+ cl::desc("Disable automatic bitcode upgrade for version mismatch"));
+
static const char *PreservedSymbols[] = {
#define HANDLE_LIBCALL(code, name) name,
#include "llvm/IR/RuntimeLibcalls.def"
@@ -402,20 +407,22 @@ Expected<FileContents> irsymtab::readBitcode(const BitcodeFileContents &BFC) {
return make_error<StringError>("Bitcode file does not contain any modules",
inconvertibleErrorCode());
- if (BFC.StrtabForSymtab.empty() ||
- BFC.Symtab.size() < sizeof(storage::Header))
- return upgrade(BFC.Mods);
-
- // We cannot use the regular reader to read the version and producer, because
- // it will expect the header to be in the current format. The only thing we
- // can rely on is that the version and producer will be present as the first
- // struct elements.
- auto *Hdr = reinterpret_cast<const storage::Header *>(BFC.Symtab.data());
- unsigned Version = Hdr->Version;
- StringRef Producer = Hdr->Producer.get(BFC.StrtabForSymtab);
- if (Version != storage::Header::kCurrentVersion ||
- Producer != kExpectedProducerName)
- return upgrade(BFC.Mods);
+ if (!DisableBitcodeVersionUpgrade) {
+ if (BFC.StrtabForSymtab.empty() ||
+ BFC.Symtab.size() < sizeof(storage::Header))
+ return upgrade(BFC.Mods);
+
+ // We cannot use the regular reader to read the version and producer,
+ // because it will expect the header to be in the current format. The only
+ // thing we can rely on is that the version and producer will be present as
+ // the first struct elements.
+ auto *Hdr = reinterpret_cast<const storage::Header *>(BFC.Symtab.data());
+ unsigned Version = Hdr->Version;
+ StringRef Producer = Hdr->Producer.get(BFC.StrtabForSymtab);
+ if (Version != storage::Header::kCurrentVersion ||
+ Producer != kExpectedProducerName)
+ return upgrade(BFC.Mods);
+ }
FileContents FC;
FC.TheReader = {{BFC.Symtab.data(), BFC.Symtab.size()},
diff --git a/llvm/lib/Object/Object.cpp b/llvm/lib/Object/Object.cpp
index 0659cf6a2d41..576eb8d069d6 100644
--- a/llvm/lib/Object/Object.cpp
+++ b/llvm/lib/Object/Object.cpp
@@ -16,6 +16,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Object/MachOUniversal.h"
+#include "llvm/Support/MemAlloc.h"
using namespace llvm;
using namespace object;
diff --git a/llvm/lib/Object/TapiFile.cpp b/llvm/lib/Object/TapiFile.cpp
index 6b576260bdb1..83568e8d823a 100644
--- a/llvm/lib/Object/TapiFile.cpp
+++ b/llvm/lib/Object/TapiFile.cpp
@@ -45,8 +45,7 @@ TapiFile::TapiFile(MemoryBufferRef Source, const InterfaceFile &interface,
Symbols.emplace_back(StringRef(), Symbol->getName(), getFlags(Symbol));
break;
case SymbolKind::ObjectiveCClass:
- if (interface.getPlatforms().count(PlatformKind::macOS) &&
- Arch == AK_i386) {
+ if (interface.getPlatforms().count(PLATFORM_MACOS) && Arch == AK_i386) {
Symbols.emplace_back(ObjC1ClassNamePrefix, Symbol->getName(),
getFlags(Symbol));
} else {
diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp
index 9b0a5efacba7..f2f6d700ddd8 100644
--- a/llvm/lib/Object/XCOFFObjectFile.cpp
+++ b/llvm/lib/Object/XCOFFObjectFile.cpp
@@ -1112,8 +1112,12 @@ bool XCOFFSymbolRef::isFunction() const {
return true;
Expected<XCOFFCsectAuxRef> ExpCsectAuxEnt = getXCOFFCsectAuxRef();
- if (!ExpCsectAuxEnt)
+ if (!ExpCsectAuxEnt) {
+ // If we could not get the CSECT auxiliary entry, then treat this symbol as
+ // if it isn't a function. Consume the error and return `false` to move on.
+ consumeError(ExpCsectAuxEnt.takeError());
return false;
+ }
const XCOFFCsectAuxRef CsectAuxRef = ExpCsectAuxEnt.get();
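
The fix above follows the usual llvm::Expected discipline: an error held by an Expected must be consumed or propagated before the object is destroyed, otherwise builds with ABI-breaking checks abort at runtime. A minimal sketch of the same pattern against llvm/Support/Error.h (lookupAuxEntry and isFunctionLike are stand-ins; the sketch assumes the program is linked against LLVMSupport):

#include "llvm/Support/Error.h"
#include <cstdio>

using namespace llvm;

// Stand-in for a query that can fail, e.g. getXCOFFCsectAuxRef() above.
static Expected<int> lookupAuxEntry(bool Succeed) {
  if (!Succeed)
    return createStringError(inconvertibleErrorCode(),
                             "no csect auxiliary entry");
  return 42;
}

// Mirrors the isFunction() fix: on failure, swallow the error explicitly
// and fall back to a conservative answer instead of crashing later.
static bool isFunctionLike(bool Succeed) {
  Expected<int> AuxOrErr = lookupAuxEntry(Succeed);
  if (!AuxOrErr) {
    consumeError(AuxOrErr.takeError()); // mark the error as handled
    return false;
  }
  return *AuxOrErr > 0;
}

int main() {
  std::printf("ok case: %d, error case: %d\n", isFunctionLike(true),
              isFunctionLike(false));
}
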
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index 9b9266998ea6..ffe2599beaf8 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -518,6 +518,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_RISCV_FLOAT_ABI_DOUBLE, EF_RISCV_FLOAT_ABI);
BCaseMask(EF_RISCV_FLOAT_ABI_QUAD, EF_RISCV_FLOAT_ABI);
BCase(EF_RISCV_RVE);
+ BCase(EF_RISCV_TSO);
break;
case ELF::EM_AMDGPU:
BCaseMask(EF_AMDGPU_MACH_NONE, EF_AMDGPU_MACH);
diff --git a/llvm/lib/ObjectYAML/MachOEmitter.cpp b/llvm/lib/ObjectYAML/MachOEmitter.cpp
index e5ffb12df434..b9fad2982828 100644
--- a/llvm/lib/ObjectYAML/MachOEmitter.cpp
+++ b/llvm/lib/ObjectYAML/MachOEmitter.cpp
@@ -481,9 +481,9 @@ void MachOWriter::writeLinkEditData(raw_ostream &OS) {
typedef std::pair<uint64_t, writeHandler> writeOperation;
std::vector<writeOperation> WriteQueue;
- MachO::dyld_info_command *DyldInfoOnlyCmd = 0;
- MachO::symtab_command *SymtabCmd = 0;
- MachO::dysymtab_command *DSymtabCmd = 0;
+ MachO::dyld_info_command *DyldInfoOnlyCmd = nullptr;
+ MachO::symtab_command *SymtabCmd = nullptr;
+ MachO::dysymtab_command *DSymtabCmd = nullptr;
for (auto &LC : Obj.LoadCommands) {
switch (LC.Data.load_command_data.cmd) {
case MachO::LC_SYMTAB:
diff --git a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp
index cf0d058c518c..2a7204d3f773 100644
--- a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp
@@ -47,6 +47,7 @@ private:
bool initRelocations(uint64_t &CurrentOffset);
bool initStringTable();
bool assignAddressesAndIndices();
+
void writeFileHeader();
void writeAuxFileHeader();
void writeSectionHeader();
@@ -55,6 +56,15 @@ private:
bool writeSymbols();
void writeStringTable();
+ void writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym);
+ void writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym);
+ void writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym);
+ void writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym);
+ void writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym);
+ void writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym);
+ void writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym);
+ void writeAuxSymbol(const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym);
+
XCOFFYAML::Object &Obj;
bool Is64Bit = false;
support::endian::Writer W;
@@ -190,12 +200,23 @@ bool XCOFFWriter::initStringTable() {
}
}
} else {
- for (XCOFFYAML::Symbol &YamlSym : Obj.Symbols) {
+ for (const XCOFFYAML::Symbol &YamlSym : Obj.Symbols) {
if (nameShouldBeInStringTable(YamlSym.SymbolName))
StrTblBuilder.add(YamlSym.SymbolName);
}
}
+ // Check if the file name in the File Auxiliary Entry should be added to the
+ // string table.
+ for (const XCOFFYAML::Symbol &YamlSym : Obj.Symbols) {
+ for (const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym :
+ YamlSym.AuxEntries) {
+ if (auto AS = dyn_cast<XCOFFYAML::FileAuxEnt>(AuxSym.get()))
+ if (nameShouldBeInStringTable(AS->FileNameOrString.getValueOr("")))
+ StrTblBuilder.add(AS->FileNameOrString.getValueOr(""));
+ }
+ }
+
StrTblBuilder.finalize();
size_t StrTblSize = StrTblBuilder.getSize();
@@ -216,9 +237,21 @@ bool XCOFFWriter::initFileHeader(uint64_t CurrentOffset) {
InitFileHdr.NumberOfSections = Obj.Sections.size();
InitFileHdr.NumberOfSymTableEntries = Obj.Symbols.size();
- for (const XCOFFYAML::Symbol &YamlSym : Obj.Symbols)
+ for (XCOFFYAML::Symbol &YamlSym : Obj.Symbols) {
+ uint32_t AuxCount = YamlSym.AuxEntries.size();
+ if (YamlSym.NumberOfAuxEntries && *YamlSym.NumberOfAuxEntries < AuxCount) {
+ ErrHandler("specified NumberOfAuxEntries " +
+ Twine(static_cast<uint32_t>(*YamlSym.NumberOfAuxEntries)) +
+ " is less than the actual number "
+ "of auxiliary entries " +
+ Twine(AuxCount));
+ return false;
+ }
+ YamlSym.NumberOfAuxEntries =
+ YamlSym.NumberOfAuxEntries.getValueOr(AuxCount);
// Add the number of auxiliary symbols to the total number.
- InitFileHdr.NumberOfSymTableEntries += YamlSym.NumberOfAuxEntries;
+ InitFileHdr.NumberOfSymTableEntries += *YamlSym.NumberOfAuxEntries;
+ }
// Calculate SymbolTableOffset for the file header.
if (InitFileHdr.NumberOfSymTableEntries) {
@@ -491,6 +524,125 @@ bool XCOFFWriter::writeRelocations() {
return true;
}
+void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::CsectAuxEnt &AuxSym) {
+ if (Is64Bit) {
+ W.write<uint32_t>(AuxSym.SectionOrLengthLo.getValueOr(0));
+ W.write<uint32_t>(AuxSym.ParameterHashIndex.getValueOr(0));
+ W.write<uint16_t>(AuxSym.TypeChkSectNum.getValueOr(0));
+ W.write<uint8_t>(AuxSym.SymbolAlignmentAndType.getValueOr(0));
+ W.write<uint8_t>(AuxSym.StorageMappingClass.getValueOr(XCOFF::XMC_PR));
+ W.write<uint32_t>(AuxSym.SectionOrLengthHi.getValueOr(0));
+ W.write<uint8_t>(0);
+ W.write<uint8_t>(XCOFF::AUX_CSECT);
+ } else {
+ W.write<uint32_t>(AuxSym.SectionOrLength.getValueOr(0));
+ W.write<uint32_t>(AuxSym.ParameterHashIndex.getValueOr(0));
+ W.write<uint16_t>(AuxSym.TypeChkSectNum.getValueOr(0));
+ W.write<uint8_t>(AuxSym.SymbolAlignmentAndType.getValueOr(0));
+ W.write<uint8_t>(AuxSym.StorageMappingClass.getValueOr(XCOFF::XMC_PR));
+ W.write<uint32_t>(AuxSym.StabInfoIndex.getValueOr(0));
+ W.write<uint16_t>(AuxSym.StabSectNum.getValueOr(0));
+ }
+}
+
+void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::ExcpetionAuxEnt &AuxSym) {
+ assert(Is64Bit && "can't write the exception auxiliary symbol for XCOFF32");
+ W.write<uint64_t>(AuxSym.OffsetToExceptionTbl.getValueOr(0));
+ W.write<uint32_t>(AuxSym.SizeOfFunction.getValueOr(0));
+ W.write<uint32_t>(AuxSym.SymIdxOfNextBeyond.getValueOr(0));
+ W.write<uint8_t>(0);
+ W.write<uint8_t>(XCOFF::AUX_EXCEPT);
+}
+
+void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FunctionAuxEnt &AuxSym) {
+ if (Is64Bit) {
+ W.write<uint64_t>(AuxSym.PtrToLineNum.getValueOr(0));
+ W.write<uint32_t>(AuxSym.SizeOfFunction.getValueOr(0));
+ W.write<uint32_t>(AuxSym.SymIdxOfNextBeyond.getValueOr(0));
+ W.write<uint8_t>(0);
+ W.write<uint8_t>(XCOFF::AUX_FCN);
+ } else {
+ W.write<uint32_t>(AuxSym.OffsetToExceptionTbl.getValueOr(0));
+ W.write<uint32_t>(AuxSym.SizeOfFunction.getValueOr(0));
+ W.write<uint32_t>(AuxSym.PtrToLineNum.getValueOr(0));
+ W.write<uint32_t>(AuxSym.SymIdxOfNextBeyond.getValueOr(0));
+ W.OS.write_zeros(2);
+ }
+}
+
+void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::FileAuxEnt &AuxSym) {
+ StringRef FileName = AuxSym.FileNameOrString.getValueOr("");
+ if (nameShouldBeInStringTable(FileName)) {
+ W.write<int32_t>(0);
+ W.write<uint32_t>(StrTblBuilder.getOffset(FileName));
+ } else {
+ writeName(FileName, W);
+ }
+ W.OS.write_zeros(XCOFF::FileNamePadSize);
+ W.write<uint8_t>(AuxSym.FileStringType.getValueOr(XCOFF::XFT_FN));
+ if (Is64Bit) {
+ W.OS.write_zeros(2);
+ W.write<uint8_t>(XCOFF::AUX_FILE);
+ } else {
+ W.OS.write_zeros(3);
+ }
+}
+
+void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::BlockAuxEnt &AuxSym) {
+ if (Is64Bit) {
+ W.write<uint32_t>(AuxSym.LineNum.getValueOr(0));
+ W.OS.write_zeros(13);
+ W.write<uint8_t>(XCOFF::AUX_SYM);
+ } else {
+ W.OS.write_zeros(2);
+ W.write<uint16_t>(AuxSym.LineNumHi.getValueOr(0));
+ W.write<uint16_t>(AuxSym.LineNumLo.getValueOr(0));
+ W.OS.write_zeros(12);
+ }
+}
+
+void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForDWARF &AuxSym) {
+ if (Is64Bit) {
+ W.write<uint64_t>(AuxSym.LengthOfSectionPortion.getValueOr(0));
+ W.write<uint64_t>(AuxSym.NumberOfRelocEnt.getValueOr(0));
+ W.write<uint8_t>(0);
+ W.write<uint8_t>(XCOFF::AUX_SECT);
+ } else {
+ W.write<uint32_t>(AuxSym.LengthOfSectionPortion.getValueOr(0));
+ W.OS.write_zeros(4);
+ W.write<uint32_t>(AuxSym.NumberOfRelocEnt.getValueOr(0));
+ W.OS.write_zeros(6);
+ }
+}
+
+void XCOFFWriter::writeAuxSymbol(const XCOFFYAML::SectAuxEntForStat &AuxSym) {
+ assert(!Is64Bit && "can't write the stat auxiliary symbol for XCOFF64");
+ W.write<uint32_t>(AuxSym.SectionLength.getValueOr(0));
+ W.write<uint16_t>(AuxSym.NumberOfRelocEnt.getValueOr(0));
+ W.write<uint16_t>(AuxSym.NumberOfLineNum.getValueOr(0));
+ W.OS.write_zeros(10);
+}
+
+void XCOFFWriter::writeAuxSymbol(
+ const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym) {
+ if (auto AS = dyn_cast<XCOFFYAML::CsectAuxEnt>(AuxSym.get()))
+ writeAuxSymbol(*AS);
+ else if (auto AS = dyn_cast<XCOFFYAML::FunctionAuxEnt>(AuxSym.get()))
+ writeAuxSymbol(*AS);
+ else if (auto AS = dyn_cast<XCOFFYAML::ExcpetionAuxEnt>(AuxSym.get()))
+ writeAuxSymbol(*AS);
+ else if (auto AS = dyn_cast<XCOFFYAML::FileAuxEnt>(AuxSym.get()))
+ writeAuxSymbol(*AS);
+ else if (auto AS = dyn_cast<XCOFFYAML::BlockAuxEnt>(AuxSym.get()))
+ writeAuxSymbol(*AS);
+ else if (auto AS = dyn_cast<XCOFFYAML::SectAuxEntForDWARF>(AuxSym.get()))
+ writeAuxSymbol(*AS);
+ else if (auto AS = dyn_cast<XCOFFYAML::SectAuxEntForStat>(AuxSym.get()))
+ writeAuxSymbol(*AS);
+ else
+ llvm_unreachable("unknown auxiliary symbol type");
+}
+
bool XCOFFWriter::writeSymbols() {
int64_t PaddingSize =
(uint64_t)InitFileHdr.SymbolTableOffset - (W.OS.tell() - StartOffset);
@@ -533,16 +685,25 @@ bool XCOFFWriter::writeSymbols() {
}
W.write<uint16_t>(YamlSym.Type);
W.write<uint8_t>(YamlSym.StorageClass);
- W.write<uint8_t>(YamlSym.NumberOfAuxEntries);
-
- // Now output the auxiliary entry.
- for (uint8_t I = 0, E = YamlSym.NumberOfAuxEntries; I < E; ++I) {
- // TODO: Auxiliary entry is not supported yet.
- // The auxiliary entries for a symbol follow its symbol table entry. The
- // length of each auxiliary entry is the same as a symbol table entry (18
- // bytes). The format and quantity of auxiliary entries depend on the
- // storage class (n_sclass) and type (n_type) of the symbol table entry.
- W.OS.write_zeros(XCOFF::SymbolTableEntrySize);
+
+ uint8_t NumOfAuxSym = YamlSym.NumberOfAuxEntries.getValueOr(0);
+ W.write<uint8_t>(NumOfAuxSym);
+
+ if (!NumOfAuxSym && !YamlSym.AuxEntries.size())
+ continue;
+
+ // Now write auxiliary entries.
+ if (!YamlSym.AuxEntries.size()) {
+ W.OS.write_zeros(XCOFF::SymbolTableEntrySize * NumOfAuxSym);
+ } else {
+ for (const std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym :
+ YamlSym.AuxEntries) {
+ writeAuxSymbol(AuxSym);
+ }
+ // Pad with zeros.
+ if (NumOfAuxSym > YamlSym.AuxEntries.size())
+ W.OS.write_zeros(XCOFF::SymbolTableEntrySize *
+ (NumOfAuxSym - YamlSym.AuxEntries.size()));
}
}
return true;
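
writeSymbols() above reconciles the optional NumberOfAuxEntries key with the concrete AuxEntries list: a larger declared count is honored by zero-filling the difference at 18 bytes per auxiliary entry, while a smaller one is rejected with the error shown earlier. A standalone sketch of that bookkeeping (auxPaddingBytes is an invented helper; only the byte counts are modeled, not the output stream):

#include <cstdint>
#include <cstdio>
#include <optional>

// XCOFF symbol table entries (and auxiliary entries) are 18 bytes each.
static constexpr uint64_t SymbolTableEntrySize = 18;

// Returns the number of zero bytes to emit after the explicit auxiliary
// entries, or nullopt when the declared count is smaller than the list.
static std::optional<uint64_t>
auxPaddingBytes(std::optional<uint32_t> DeclaredCount, uint32_t ActualCount) {
  uint32_t Count = DeclaredCount.value_or(ActualCount);
  if (Count < ActualCount)
    return std::nullopt; // "NumberOfAuxEntries ... is less than the actual number"
  return uint64_t(Count - ActualCount) * SymbolTableEntrySize;
}

int main() {
  // Declared 3 aux entries but only 1 spelled out: pad with 2 * 18 bytes.
  if (auto Pad = auxPaddingBytes(3u, 1))
    std::printf("zero padding after explicit aux entries: %llu bytes\n",
                (unsigned long long)*Pad);
  // Declared 1 but 2 provided: rejected.
  if (!auxPaddingBytes(1u, 2))
    std::printf("declared count smaller than AuxEntries -> error\n");
}
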
diff --git a/llvm/lib/ObjectYAML/XCOFFYAML.cpp b/llvm/lib/ObjectYAML/XCOFFYAML.cpp
index 221cf3b064c0..44ef33501b65 100644
--- a/llvm/lib/ObjectYAML/XCOFFYAML.cpp
+++ b/llvm/lib/ObjectYAML/XCOFFYAML.cpp
@@ -19,6 +19,8 @@ namespace XCOFFYAML {
Object::Object() { memset(&Header, 0, sizeof(Header)); }
+AuxSymbolEnt::~AuxSymbolEnt() = default;
+
} // namespace XCOFFYAML
namespace yaml {
@@ -98,6 +100,56 @@ void ScalarEnumerationTraits<XCOFF::StorageClass>::enumeration(
#undef ECase
}
+void ScalarEnumerationTraits<XCOFF::StorageMappingClass>::enumeration(
+ IO &IO, XCOFF::StorageMappingClass &Value) {
+#define ECase(X) IO.enumCase(Value, #X, XCOFF::X)
+ ECase(XMC_PR);
+ ECase(XMC_RO);
+ ECase(XMC_DB);
+ ECase(XMC_GL);
+ ECase(XMC_XO);
+ ECase(XMC_SV);
+ ECase(XMC_SV64);
+ ECase(XMC_SV3264);
+ ECase(XMC_TI);
+ ECase(XMC_TB);
+ ECase(XMC_RW);
+ ECase(XMC_TC0);
+ ECase(XMC_TC);
+ ECase(XMC_TD);
+ ECase(XMC_DS);
+ ECase(XMC_UA);
+ ECase(XMC_BS);
+ ECase(XMC_UC);
+ ECase(XMC_TL);
+ ECase(XMC_UL);
+ ECase(XMC_TE);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<XCOFFYAML::AuxSymbolType>::enumeration(
+ IO &IO, XCOFFYAML::AuxSymbolType &Type) {
+#define ECase(X) IO.enumCase(Type, #X, XCOFFYAML::X)
+ ECase(AUX_EXCEPT);
+ ECase(AUX_FCN);
+ ECase(AUX_SYM);
+ ECase(AUX_FILE);
+ ECase(AUX_CSECT);
+ ECase(AUX_SECT);
+ ECase(AUX_STAT);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<XCOFF::CFileStringType>::enumeration(
+ IO &IO, XCOFF::CFileStringType &Type) {
+#define ECase(X) IO.enumCase(Type, #X, XCOFF::X)
+ ECase(XFT_FN);
+ ECase(XFT_CT);
+ ECase(XFT_CV);
+ ECase(XFT_CD);
+#undef ECase
+}
+
struct NSectionFlags {
NSectionFlags(IO &) : Flags(XCOFF::SectionTypeFlags(0)) {}
NSectionFlags(IO &, uint32_t C) : Flags(XCOFF::SectionTypeFlags(C)) {}
@@ -173,6 +225,107 @@ void MappingTraits<XCOFFYAML::Section>::mapping(IO &IO,
IO.mapOptional("Relocations", Sec.Relocations);
}
+static void auxSymMapping(IO &IO, XCOFFYAML::CsectAuxEnt &AuxSym, bool Is64) {
+ IO.mapOptional("ParameterHashIndex", AuxSym.ParameterHashIndex);
+ IO.mapOptional("TypeChkSectNum", AuxSym.TypeChkSectNum);
+ IO.mapOptional("SymbolAlignmentAndType", AuxSym.SymbolAlignmentAndType);
+ IO.mapOptional("StorageMappingClass", AuxSym.StorageMappingClass);
+ if (Is64) {
+ IO.mapOptional("SectionOrLengthLo", AuxSym.SectionOrLengthLo);
+ IO.mapOptional("SectionOrLengthHi", AuxSym.SectionOrLengthHi);
+ } else {
+ IO.mapOptional("SectionOrLength", AuxSym.SectionOrLength);
+ IO.mapOptional("StabInfoIndex", AuxSym.StabInfoIndex);
+ IO.mapOptional("StabSectNum", AuxSym.StabSectNum);
+ }
+}
+
+static void auxSymMapping(IO &IO, XCOFFYAML::FileAuxEnt &AuxSym) {
+ IO.mapOptional("FileNameOrString", AuxSym.FileNameOrString);
+ IO.mapOptional("FileStringType", AuxSym.FileStringType);
+}
+
+static void auxSymMapping(IO &IO, XCOFFYAML::BlockAuxEnt &AuxSym, bool Is64) {
+ if (Is64) {
+ IO.mapOptional("LineNum", AuxSym.LineNum);
+ } else {
+ IO.mapOptional("LineNumHi", AuxSym.LineNumHi);
+ IO.mapOptional("LineNumLo", AuxSym.LineNumLo);
+ }
+}
+
+static void auxSymMapping(IO &IO, XCOFFYAML::FunctionAuxEnt &AuxSym,
+ bool Is64) {
+ if (!Is64)
+ IO.mapOptional("OffsetToExceptionTbl", AuxSym.OffsetToExceptionTbl);
+ IO.mapOptional("SizeOfFunction", AuxSym.SizeOfFunction);
+ IO.mapOptional("SymIdxOfNextBeyond", AuxSym.SymIdxOfNextBeyond);
+ IO.mapOptional("PtrToLineNum", AuxSym.PtrToLineNum);
+}
+
+static void auxSymMapping(IO &IO, XCOFFYAML::ExcpetionAuxEnt &AuxSym) {
+ IO.mapOptional("OffsetToExceptionTbl", AuxSym.OffsetToExceptionTbl);
+ IO.mapOptional("SizeOfFunction", AuxSym.SizeOfFunction);
+ IO.mapOptional("SymIdxOfNextBeyond", AuxSym.SymIdxOfNextBeyond);
+}
+
+static void auxSymMapping(IO &IO, XCOFFYAML::SectAuxEntForDWARF &AuxSym) {
+ IO.mapOptional("LengthOfSectionPortion", AuxSym.LengthOfSectionPortion);
+ IO.mapOptional("NumberOfRelocEnt", AuxSym.NumberOfRelocEnt);
+}
+
+static void auxSymMapping(IO &IO, XCOFFYAML::SectAuxEntForStat &AuxSym) {
+ IO.mapOptional("SectionLength", AuxSym.SectionLength);
+ IO.mapOptional("NumberOfRelocEnt", AuxSym.NumberOfRelocEnt);
+ IO.mapOptional("NumberOfLineNum", AuxSym.NumberOfLineNum);
+}
+
+void MappingTraits<std::unique_ptr<XCOFFYAML::AuxSymbolEnt>>::mapping(
+ IO &IO, std::unique_ptr<XCOFFYAML::AuxSymbolEnt> &AuxSym) {
+ assert(!IO.outputting() && "We don't dump aux symbols currently.");
+ const bool Is64 =
+ static_cast<XCOFFYAML::Object *>(IO.getContext())->Header.Magic ==
+ (llvm::yaml::Hex16)XCOFF::XCOFF64;
+ XCOFFYAML::AuxSymbolType AuxType;
+ IO.mapRequired("Type", AuxType);
+ switch (AuxType) {
+ case XCOFFYAML::AUX_EXCEPT:
+ if (!Is64)
+ IO.setError("an auxiliary symbol of type AUX_EXCEPT cannot be defined in "
+ "XCOFF32");
+ AuxSym.reset(new XCOFFYAML::ExcpetionAuxEnt());
+ auxSymMapping(IO, *cast<XCOFFYAML::ExcpetionAuxEnt>(AuxSym.get()));
+ break;
+ case XCOFFYAML::AUX_FCN:
+ AuxSym.reset(new XCOFFYAML::FunctionAuxEnt());
+ auxSymMapping(IO, *cast<XCOFFYAML::FunctionAuxEnt>(AuxSym.get()), Is64);
+ break;
+ case XCOFFYAML::AUX_SYM:
+ AuxSym.reset(new XCOFFYAML::BlockAuxEnt());
+ auxSymMapping(IO, *cast<XCOFFYAML::BlockAuxEnt>(AuxSym.get()), Is64);
+ break;
+ case XCOFFYAML::AUX_FILE:
+ AuxSym.reset(new XCOFFYAML::FileAuxEnt());
+ auxSymMapping(IO, *cast<XCOFFYAML::FileAuxEnt>(AuxSym.get()));
+ break;
+ case XCOFFYAML::AUX_CSECT:
+ AuxSym.reset(new XCOFFYAML::CsectAuxEnt());
+ auxSymMapping(IO, *cast<XCOFFYAML::CsectAuxEnt>(AuxSym.get()), Is64);
+ break;
+ case XCOFFYAML::AUX_SECT:
+ AuxSym.reset(new XCOFFYAML::SectAuxEntForDWARF());
+ auxSymMapping(IO, *cast<XCOFFYAML::SectAuxEntForDWARF>(AuxSym.get()));
+ break;
+ case XCOFFYAML::AUX_STAT:
+ if (Is64)
+ IO.setError(
+ "an auxiliary symbol of type AUX_STAT cannot be defined in XCOFF64");
+ AuxSym.reset(new XCOFFYAML::SectAuxEntForStat());
+ auxSymMapping(IO, *cast<XCOFFYAML::SectAuxEntForStat>(AuxSym.get()));
+ break;
+ }
+}
+
void MappingTraits<XCOFFYAML::Symbol>::mapping(IO &IO, XCOFFYAML::Symbol &S) {
IO.mapOptional("Name", S.SymbolName);
IO.mapOptional("Value", S.Value);
@@ -181,6 +334,8 @@ void MappingTraits<XCOFFYAML::Symbol>::mapping(IO &IO, XCOFFYAML::Symbol &S) {
IO.mapOptional("Type", S.Type);
IO.mapOptional("StorageClass", S.StorageClass);
IO.mapOptional("NumberOfAuxEntries", S.NumberOfAuxEntries);
+ if (!IO.outputting())
+ IO.mapOptional("AuxEntries", S.AuxEntries);
}
void MappingTraits<XCOFFYAML::StringTable>::mapping(IO &IO, XCOFFYAML::StringTable &Str) {
@@ -191,12 +346,14 @@ void MappingTraits<XCOFFYAML::StringTable>::mapping(IO &IO, XCOFFYAML::StringTab
}
void MappingTraits<XCOFFYAML::Object>::mapping(IO &IO, XCOFFYAML::Object &Obj) {
+ IO.setContext(&Obj);
IO.mapTag("!XCOFF", true);
IO.mapRequired("FileHeader", Obj.Header);
IO.mapOptional("AuxiliaryHeader", Obj.AuxHeader);
IO.mapOptional("Sections", Obj.Sections);
IO.mapOptional("Symbols", Obj.Symbols);
IO.mapOptional("StringTable", Obj.StrTbl);
+ IO.setContext(nullptr);
}
} // namespace yaml
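
The new MappingTraits specialization above is effectively a tag-dispatched factory: it reads the Type key, allocates the matching *AuxEnt subclass, and only then maps that subclass's optional keys, with the 32- vs. 64-bit layout decided from the file-header magic stashed in the IO context. A standalone sketch of the construct-then-fill shape, using plain enums and structs rather than the real YAML I/O classes (AuxType, AuxEnt and makeAuxEnt are invented for this illustration):

#include <cstdio>
#include <memory>

enum class AuxType { Except, Function, File };

struct AuxEnt { virtual ~AuxEnt() = default; };
struct ExceptAux : AuxEnt { unsigned SizeOfFunction = 0; };
struct FunctionAux : AuxEnt { unsigned PtrToLineNum = 0; };
struct FileAux : AuxEnt { const char *FileName = ""; };

// Construct the subclass selected by the tag, then let the caller fill in
// the subclass-specific fields -- the same shape as mapping "Type" first
// and then the per-kind optional keys.
static std::unique_ptr<AuxEnt> makeAuxEnt(AuxType Type, bool Is64) {
  switch (Type) {
  case AuxType::Except:
    if (!Is64)
      std::fprintf(stderr, "AUX_EXCEPT is only valid in the 64-bit layout\n");
    return std::make_unique<ExceptAux>();
  case AuxType::Function:
    return std::make_unique<FunctionAux>();
  case AuxType::File:
    return std::make_unique<FileAux>();
  }
  return nullptr;
}

int main() {
  auto Aux = makeAuxEnt(AuxType::File, /*Is64=*/false);
  if (auto *FA = dynamic_cast<FileAux *>(Aux.get())) {
    FA->FileName = "t.c"; // per-kind field filled in after construction
    std::printf("built a file auxiliary entry for %s\n", FA->FileName);
  }
}
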
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index d7615ef4e9bf..015ca1eec4df 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -35,6 +35,7 @@
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/DomPrinter.h"
#include "llvm/Analysis/DominanceFrontier.h"
#include "llvm/Analysis/FunctionPropertiesAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
@@ -218,6 +219,7 @@
#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
+#include "llvm/Transforms/Utils/Debugify.h"
#include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
#include "llvm/Transforms/Utils/FixIrreducible.h"
#include "llvm/Transforms/Utils/HelloWorld.h"
@@ -655,6 +657,8 @@ Expected<MemorySanitizerOptions> parseMSanPassOptions(StringRef Params) {
ParamName)
.str(),
inconvertibleErrorCode());
+ } else if (ParamName == "eager-checks") {
+ Result.EagerChecks = true;
} else {
return make_error<StringError>(
formatv("invalid MemorySanitizer pass parameter '{0}' ", ParamName)
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index a6a36ff25402..6110bda02406 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -300,6 +300,8 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
// TODO: Investigate promotion cap for O1.
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
LPM1.addPass(SimpleLoopUnswitchPass());
+ if (EnableLoopFlatten)
+ LPM1.addPass(LoopFlattenPass());
LPM2.addPass(LoopIdiomRecognizePass());
LPM2.addPass(IndVarSimplifyPass());
@@ -335,8 +337,6 @@ PassBuilder::buildO1FunctionSimplificationPipeline(OptimizationLevel Level,
/*UseBlockFrequencyInfo=*/true));
FPM.addPass(SimplifyCFGPass());
FPM.addPass(InstCombinePass());
- if (EnableLoopFlatten)
- FPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
// The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
// *All* loop passes must preserve it, in order to be able to use it.
FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM2),
@@ -475,6 +475,9 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
LPM1.addPass(
SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 &&
EnableO3NonTrivialUnswitching));
+ if (EnableLoopFlatten)
+ LPM1.addPass(LoopFlattenPass());
+
LPM2.addPass(LoopIdiomRecognizePass());
LPM2.addPass(IndVarSimplifyPass());
@@ -509,8 +512,6 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
/*UseBlockFrequencyInfo=*/true));
FPM.addPass(SimplifyCFGPass());
FPM.addPass(InstCombinePass());
- if (EnableLoopFlatten)
- FPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
// The loop passes in LPM2 (LoopIdiomRecognizePass, IndVarSimplifyPass,
// LoopDeletionPass and LoopFullUnrollPass) do not preserve MemorySSA.
// *All* loop passes must preserve it, in order to be able to use it.
@@ -1623,14 +1624,13 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
MainFPM.addPass(DSEPass());
MainFPM.addPass(MergedLoadStoreMotionPass());
- // More loops are countable; try to optimize them.
- if (EnableLoopFlatten && Level.getSpeedupLevel() > 1)
- MainFPM.addPass(createFunctionToLoopPassAdaptor(LoopFlattenPass()));
if (EnableConstraintElimination)
MainFPM.addPass(ConstraintEliminationPass());
LoopPassManager LPM;
+ if (EnableLoopFlatten && Level.getSpeedupLevel() > 1)
+ LPM.addPass(LoopFlattenPass());
LPM.addPass(IndVarSimplifyPass());
LPM.addPass(LoopDeletionPass());
// FIXME: Add loop interchange.
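// Sketch contrasting the two scheduling styles this change switches between;
// it simply mirrors the lines added and removed above.
#include "llvm/Transforms/Scalar/LoopFlatten.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"

static void scheduleLoopFlatten(llvm::FunctionPassManager &FPM,
                                llvm::LoopPassManager &LPM) {
  // Old placement: a dedicated function-to-loop adaptor just for this pass.
  FPM.addPass(llvm::createFunctionToLoopPassAdaptor(llvm::LoopFlattenPass()));
  // New placement: appended to an existing LoopPassManager, so it runs inside
  // the same adaptor (and MemorySSA discipline) as the surrounding loop passes.
  LPM.addPass(llvm::LoopFlattenPass());
}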
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 74613a7fcce0..8e0af11b854d 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -48,9 +48,11 @@ MODULE_PASS("openmp-opt", OpenMPOptPass())
MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass())
MODULE_PASS("cg-profile", CGProfilePass())
+MODULE_PASS("check-debugify", NewPMCheckDebugifyPass())
MODULE_PASS("constmerge", ConstantMergePass())
MODULE_PASS("cross-dso-cfi", CrossDSOCFIPass())
MODULE_PASS("deadargelim", DeadArgumentEliminationPass())
+MODULE_PASS("debugify", NewPMDebugifyPass())
MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
MODULE_PASS("extract-blocks", BlockExtractorPass())
MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
@@ -62,6 +64,7 @@ MODULE_PASS("globalsplit", GlobalSplitPass())
MODULE_PASS("hotcoldsplit", HotColdSplittingPass())
MODULE_PASS("inferattrs", InferFunctionAttrsPass())
MODULE_PASS("inliner-wrapper", ModuleInlinerWrapperPass())
+MODULE_PASS("print<inline-advisor>", InlineAdvisorAnalysisPrinterPass(dbgs()))
MODULE_PASS("inliner-wrapper-no-mandatory-first", ModuleInlinerWrapperPass(
getInlineParams(),
false))
@@ -254,6 +257,8 @@ FUNCTION_PASS("div-rem-pairs", DivRemPairsPass())
FUNCTION_PASS("dse", DSEPass())
FUNCTION_PASS("dot-cfg", CFGPrinterPass())
FUNCTION_PASS("dot-cfg-only", CFGOnlyPrinterPass())
+FUNCTION_PASS("dot-dom", DomTreePrinterPass())
+FUNCTION_PASS("dot-dom-only", DomTreeOnlyPrinterPass())
FUNCTION_PASS("fix-irreducible", FixIrreduciblePass())
FUNCTION_PASS("flattencfg", FlattenCFGPass())
FUNCTION_PASS("make-guards-explicit", MakeGuardsExplicitPass())
@@ -410,7 +415,7 @@ FUNCTION_PASS_WITH_PARAMS("msan",
return MemorySanitizerPass(Opts);
},
parseMSanPassOptions,
- "recover;kernel;track-origins=N")
+ "recover;kernel;eager-checks;track-origins=N")
FUNCTION_PASS_WITH_PARAMS("simplifycfg",
"SimplifyCFGPass",
[](SimplifyCFGOptions Opts) {
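// Usage sketch for the newly registered textual pass names (typical opt
// invocations assumed; not taken from this patch):
//   opt -passes='debugify,instcombine,check-debugify' in.ll -S -o out.ll
//   opt -passes='dot-dom' -disable-output in.ll
//   opt -passes='print<inline-advisor>' -disable-output in.ll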
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index 23c825c78713..c42b1cb26f13 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -441,7 +441,7 @@ const Module *getModuleForComparison(Any IR) {
} // namespace
-template <typename T> ChangeReporter<T>::~ChangeReporter<T>() {
+template <typename T> ChangeReporter<T>::~ChangeReporter() {
assert(BeforeStack.empty() && "Problem with Change Printer stack.");
}
diff --git a/llvm/lib/ProfileData/GCOV.cpp b/llvm/lib/ProfileData/GCOV.cpp
index afef71f5b5ad..72d1addab01e 100644
--- a/llvm/lib/ProfileData/GCOV.cpp
+++ b/llvm/lib/ProfileData/GCOV.cpp
@@ -346,7 +346,7 @@ StringRef GCOVFunction::getName(bool demangle) const {
}
}
demangled = Name;
- } while (0);
+ } while (false);
}
return demangled;
}
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 34e0c5ebcd58..051655e1fed6 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -119,9 +119,6 @@ static std::string getInstrProfErrString(instrprof_error Err,
case instrprof_error::unable_to_correlate_profile:
OS << "unable to correlate profile";
break;
- case instrprof_error::unsupported_debug_format:
- OS << "unsupported debug info format (only DWARF is supported)";
- break;
case instrprof_error::invalid_prof:
OS << "invalid profile created. Please file a bug "
"at: " BUG_REPORT_URL
diff --git a/llvm/lib/ProfileData/InstrProfCorrelator.cpp b/llvm/lib/ProfileData/InstrProfCorrelator.cpp
index f9c113027da2..8e38a6869d07 100644
--- a/llvm/lib/ProfileData/InstrProfCorrelator.cpp
+++ b/llvm/lib/ProfileData/InstrProfCorrelator.cpp
@@ -23,7 +23,8 @@ Expected<object::SectionRef> getCountersSection(const object::ObjectFile &Obj) {
if (SectionName.get() == INSTR_PROF_CNTS_SECT_NAME)
return Section;
return make_error<InstrProfError>(
- instrprof_error::unable_to_correlate_profile);
+ instrprof_error::unable_to_correlate_profile,
+ "could not find counter section (" INSTR_PROF_CNTS_SECT_NAME ")");
}
const char *InstrProfCorrelator::FunctionNameAttributeName = "Function Name";
@@ -54,9 +55,9 @@ InstrProfCorrelator::get(StringRef DebugInfoFilename) {
// TODO: Enable profile correlation when there are multiple objects in a
// dSYM bundle.
if (DsymObjectsOrErr->size() > 1)
- return createStringError(
- std::error_code(),
- "Profile correlation using multiple objects is not yet supported");
+ return make_error<InstrProfError>(
+ instrprof_error::unable_to_correlate_profile,
+ "using multiple objects is not yet supported");
DebugInfoFilename = *DsymObjectsOrErr->begin();
}
auto BufferOrErr =
@@ -84,7 +85,16 @@ InstrProfCorrelator::get(std::unique_ptr<MemoryBuffer> Buffer) {
return InstrProfCorrelatorImpl<uint32_t>::get(std::move(*CtxOrErr), *Obj);
}
return make_error<InstrProfError>(
- instrprof_error::unable_to_correlate_profile);
+ instrprof_error::unable_to_correlate_profile, "not an object file");
+}
+
+Optional<size_t> InstrProfCorrelator::getDataSize() const {
+ if (auto *C = dyn_cast<InstrProfCorrelatorImpl<uint32_t>>(this)) {
+ return C->getDataSize();
+ } else if (auto *C = dyn_cast<InstrProfCorrelatorImpl<uint64_t>>(this)) {
+ return C->getDataSize();
+ }
+ return {};
}
namespace llvm {
@@ -120,16 +130,23 @@ InstrProfCorrelatorImpl<IntPtrT>::get(
return std::make_unique<DwarfInstrProfCorrelator<IntPtrT>>(std::move(DICtx),
std::move(Ctx));
}
- return make_error<InstrProfError>(instrprof_error::unsupported_debug_format);
+ return make_error<InstrProfError>(
+ instrprof_error::unable_to_correlate_profile,
+ "unsupported debug info format (only DWARF is supported)");
}
template <class IntPtrT>
Error InstrProfCorrelatorImpl<IntPtrT>::correlateProfileData() {
- assert(Data.empty() && CompressedNames.empty() && Names.empty());
+ assert(Data.empty() && Names.empty() && NamesVec.empty());
correlateProfileDataImpl();
+ if (Data.empty() || NamesVec.empty())
+ return make_error<InstrProfError>(
+ instrprof_error::unable_to_correlate_profile,
+ "could not find any profile metadata in debug info");
auto Result =
- collectPGOFuncNameStrings(Names, /*doCompression=*/true, CompressedNames);
- Names.clear();
+ collectPGOFuncNameStrings(NamesVec, /*doCompression=*/false, Names);
+ CounterOffsets.clear();
+ NamesVec.clear();
return Result;
}
@@ -139,6 +156,9 @@ void InstrProfCorrelatorImpl<IntPtrT>::addProbe(StringRef FunctionName,
IntPtrT CounterOffset,
IntPtrT FunctionPtr,
uint32_t NumCounters) {
+ // Check if a probe was already added for this counter offset.
+ if (!CounterOffsets.insert(CounterOffset).second)
+ return;
Data.push_back({
maybeSwap<uint64_t>(IndexedInstrProf::ComputeHash(FunctionName)),
maybeSwap<uint64_t>(CFGHash),
@@ -151,7 +171,7 @@ void InstrProfCorrelatorImpl<IntPtrT>::addProbe(StringRef FunctionName,
maybeSwap<uint32_t>(NumCounters),
/*NumValueSites=*/{maybeSwap<uint16_t>(0), maybeSwap<uint16_t>(0)},
});
- Names.push_back(FunctionName.str());
+ NamesVec.push_back(FunctionName.str());
}
template <class IntPtrT>
@@ -163,13 +183,19 @@ DwarfInstrProfCorrelator<IntPtrT>::getLocation(const DWARFDie &Die) const {
return {};
}
auto &DU = *Die.getDwarfUnit();
+ auto AddressSize = DU.getAddressByteSize();
for (auto &Location : *Locations) {
- auto AddressSize = DU.getAddressByteSize();
DataExtractor Data(Location.Expr, DICtx->isLittleEndian(), AddressSize);
DWARFExpression Expr(Data, AddressSize);
- for (auto &Op : Expr)
- if (Op.getCode() == dwarf::DW_OP_addr)
+ for (auto &Op : Expr) {
+ if (Op.getCode() == dwarf::DW_OP_addr) {
return Op.getRawOperand(0);
+ } else if (Op.getCode() == dwarf::DW_OP_addrx) {
+ uint64_t Index = Op.getRawOperand(0);
+ if (auto SA = DU.getAddrOffsetSectionItem(Index))
+ return SA->Address;
+ }
+ }
}
return {};
}
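// Minimal sketch (assumed helper, not patch code) of the per-operation address
// resolution the loop above performs: DW_OP_addr carries the address directly,
// while DW_OP_addrx carries an index into the .debug_addr table.
#include "llvm/ADT/Optional.h"
#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"

static llvm::Optional<uint64_t>
resolveAddressOp(const llvm::DWARFExpression::Operation &Op,
                 llvm::DWARFUnit &DU) {
  if (Op.getCode() == llvm::dwarf::DW_OP_addr)
    return Op.getRawOperand(0);
  if (Op.getCode() == llvm::dwarf::DW_OP_addrx) {
    uint64_t Index = Op.getRawOperand(0);
    if (auto SA = DU.getAddrOffsetSectionItem(Index))
      return SA->Address;
  }
  return llvm::None;
}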
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index 37cdf4dd1fe2..861ff61df510 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -383,22 +383,21 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
CountersDelta = swap(Header.CountersDelta);
NamesDelta = swap(Header.NamesDelta);
- auto DataSize = swap(Header.DataSize);
+ auto NumData = swap(Header.DataSize);
auto PaddingBytesBeforeCounters = swap(Header.PaddingBytesBeforeCounters);
- auto CountersSize = swap(Header.CountersSize);
+ auto CountersSize = swap(Header.CountersSize) * getCounterTypeSize();
auto PaddingBytesAfterCounters = swap(Header.PaddingBytesAfterCounters);
auto NamesSize = swap(Header.NamesSize);
ValueKindLast = swap(Header.ValueKindLast);
- auto DataSizeInBytes = DataSize * sizeof(RawInstrProf::ProfileData<IntPtrT>);
+ auto DataSize = NumData * sizeof(RawInstrProf::ProfileData<IntPtrT>);
auto PaddingSize = getNumPaddingBytes(NamesSize);
  // Profile data starts after the profile header and binary ids, if they exist.
ptrdiff_t DataOffset = sizeof(RawInstrProf::Header) + BinaryIdsSize;
- ptrdiff_t CountersOffset =
- DataOffset + DataSizeInBytes + PaddingBytesBeforeCounters;
- ptrdiff_t NamesOffset = CountersOffset + (sizeof(uint64_t) * CountersSize) +
- PaddingBytesAfterCounters;
+ ptrdiff_t CountersOffset = DataOffset + DataSize + PaddingBytesBeforeCounters;
+ ptrdiff_t NamesOffset =
+ CountersOffset + CountersSize + PaddingBytesAfterCounters;
ptrdiff_t ValueDataOffset = NamesOffset + NamesSize + PaddingSize;
auto *Start = reinterpret_cast<const char *>(&Header);
@@ -412,12 +411,12 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
assert(CountersDelta == 0 && NamesDelta == 0);
Data = Correlator->getDataPointer();
DataEnd = Data + Correlator->getDataSize();
- NamesStart = Correlator->getCompressedNamesPointer();
- NamesEnd = NamesStart + Correlator->getCompressedNamesSize();
+ NamesStart = Correlator->getNamesPointer();
+ NamesEnd = NamesStart + Correlator->getNamesSize();
} else {
Data = reinterpret_cast<const RawInstrProf::ProfileData<IntPtrT> *>(
Start + DataOffset);
- DataEnd = Data + DataSize;
+ DataEnd = Data + NumData;
NamesStart = Start + NamesOffset;
NamesEnd = NamesStart + NamesSize;
}
@@ -425,7 +424,8 @@ Error RawInstrProfReader<IntPtrT>::readHeader(
// Binary ids start just after the header.
BinaryIdsStart =
reinterpret_cast<const uint8_t *>(&Header) + sizeof(RawInstrProf::Header);
- CountersStart = reinterpret_cast<const uint64_t *>(Start + CountersOffset);
+ CountersStart = Start + CountersOffset;
+ CountersEnd = CountersStart + CountersSize;
ValueDataStart = reinterpret_cast<const uint8_t *>(Start + ValueDataOffset);
const uint8_t *BufferEnd = (const uint8_t *)DataBuffer->getBufferEnd();
@@ -459,58 +459,36 @@ Error RawInstrProfReader<IntPtrT>::readRawCounts(
if (NumCounters == 0)
return error(instrprof_error::malformed, "number of counters is zero");
- ArrayRef<uint64_t> RawCounts;
- if (Correlator) {
- uint64_t CounterOffset = swap<IntPtrT>(Data->CounterPtr) / sizeof(uint64_t);
- RawCounts =
- makeArrayRef<uint64_t>(CountersStart + CounterOffset, NumCounters);
- } else {
- IntPtrT CounterPtr = Data->CounterPtr;
- ptrdiff_t CounterOffset = getCounterOffset(CounterPtr);
- if (CounterOffset < 0)
- return error(
- instrprof_error::malformed,
- ("counter offset " + Twine(CounterOffset) + " is negative").str());
-
- // Check bounds. Note that the counter pointer embedded in the data record
- // may itself be corrupt.
- auto *NamesStartAsCounter = reinterpret_cast<const uint64_t *>(NamesStart);
- ptrdiff_t MaxNumCounters = NamesStartAsCounter - CountersStart;
- if (MaxNumCounters < 0 || NumCounters > (uint32_t)MaxNumCounters)
- return error(instrprof_error::malformed,
- "counter pointer is out of bounds");
- // We need to compute the in-buffer counter offset from the in-memory
- // address distance. The initial CountersDelta is the in-memory address
- // difference start(__llvm_prf_cnts)-start(__llvm_prf_data), so
- // SrcData->CounterPtr - CountersDelta computes the offset into the
- // in-buffer counter section.
- if (CounterOffset > MaxNumCounters)
- return error(instrprof_error::malformed,
- ("counter offset " + Twine(CounterOffset) +
- " is greater than the maximum number of counters " +
- Twine((uint32_t)MaxNumCounters))
- .str());
-
- if (((uint32_t)CounterOffset + NumCounters) > (uint32_t)MaxNumCounters)
- return error(instrprof_error::malformed,
- ("number of counters " +
- Twine(((uint32_t)CounterOffset + NumCounters)) +
- " is greater than the maximum number of counters " +
- Twine((uint32_t)MaxNumCounters))
- .str());
- // CountersDelta decreases as we advance to the next data record.
- CountersDelta -= sizeof(*Data);
-
- RawCounts = makeArrayRef(getCounter(CounterOffset), NumCounters);
- }
+ ptrdiff_t CounterBaseOffset = swap(Data->CounterPtr) - CountersDelta;
+ if (CounterBaseOffset < 0)
+ return error(
+ instrprof_error::malformed,
+ ("counter offset " + Twine(CounterBaseOffset) + " is negative").str());
- if (ShouldSwapBytes) {
- Record.Counts.clear();
- Record.Counts.reserve(RawCounts.size());
- for (uint64_t Count : RawCounts)
- Record.Counts.push_back(swap(Count));
- } else
- Record.Counts = RawCounts;
+ if (CounterBaseOffset >= CountersEnd - CountersStart)
+ return error(instrprof_error::malformed,
+ ("counter offset " + Twine(CounterBaseOffset) +
+ " is greater than the maximum counter offset " +
+ Twine(CountersEnd - CountersStart - 1))
+ .str());
+
+ uint64_t MaxNumCounters =
+ (CountersEnd - (CountersStart + CounterBaseOffset)) /
+ getCounterTypeSize();
+ if (NumCounters > MaxNumCounters)
+ return error(instrprof_error::malformed,
+ ("number of counters " + Twine(NumCounters) +
+ " is greater than the maximum number of counters " +
+ Twine(MaxNumCounters))
+ .str());
+
+ Record.Counts.clear();
+ Record.Counts.reserve(NumCounters);
+ for (uint32_t I = 0; I < NumCounters; I++) {
+ const auto *CounterValue = reinterpret_cast<const uint64_t *>(
+ CountersStart + CounterBaseOffset + I * getCounterTypeSize());
+ Record.Counts.push_back(swap(*CounterValue));
+ }
return success();
}
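// Worked example (numbers assumed) of the bounds math above. With 8-byte
// counters and a 64-byte counter region (CountersEnd - CountersStart == 64):
//   CountersDelta     = start(__llvm_prf_cnts) - start(__llvm_prf_data)
//   CounterBaseOffset = CounterPtr - CountersDelta          // byte offset
//   MaxNumCounters    = (64 - CounterBaseOffset) / 8
// So a record with CounterBaseOffset == 40 may hold at most (64 - 40) / 8 = 3
// counters; NumCounters > 3, or a negative/oversized offset, is rejected as
// malformed instead of being read out of bounds.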
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
index da16309fb82c..80c02faaba04 100644
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -655,6 +655,8 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection(
Summary->setPartialProfile(true);
if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFullContext))
FunctionSamples::ProfileIsCSFlat = ProfileIsCSFlat = true;
+ if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagIsCSNested))
+      FunctionSamples::ProfileIsCSNested = ProfileIsCSNested = true;
if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator))
FunctionSamples::ProfileIsFS = ProfileIsFS = true;
break;
@@ -688,9 +690,6 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection(
ProfileIsProbeBased =
hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsProbeBased);
FunctionSamples::ProfileIsProbeBased = ProfileIsProbeBased;
- ProfileIsCSNested =
- hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsCSNested);
- FunctionSamples::ProfileIsCSNested = ProfileIsCSNested;
bool HasAttribute =
hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagHasAttribute);
if (std::error_code EC = readFuncMetadata(HasAttribute))
@@ -1276,6 +1275,8 @@ static std::string getSecFlagsStr(const SecHdrTableEntry &Entry) {
Flags.append("partial,");
if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFullContext))
Flags.append("context,");
+ if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagIsCSNested))
+ Flags.append("context-nested,");
if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFSDiscriminator))
Flags.append("fs-discriminator,");
break;
@@ -1288,8 +1289,6 @@ static std::string getSecFlagsStr(const SecHdrTableEntry &Entry) {
Flags.append("probe,");
if (hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagHasAttribute))
Flags.append("attr,");
- if (hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsCSNested))
- Flags.append("preinlined,");
break;
default:
break;
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
index 6f02bd203a9f..b575425d4e94 100644
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -323,13 +323,13 @@ std::error_code SampleProfileWriterExtBinaryBase::writeOneSection(
setToCompressSection(SecProfileSymbolList);
if (Type == SecFuncMetadata && FunctionSamples::ProfileIsProbeBased)
addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagIsProbeBased);
- if (Type == SecFuncMetadata && FunctionSamples::ProfileIsCSNested)
- addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagIsCSNested);
if (Type == SecFuncMetadata &&
(FunctionSamples::ProfileIsCSFlat || FunctionSamples::ProfileIsCSNested))
addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagHasAttribute);
if (Type == SecProfSummary && FunctionSamples::ProfileIsCSFlat)
addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFullContext);
+ if (Type == SecProfSummary && FunctionSamples::ProfileIsCSNested)
+ addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagIsCSNested);
if (Type == SecProfSummary && FunctionSamples::ProfileIsFS)
addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFSDiscriminator);
diff --git a/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp b/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp
index 36ba93564771..0810bf531db8 100644
--- a/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp
+++ b/llvm/lib/Remarks/BitstreamRemarkSerializer.cpp
@@ -18,7 +18,7 @@ using namespace llvm::remarks;
BitstreamRemarkSerializerHelper::BitstreamRemarkSerializerHelper(
BitstreamRemarkContainerType ContainerType)
- : Encoded(), R(), Bitstream(Encoded), ContainerType(ContainerType) {}
+ : Bitstream(Encoded), ContainerType(ContainerType) {}
static void push(SmallVectorImpl<uint64_t> &R, StringRef Str) {
append_range(R, Str);
diff --git a/llvm/lib/Remarks/Remark.cpp b/llvm/lib/Remarks/Remark.cpp
index 057d1a378599..e6b7de1a2cf5 100644
--- a/llvm/lib/Remarks/Remark.cpp
+++ b/llvm/lib/Remarks/Remark.cpp
@@ -111,7 +111,7 @@ LLVMRemarkEntryGetFirstArg(LLVMRemarkEntryRef Remark) {
ArrayRef<Argument> Args = unwrap(Remark)->Args;
// No arguments to iterate on.
if (Args.empty())
- return NULL;
+ return nullptr;
return reinterpret_cast<LLVMRemarkArgRef>(
const_cast<Argument *>(Args.begin()));
}
@@ -119,13 +119,13 @@ LLVMRemarkEntryGetFirstArg(LLVMRemarkEntryRef Remark) {
extern "C" LLVMRemarkArgRef
LLVMRemarkEntryGetNextArg(LLVMRemarkArgRef ArgIt, LLVMRemarkEntryRef Remark) {
// No more arguments to iterate on.
- if (ArgIt == NULL)
- return NULL;
+ if (ArgIt == nullptr)
+ return nullptr;
auto It = (ArrayRef<Argument>::const_iterator)ArgIt;
auto Next = std::next(It);
if (Next == unwrap(Remark)->Args.end())
- return NULL;
+ return nullptr;
return reinterpret_cast<LLVMRemarkArgRef>(const_cast<Argument *>(Next));
}
diff --git a/llvm/lib/Remarks/RemarkStreamer.cpp b/llvm/lib/Remarks/RemarkStreamer.cpp
index 2f00b8e73670..543b00723659 100644
--- a/llvm/lib/Remarks/RemarkStreamer.cpp
+++ b/llvm/lib/Remarks/RemarkStreamer.cpp
@@ -26,7 +26,7 @@ static cl::opt<cl::boolOrDefault> EnableRemarksSection(
RemarkStreamer::RemarkStreamer(
std::unique_ptr<remarks::RemarkSerializer> RemarkSerializer,
Optional<StringRef> FilenameIn)
- : PassFilter(), RemarkSerializer(std::move(RemarkSerializer)),
+ : RemarkSerializer(std::move(RemarkSerializer)),
Filename(FilenameIn ? Optional<std::string>(FilenameIn->str()) : None) {}
Error RemarkStreamer::setFilter(StringRef Filter) {
diff --git a/llvm/lib/Remarks/RemarkStringTable.cpp b/llvm/lib/Remarks/RemarkStringTable.cpp
index 5f462f01bb9a..03d93baba038 100644
--- a/llvm/lib/Remarks/RemarkStringTable.cpp
+++ b/llvm/lib/Remarks/RemarkStringTable.cpp
@@ -20,7 +20,7 @@
using namespace llvm;
using namespace llvm::remarks;
-StringTable::StringTable(const ParsedStringTable &Other) : StrTab() {
+StringTable::StringTable(const ParsedStringTable &Other) {
for (unsigned i = 0, e = Other.size(); i < e; ++i)
if (Expected<StringRef> MaybeStr = Other[i])
add(*MaybeStr);
diff --git a/llvm/lib/Remarks/YAMLRemarkParser.cpp b/llvm/lib/Remarks/YAMLRemarkParser.cpp
index 3d9996c931ae..a32629c9f557 100644
--- a/llvm/lib/Remarks/YAMLRemarkParser.cpp
+++ b/llvm/lib/Remarks/YAMLRemarkParser.cpp
@@ -171,7 +171,7 @@ YAMLRemarkParser::YAMLRemarkParser(StringRef Buf)
YAMLRemarkParser::YAMLRemarkParser(StringRef Buf,
Optional<ParsedStringTable> StrTab)
- : RemarkParser{Format::YAML}, StrTab(std::move(StrTab)), LastErrorMessage(),
+ : RemarkParser{Format::YAML}, StrTab(std::move(StrTab)),
SM(setupSM(LastErrorMessage)), Stream(Buf, SM), YAMLIt(Stream.begin()) {}
Error YAMLRemarkParser::error(StringRef Message, yaml::Node &Node) {
diff --git a/llvm/lib/Support/AArch64TargetParser.cpp b/llvm/lib/Support/AArch64TargetParser.cpp
index 4bc9c8487131..cdf7c8ade9aa 100644
--- a/llvm/lib/Support/AArch64TargetParser.cpp
+++ b/llvm/lib/Support/AArch64TargetParser.cpp
@@ -114,6 +114,12 @@ bool AArch64::getExtensionFeatures(uint64_t Extensions,
Features.push_back("+sme-f64");
if (Extensions & AArch64::AEK_SMEI64)
Features.push_back("+sme-i64");
+ if (Extensions & AArch64::AEK_HBC)
+ Features.push_back("+hbc");
+ if (Extensions & AArch64::AEK_MOPS)
+ Features.push_back("+mops");
+ if (Extensions & AArch64::AEK_PERFMON)
+ Features.push_back("+perfmon");
return true;
}
@@ -136,12 +142,16 @@ bool AArch64::getArchFeatures(AArch64::ArchKind AK,
Features.push_back("+v8.6a");
if (AK == AArch64::ArchKind::ARMV8_7A)
Features.push_back("+v8.7a");
+ if (AK == AArch64::ArchKind::ARMV8_8A)
+ Features.push_back("+v8.8a");
if (AK == AArch64::ArchKind::ARMV9A)
Features.push_back("+v9a");
if (AK == AArch64::ArchKind::ARMV9_1A)
Features.push_back("+v9.1a");
if (AK == AArch64::ArchKind::ARMV9_2A)
Features.push_back("+v9.2a");
+ if (AK == AArch64::ArchKind::ARMV9_3A)
+ Features.push_back("+v9.3a");
if(AK == AArch64::ArchKind::ARMV8R)
Features.push_back("+v8r");
diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index 4940b61602d1..b536e9a9a6d0 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -24,9 +24,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include <climits>
#include <cmath>
-#include <cstdlib>
#include <cstring>
using namespace llvm;
diff --git a/llvm/lib/Support/ARMAttributeParser.cpp b/llvm/lib/Support/ARMAttributeParser.cpp
index 241cfb1eedbe..908e56319025 100644
--- a/llvm/lib/Support/ARMAttributeParser.cpp
+++ b/llvm/lib/Support/ARMAttributeParser.cpp
@@ -9,8 +9,6 @@
#include "llvm/Support/ARMAttributeParser.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/LEB128.h"
#include "llvm/Support/ScopedPrinter.h"
using namespace llvm;
@@ -70,7 +68,7 @@ const ARMAttributeParser::DisplayHandler ARMAttributeParser::displayRoutines[] =
Error ARMAttributeParser::stringAttribute(AttrType tag) {
StringRef tagName =
- ELFAttrs::attrTypeAsString(tag, tagToStringMap, /*TagPrefix=*/false);
+ ELFAttrs::attrTypeAsString(tag, tagToStringMap, /*hasTagPrefix=*/false);
StringRef desc = de.getCStrRef(cursor);
if (sw) {
diff --git a/llvm/lib/Support/ARMTargetParser.cpp b/llvm/lib/Support/ARMTargetParser.cpp
index 4405ed176fe2..d7294b5b1074 100644
--- a/llvm/lib/Support/ARMTargetParser.cpp
+++ b/llvm/lib/Support/ARMTargetParser.cpp
@@ -77,6 +77,7 @@ unsigned ARM::parseArchVersion(StringRef Arch) {
case ArchKind::ARMV8_5A:
case ArchKind::ARMV8_6A:
case ArchKind::ARMV8_7A:
+ case ArchKind::ARMV8_8A:
case ArchKind::ARMV8R:
case ArchKind::ARMV8MBaseline:
case ArchKind::ARMV8MMainline:
@@ -85,6 +86,7 @@ unsigned ARM::parseArchVersion(StringRef Arch) {
case ArchKind::ARMV9A:
case ArchKind::ARMV9_1A:
case ArchKind::ARMV9_2A:
+ case ArchKind::ARMV9_3A:
return 9;
case ArchKind::INVALID:
return 0;
@@ -117,9 +119,11 @@ ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) {
case ArchKind::ARMV8_5A:
case ArchKind::ARMV8_6A:
case ArchKind::ARMV8_7A:
+ case ArchKind::ARMV8_8A:
case ArchKind::ARMV9A:
case ArchKind::ARMV9_1A:
case ArchKind::ARMV9_2A:
+ case ArchKind::ARMV9_3A:
return ProfileKind::A;
case ArchKind::ARMV2:
case ArchKind::ARMV2A:
@@ -164,10 +168,12 @@ StringRef ARM::getArchSynonym(StringRef Arch) {
.Case("v8.5a", "v8.5-a")
.Case("v8.6a", "v8.6-a")
.Case("v8.7a", "v8.7-a")
+ .Case("v8.8a", "v8.8-a")
.Case("v8r", "v8-r")
.Cases("v9", "v9a", "v9-a")
.Case("v9.1a", "v9.1-a")
.Case("v9.2a", "v9.2-a")
+ .Case("v9.3a", "v9.3-a")
.Case("v8m.base", "v8-m.base")
.Case("v8m.main", "v8-m.main")
.Case("v8.1m.main", "v8.1-m.main")
diff --git a/llvm/lib/Support/ARMWinEH.cpp b/llvm/lib/Support/ARMWinEH.cpp
index 2e2fcf28451f..8e7fa1149082 100644
--- a/llvm/lib/Support/ARMWinEH.cpp
+++ b/llvm/lib/Support/ARMWinEH.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/ARMWinEH.h"
-#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace ARM {
diff --git a/llvm/lib/Support/BinaryStreamError.cpp b/llvm/lib/Support/BinaryStreamError.cpp
index f22523f09ac8..9b8f6862b65c 100644
--- a/llvm/lib/Support/BinaryStreamError.cpp
+++ b/llvm/lib/Support/BinaryStreamError.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/BinaryStreamError.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/llvm/lib/Support/BlockFrequency.cpp b/llvm/lib/Support/BlockFrequency.cpp
index 2b63294f3789..702165ac480b 100644
--- a/llvm/lib/Support/BlockFrequency.cpp
+++ b/llvm/lib/Support/BlockFrequency.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
#include <cassert>
using namespace llvm;
diff --git a/llvm/lib/Support/Caching.cpp b/llvm/lib/Support/Caching.cpp
index 8c685640f791..d6902f660e39 100644
--- a/llvm/lib/Support/Caching.cpp
+++ b/llvm/lib/Support/Caching.cpp
@@ -30,8 +30,6 @@ Expected<FileCache> llvm::localCache(Twine CacheNameRef,
Twine TempFilePrefixRef,
Twine CacheDirectoryPathRef,
AddBufferFn AddBuffer) {
- if (std::error_code EC = sys::fs::create_directories(CacheDirectoryPathRef))
- return errorCodeToError(EC);
// Create local copies which are safely captured-by-copy in lambdas
SmallString<64> CacheName, TempFilePrefix, CacheDirectoryPath;
@@ -140,6 +138,12 @@ Expected<FileCache> llvm::localCache(Twine CacheNameRef,
};
return [=](size_t Task) -> Expected<std::unique_ptr<CachedFileStream>> {
+ // Create the cache directory if not already done. Doing this lazily
+ // ensures the filesystem isn't mutated until the cache is.
+ if (std::error_code EC = sys::fs::create_directories(
+ CacheDirectoryPath, /*IgnoreExisting=*/true))
+ return errorCodeToError(EC);
+
    // Write to a temporary to avoid a race condition
SmallString<64> TempFilenameModel;
sys::path::append(TempFilenameModel, CacheDirectoryPath,
diff --git a/llvm/lib/Support/CodeGenCoverage.cpp b/llvm/lib/Support/CodeGenCoverage.cpp
index 93f386b6e23d..73e0fb3edce8 100644
--- a/llvm/lib/Support/CodeGenCoverage.cpp
+++ b/llvm/lib/Support/CodeGenCoverage.cpp
@@ -27,7 +27,7 @@ CodeGenCoverage::CodeGenCoverage() {}
void CodeGenCoverage::setCovered(uint64_t RuleID) {
if (RuleCoverage.size() <= RuleID)
- RuleCoverage.resize(RuleID + 1, 0);
+ RuleCoverage.resize(RuleID + 1, false);
RuleCoverage[RuleID] = true;
}
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index 4153a69abf5d..71a6ebf2a72e 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -22,7 +22,7 @@
#include "llvm-c/Support.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
@@ -45,7 +45,6 @@
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdlib>
-#include <map>
#include <string>
using namespace llvm;
using namespace cl;
@@ -1078,11 +1077,45 @@ static bool hasUTF8ByteOrderMark(ArrayRef<char> S) {
return (S.size() >= 3 && S[0] == '\xef' && S[1] == '\xbb' && S[2] == '\xbf');
}
+// Substitute <CFGDIR> with the file's base path.
+static void ExpandBasePaths(StringRef BasePath, StringSaver &Saver,
+ const char *&Arg) {
+ assert(sys::path::is_absolute(BasePath));
+ constexpr StringLiteral Token("<CFGDIR>");
+ const StringRef ArgString(Arg);
+
+ SmallString<128> ResponseFile;
+ StringRef::size_type StartPos = 0;
+ for (StringRef::size_type TokenPos = ArgString.find(Token);
+ TokenPos != StringRef::npos;
+ TokenPos = ArgString.find(Token, StartPos)) {
+    // The token may appear more than once per arg (e.g. in comma-separated
+    // linker args); handle any subsequent appearances by path-appending them.
+ const StringRef LHS = ArgString.substr(StartPos, TokenPos - StartPos);
+ if (ResponseFile.empty())
+ ResponseFile = LHS;
+ else
+ llvm::sys::path::append(ResponseFile, LHS);
+ ResponseFile.append(BasePath);
+ StartPos = TokenPos + Token.size();
+ }
+
+ if (!ResponseFile.empty()) {
+ // Path-append the remaining arg substring if at least one token appeared.
+ const StringRef Remaining = ArgString.substr(StartPos);
+ if (!Remaining.empty())
+ llvm::sys::path::append(ResponseFile, Remaining);
+ Arg = Saver.save(ResponseFile.str()).data();
+ }
+}
+
// FName must be an absolute path.
-static llvm::Error ExpandResponseFile(
- StringRef FName, StringSaver &Saver, TokenizerCallback Tokenizer,
- SmallVectorImpl<const char *> &NewArgv, bool MarkEOLs, bool RelativeNames,
- llvm::vfs::FileSystem &FS) {
+static llvm::Error ExpandResponseFile(StringRef FName, StringSaver &Saver,
+ TokenizerCallback Tokenizer,
+ SmallVectorImpl<const char *> &NewArgv,
+ bool MarkEOLs, bool RelativeNames,
+ bool ExpandBasePath,
+ llvm::vfs::FileSystem &FS) {
assert(sys::path::is_absolute(FName));
llvm::ErrorOr<std::unique_ptr<MemoryBuffer>> MemBufOrErr =
FS.getBufferForFile(FName);
@@ -1116,8 +1149,15 @@ static llvm::Error ExpandResponseFile(
// file, replace the included response file names with their full paths
// obtained by required resolution.
for (auto &Arg : NewArgv) {
+ if (!Arg)
+ continue;
+
+ // Substitute <CFGDIR> with the file's base path.
+ if (ExpandBasePath)
+ ExpandBasePaths(BasePath, Saver, Arg);
+
// Skip non-rsp file arguments.
- if (!Arg || Arg[0] != '@')
+ if (Arg[0] != '@')
continue;
StringRef FileName(Arg + 1);
@@ -1129,7 +1169,7 @@ static llvm::Error ExpandResponseFile(
ResponseFile.push_back('@');
ResponseFile.append(BasePath);
llvm::sys::path::append(ResponseFile, FileName);
- Arg = Saver.save(ResponseFile.c_str()).data();
+ Arg = Saver.save(ResponseFile.str()).data();
}
return Error::success();
}
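// Illustrative expansion (paths assumed): for a config file located at
// /home/user/cfg/clang.cfg the base path is /home/user/cfg, so an argument
//   -I<CFGDIR>/include
// read from that file is rewritten by ExpandBasePaths() above to
//   -I/home/user/cfg/include
// before any '@file' response-file expansion is attempted on it.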
@@ -1138,7 +1178,7 @@ static llvm::Error ExpandResponseFile(
/// StringSaver and tokenization strategy.
bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
SmallVectorImpl<const char *> &Argv, bool MarkEOLs,
- bool RelativeNames,
+ bool RelativeNames, bool ExpandBasePath,
llvm::Optional<llvm::StringRef> CurrentDir,
llvm::vfs::FileSystem &FS) {
bool AllExpanded = true;
@@ -1218,7 +1258,7 @@ bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
SmallVector<const char *, 0> ExpandedArgv;
if (llvm::Error Err =
ExpandResponseFile(FName, Saver, Tokenizer, ExpandedArgv, MarkEOLs,
- RelativeNames, FS)) {
+ RelativeNames, ExpandBasePath, FS)) {
// We couldn't read this file, so we leave it in the argument stream and
// move on.
// TODO: The error should be propagated up the stack.
@@ -1250,11 +1290,11 @@ bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
SmallVectorImpl<const char *> &Argv, bool MarkEOLs,
- bool RelativeNames,
+ bool RelativeNames, bool ExpandBasePath,
llvm::Optional<StringRef> CurrentDir) {
return ExpandResponseFiles(Saver, std::move(Tokenizer), Argv, MarkEOLs,
- RelativeNames, std::move(CurrentDir),
- *vfs::getRealFileSystem());
+ RelativeNames, ExpandBasePath,
+ std::move(CurrentDir), *vfs::getRealFileSystem());
}
bool cl::expandResponseFiles(int Argc, const char *const *Argv,
@@ -1281,16 +1321,17 @@ bool cl::readConfigFile(StringRef CfgFile, StringSaver &Saver,
llvm::sys::path::append(AbsPath, CfgFile);
CfgFile = AbsPath.str();
}
- if (llvm::Error Err =
- ExpandResponseFile(CfgFile, Saver, cl::tokenizeConfigFile, Argv,
- /*MarkEOLs=*/false, /*RelativeNames=*/true,
- *llvm::vfs::getRealFileSystem())) {
+ if (llvm::Error Err = ExpandResponseFile(
+ CfgFile, Saver, cl::tokenizeConfigFile, Argv,
+ /*MarkEOLs=*/false, /*RelativeNames=*/true, /*ExpandBasePath=*/true,
+ *llvm::vfs::getRealFileSystem())) {
// TODO: The error should be propagated up the stack.
llvm::consumeError(std::move(Err));
return false;
}
return ExpandResponseFiles(Saver, cl::tokenizeConfigFile, Argv,
- /*MarkEOLs=*/false, /*RelativeNames=*/true);
+ /*MarkEOLs=*/false, /*RelativeNames=*/true,
+ /*ExpandBasePath=*/true, llvm::None);
}
static void initCommonOptions();
@@ -2297,7 +2338,7 @@ public:
protected:
void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) override {
std::vector<OptionCategory *> SortedCategories;
- std::map<OptionCategory *, std::vector<Option *>> CategorizedOptions;
+ DenseMap<OptionCategory *, std::vector<Option *>> CategorizedOptions;
// Collect registered option categories into vector in preparation for
// sorting.
@@ -2309,17 +2350,13 @@ protected:
array_pod_sort(SortedCategories.begin(), SortedCategories.end(),
OptionCategoryCompare);
- // Create map to empty vectors.
- for (OptionCategory *Category : SortedCategories)
- CategorizedOptions[Category] = std::vector<Option *>();
-
// Walk through pre-sorted options and assign into categories.
// Because the options are already alphabetically sorted the
// options within categories will also be alphabetically sorted.
for (size_t I = 0, E = Opts.size(); I != E; ++I) {
Option *Opt = Opts[I].second;
for (auto &Cat : Opt->Categories) {
- assert(CategorizedOptions.count(Cat) > 0 &&
+ assert(find(SortedCategories, Cat) != SortedCategories.end() &&
"Option has an unregistered category");
CategorizedOptions[Cat].push_back(Opt);
}
diff --git a/llvm/lib/Support/CrashRecoveryContext.cpp b/llvm/lib/Support/CrashRecoveryContext.cpp
index b6aaf373a522..2ee3074b840e 100644
--- a/llvm/lib/Support/CrashRecoveryContext.cpp
+++ b/llvm/lib/Support/CrashRecoveryContext.cpp
@@ -9,7 +9,6 @@
#include "llvm/Support/CrashRecoveryContext.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ExitCodes.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/ThreadLocal.h"
@@ -17,6 +16,10 @@
#include <mutex>
#include <setjmp.h>
+#if !defined(_MSC_VER) && !defined(_WIN32)
+#include "llvm/Support/ExitCodes.h"
+#endif
+
using namespace llvm;
namespace {
diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp
index a6daee00bd43..f1b730e2b58c 100644
--- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp
+++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp
@@ -37,7 +37,6 @@
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
-#include <iterator>
#include <map>
using namespace llvm;
diff --git a/llvm/lib/Support/DataExtractor.cpp b/llvm/lib/Support/DataExtractor.cpp
index 133d674275e8..8cf312191153 100644
--- a/llvm/lib/Support/DataExtractor.cpp
+++ b/llvm/lib/Support/DataExtractor.cpp
@@ -9,7 +9,6 @@
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Host.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/SwapByteOrder.h"
diff --git a/llvm/lib/Support/DivisionByConstantInfo.cpp b/llvm/lib/Support/DivisionByConstantInfo.cpp
index 077629670e40..69f39386798c 100644
--- a/llvm/lib/Support/DivisionByConstantInfo.cpp
+++ b/llvm/lib/Support/DivisionByConstantInfo.cpp
@@ -67,7 +67,7 @@ UnsignedDivisonByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
unsigned P;
APInt NC, Delta, Q1, R1, Q2, R2;
struct UnsignedDivisonByConstantInfo Retval;
- Retval.IsAdd = 0; // initialize "add" indicator
+ Retval.IsAdd = false; // initialize "add" indicator
APInt AllOnes = APInt::getAllOnes(D.getBitWidth()).lshr(LeadingZeros);
APInt SignedMin = APInt::getSignedMinValue(D.getBitWidth());
APInt SignedMax = APInt::getSignedMaxValue(D.getBitWidth());
@@ -89,12 +89,12 @@ UnsignedDivisonByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
}
if ((R2 + 1).uge(D - R2)) {
if (Q2.uge(SignedMax))
- Retval.IsAdd = 1;
+ Retval.IsAdd = true;
Q2 = Q2 + Q2 + 1; // update Q2
R2 = R2 + R2 + 1 - D; // update R2
} else {
if (Q2.uge(SignedMin))
- Retval.IsAdd = 1;
+ Retval.IsAdd = true;
Q2 = Q2 + Q2; // update Q2
R2 = R2 + R2 + 1; // update R2
}
diff --git a/llvm/lib/Support/ELFAttributeParser.cpp b/llvm/lib/Support/ELFAttributeParser.cpp
index 1206553343ef..cf8a666e92bc 100644
--- a/llvm/lib/Support/ELFAttributeParser.cpp
+++ b/llvm/lib/Support/ELFAttributeParser.cpp
@@ -7,10 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/ELFAttributeParser.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Errc.h"
-#include "llvm/Support/LEB128.h"
#include "llvm/Support/ScopedPrinter.h"
using namespace llvm;
diff --git a/llvm/lib/Support/FileOutputBuffer.cpp b/llvm/lib/Support/FileOutputBuffer.cpp
index 4b4406c4c9f4..c11ee59da0dd 100644
--- a/llvm/lib/Support/FileOutputBuffer.cpp
+++ b/llvm/lib/Support/FileOutputBuffer.cpp
@@ -11,11 +11,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/FileOutputBuffer.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Memory.h"
-#include "llvm/Support/Path.h"
#include <system_error>
#if !defined(_MSC_VER) && !defined(__MINGW32__)
diff --git a/llvm/lib/Support/FileUtilities.cpp b/llvm/lib/Support/FileUtilities.cpp
index dbe28e56b2c3..489b8d119e6f 100644
--- a/llvm/lib/Support/FileUtilities.cpp
+++ b/llvm/lib/Support/FileUtilities.cpp
@@ -12,16 +12,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/FileUtilities.h"
-#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
-#include <cctype>
-#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
diff --git a/llvm/lib/Support/GraphWriter.cpp b/llvm/lib/Support/GraphWriter.cpp
index 696e6b7a99d8..e875e18a7e92 100644
--- a/llvm/lib/Support/GraphWriter.cpp
+++ b/llvm/lib/Support/GraphWriter.cpp
@@ -18,7 +18,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Config/config.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ErrorOr.h"
@@ -26,7 +25,11 @@
#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/raw_ostream.h"
-#include <cassert>
+
+#ifdef __APPLE__
+#include "llvm/Support/CommandLine.h"
+#endif
+
#include <string>
#include <system_error>
#include <vector>
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index 7b14616f6fea..9a4470289bcf 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -83,12 +83,12 @@ StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) {
StringRef::const_iterator CIP = CPUInfoStart;
- StringRef::const_iterator CPUStart = 0;
+ StringRef::const_iterator CPUStart = nullptr;
size_t CPULen = 0;
// We need to find the first line which starts with cpu, spaces, and a colon.
// After the colon, there may be some additional spaces and then the cpu type.
- while (CIP < CPUInfoEnd && CPUStart == 0) {
+ while (CIP < CPUInfoEnd && CPUStart == nullptr) {
if (CIP < CPUInfoEnd && *CIP == '\n')
++CIP;
@@ -118,12 +118,12 @@ StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) {
}
}
- if (CPUStart == 0)
+ if (CPUStart == nullptr)
while (CIP < CPUInfoEnd && *CIP != '\n')
++CIP;
}
- if (CPUStart == 0)
+ if (CPUStart == nullptr)
return generic;
return StringSwitch<const char *>(StringRef(CPUStart, CPULen))
@@ -213,6 +213,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
.Case("0xd44", "cortex-x1")
.Case("0xd0c", "neoverse-n1")
.Case("0xd49", "neoverse-n2")
+ .Case("0xd40", "neoverse-v1")
.Default("generic");
}
diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp
index 152de6ebae0a..2b7173b28940 100644
--- a/llvm/lib/Support/InitLLVM.cpp
+++ b/llvm/lib/Support/InitLLVM.cpp
@@ -7,14 +7,15 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/InitLLVM.h"
-#include "llvm/Support/Error.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/PrettyStackTrace.h"
-#include "llvm/Support/Process.h"
#include "llvm/Support/Signals.h"
-#include <string>
+#include "llvm/Support/SwapByteOrder.h"
#ifdef _WIN32
+#include "llvm/Support/Error.h"
#include "llvm/Support/Windows/WindowsSupport.h"
#endif
diff --git a/llvm/lib/Support/JSON.cpp b/llvm/lib/Support/JSON.cpp
index 17b36ed51850..20babbe56d86 100644
--- a/llvm/lib/Support/JSON.cpp
+++ b/llvm/lib/Support/JSON.cpp
@@ -12,6 +12,7 @@
#include "llvm/Support/Error.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/NativeFormatting.h"
#include <cctype>
namespace llvm {
diff --git a/llvm/lib/Support/LowLevelType.cpp b/llvm/lib/Support/LowLevelType.cpp
index ecf557997ad1..0282cd9bd79e 100644
--- a/llvm/lib/Support/LowLevelType.cpp
+++ b/llvm/lib/Support/LowLevelType.cpp
@@ -17,7 +17,7 @@ using namespace llvm;
LLT::LLT(MVT VT) {
if (VT.isVector()) {
- bool asVector = VT.getVectorNumElements() > 1;
+ bool asVector = VT.getVectorMinNumElements() > 1;
init(/*IsPointer=*/false, asVector, /*IsScalar=*/!asVector,
VT.getVectorElementCount(), VT.getVectorElementType().getSizeInBits(),
/*AddressSpace=*/0);
diff --git a/llvm/lib/Support/MD5.cpp b/llvm/lib/Support/MD5.cpp
index 9dceb4d418cd..caadde389504 100644
--- a/llvm/lib/Support/MD5.cpp
+++ b/llvm/lib/Support/MD5.cpp
@@ -40,10 +40,9 @@
#include "llvm/Support/MD5.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Endian.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
#include <array>
#include <cstdint>
#include <cstring>
@@ -281,14 +280,12 @@ StringRef MD5::result() {
SmallString<32> MD5::MD5Result::digest() const {
SmallString<32> Str;
- raw_svector_ostream Res(Str);
- for (int i = 0; i < 16; ++i)
- Res << format("%.2x", Bytes[i]);
+ toHex(Bytes, /*LowerCase*/ true, Str);
return Str;
}
-void MD5::stringifyResult(MD5Result &Result, SmallString<32> &Str) {
- Str = Result.digest();
+void MD5::stringifyResult(MD5Result &Result, SmallVectorImpl<char> &Str) {
+ toHex(Result.Bytes, /*LowerCase*/ true, Str);
}
std::array<uint8_t, 16> MD5::hash(ArrayRef<uint8_t> Data) {
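// Sketch of the affected API (illustrative): digest() and stringifyResult()
// now both produce lowercase hex via toHex() rather than raw_ostream/format.
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MD5.h"

static llvm::SmallString<32> md5Hex(llvm::StringRef Data) {
  llvm::MD5 Hash;
  Hash.update(Data);
  llvm::MD5::MD5Result Result;
  Hash.final(Result);
  return Result.digest(); // 32 lowercase hex characters
}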
diff --git a/llvm/lib/Support/MSP430AttributeParser.cpp b/llvm/lib/Support/MSP430AttributeParser.cpp
index a9948a158fc0..a230a3a70adb 100644
--- a/llvm/lib/Support/MSP430AttributeParser.cpp
+++ b/llvm/lib/Support/MSP430AttributeParser.cpp
@@ -7,7 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/MSP430AttributeParser.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
using namespace llvm::MSP430Attrs;
diff --git a/llvm/lib/Support/MemAlloc.cpp b/llvm/lib/Support/MemAlloc.cpp
index 7aaa0dc6e205..07a26cf26480 100644
--- a/llvm/lib/Support/MemAlloc.cpp
+++ b/llvm/lib/Support/MemAlloc.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/MemAlloc.h"
+#include <new>
// These are out of line to have __cpp_aligned_new not affect ABI.
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index d3fa3c6f065d..7816779cca1d 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -14,16 +14,15 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/Config/config.h"
#include "llvm/Support/AutoConvert.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Errc.h"
-#include "llvm/Support/Errno.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/SmallVectorMemoryBuffer.h"
#include <cassert>
-#include <cerrno>
#include <cstring>
#include <new>
#include <sys/types.h>
@@ -220,28 +219,16 @@ public:
MemoryBuffer::BufferKind getBufferKind() const override {
return MemoryBuffer::MemoryBuffer_MMap;
}
+
+ void dontNeedIfMmap() override { MFR.dontNeed(); }
};
} // namespace
static ErrorOr<std::unique_ptr<WritableMemoryBuffer>>
getMemoryBufferForStream(sys::fs::file_t FD, const Twine &BufferName) {
- const ssize_t ChunkSize = 4096*4;
- SmallString<ChunkSize> Buffer;
-
- // Read into Buffer until we hit EOF.
- size_t Size = Buffer.size();
- for (;;) {
- Buffer.resize_for_overwrite(Size + ChunkSize);
- Expected<size_t> ReadBytes = sys::fs::readNativeFile(
- FD, makeMutableArrayRef(Buffer.begin() + Size, ChunkSize));
- if (!ReadBytes)
- return errorToErrorCode(ReadBytes.takeError());
- if (*ReadBytes == 0)
- break;
- Size += *ReadBytes;
- }
- Buffer.truncate(Size);
-
+ SmallString<sys::fs::DefaultReadChunkSize> Buffer;
+ if (Error E = sys::fs::readNativeFileToEOF(FD, Buffer))
+ return errorToErrorCode(std::move(E));
return getMemBufferCopyImpl(Buffer, BufferName);
}
diff --git a/llvm/lib/Support/NativeFormatting.cpp b/llvm/lib/Support/NativeFormatting.cpp
index 254d18d797b3..0a797046bb68 100644
--- a/llvm/lib/Support/NativeFormatting.cpp
+++ b/llvm/lib/Support/NativeFormatting.cpp
@@ -13,7 +13,6 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include <float.h>
using namespace llvm;
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 71e3a1362f7e..4977c188f934 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -174,3 +174,35 @@ void TaskGroup::spawn(std::function<void()> F) {
} // namespace parallel
} // namespace llvm
#endif // LLVM_ENABLE_THREADS
+
+void llvm::parallelForEachN(size_t Begin, size_t End,
+ llvm::function_ref<void(size_t)> Fn) {
+ // If we have zero or one items, then do not incur the overhead of spinning up
+ // a task group. They are surprisingly expensive, and because they do not
+ // support nested parallelism, a single entry task group can block parallel
+ // execution underneath them.
+#if LLVM_ENABLE_THREADS
+ auto NumItems = End - Begin;
+ if (NumItems > 1 && parallel::strategy.ThreadsRequested != 1) {
+ // Limit the number of tasks to MaxTasksPerGroup to limit job scheduling
+ // overhead on large inputs.
+ auto TaskSize = NumItems / parallel::detail::MaxTasksPerGroup;
+ if (TaskSize == 0)
+ TaskSize = 1;
+
+ parallel::detail::TaskGroup TG;
+ for (; Begin + TaskSize < End; Begin += TaskSize) {
+ TG.spawn([=, &Fn] {
+ for (size_t I = Begin, E = Begin + TaskSize; I != E; ++I)
+ Fn(I);
+ });
+ }
+ for (; Begin != End; ++Begin)
+ Fn(Begin);
+ return;
+ }
+#endif
+
+ for (; Begin != End; ++Begin)
+ Fn(Begin);
+}
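// Usage sketch (illustrative): because of the chunking above, a caller like
// this spawns roughly MaxTasksPerGroup tasks regardless of the range size.
#include "llvm/Support/Parallel.h"
#include <atomic>
#include <vector>

static long long sumOfSquares(const std::vector<int> &V) {
  std::atomic<long long> Sum{0};
  llvm::parallelForEachN(0, V.size(), [&](size_t I) {
    // The callback may run concurrently; touch only distinct or atomic state.
    Sum += static_cast<long long>(V[I]) * V[I];
  });
  return Sum;
}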
diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp
index 7c99d088911c..63d8d4ee4648 100644
--- a/llvm/lib/Support/Path.cpp
+++ b/llvm/lib/Support/Path.cpp
@@ -12,6 +12,7 @@
#include "llvm/Support/Path.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/Config/config.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Endian.h"
@@ -1167,6 +1168,25 @@ const char *mapped_file_region::const_data() const {
return reinterpret_cast<const char *>(Mapping);
}
+Error readNativeFileToEOF(file_t FileHandle, SmallVectorImpl<char> &Buffer,
+ ssize_t ChunkSize) {
+ // Install a handler to truncate the buffer to the correct size on exit.
+ size_t Size = Buffer.size();
+ auto TruncateOnExit = make_scope_exit([&]() { Buffer.truncate(Size); });
+
+ // Read into Buffer until we hit EOF.
+ for (;;) {
+ Buffer.resize_for_overwrite(Size + ChunkSize);
+ Expected<size_t> ReadBytes = readNativeFile(
+ FileHandle, makeMutableArrayRef(Buffer.begin() + Size, ChunkSize));
+ if (!ReadBytes)
+ return ReadBytes.takeError();
+ if (*ReadBytes == 0)
+ return Error::success();
+ Size += *ReadBytes;
+ }
+}
+
} // end namespace fs
} // end namespace sys
} // end namespace llvm
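// Usage sketch (illustrative, error handling reduced to a bool) for the new
// helper; it appends to Buffer, growing it ChunkSize bytes at a time until EOF.
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"

static bool readAll(llvm::sys::fs::file_t FD, llvm::SmallString<256> &Out) {
  if (llvm::Error E = llvm::sys::fs::readNativeFileToEOF(FD, Out)) {
    llvm::consumeError(std::move(E));
    return false;
  }
  return true;
}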
@@ -1234,7 +1254,8 @@ Error TempFile::keep(const Twine &Name) {
#ifdef _WIN32
// If we can't cancel the delete don't rename.
auto H = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
- std::error_code RenameEC = setDeleteDisposition(H, false);
+ std::error_code RenameEC =
+ RemoveOnClose ? std::error_code() : setDeleteDisposition(H, false);
bool ShouldDelete = false;
if (!RenameEC) {
RenameEC = rename_handle(H, Name);
diff --git a/llvm/lib/Support/PrettyStackTrace.cpp b/llvm/lib/Support/PrettyStackTrace.cpp
index 0d07057f1df0..fa91405fee10 100644
--- a/llvm/lib/Support/PrettyStackTrace.cpp
+++ b/llvm/lib/Support/PrettyStackTrace.cpp
@@ -13,7 +13,6 @@
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm-c/ErrorHandling.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/Config/config.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/SaveAndRestore.h"
@@ -21,6 +20,10 @@
#include "llvm/Support/Watchdog.h"
#include "llvm/Support/raw_ostream.h"
+#ifdef __APPLE__
+#include "llvm/ADT/SmallString.h"
+#endif
+
#include <atomic>
#include <cassert>
#include <cstdarg>
diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp
index e2e4340f44e9..6c59d8a7ef04 100644
--- a/llvm/lib/Support/RISCVISAInfo.cpp
+++ b/llvm/lib/Support/RISCVISAInfo.cpp
@@ -9,6 +9,7 @@
#include "llvm/Support/RISCVISAInfo.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Errc.h"
@@ -46,25 +47,56 @@ static const RISCVSupportedExtension SupportedExtensions[] = {
{"f", RISCVExtensionVersion{2, 0}},
{"d", RISCVExtensionVersion{2, 0}},
{"c", RISCVExtensionVersion{2, 0}},
-};
-static const RISCVSupportedExtension SupportedExperimentalExtensions[] = {
- {"v", RISCVExtensionVersion{0, 10}},
+ {"zfhmin", RISCVExtensionVersion{1, 0}},
+ {"zfh", RISCVExtensionVersion{1, 0}},
+
{"zba", RISCVExtensionVersion{1, 0}},
{"zbb", RISCVExtensionVersion{1, 0}},
{"zbc", RISCVExtensionVersion{1, 0}},
+ {"zbs", RISCVExtensionVersion{1, 0}},
+
+ {"zbkb", RISCVExtensionVersion{1, 0}},
+ {"zbkc", RISCVExtensionVersion{1, 0}},
+ {"zbkx", RISCVExtensionVersion{1, 0}},
+ {"zknd", RISCVExtensionVersion{1, 0}},
+ {"zkne", RISCVExtensionVersion{1, 0}},
+ {"zknh", RISCVExtensionVersion{1, 0}},
+ {"zksed", RISCVExtensionVersion{1, 0}},
+ {"zksh", RISCVExtensionVersion{1, 0}},
+ {"zkr", RISCVExtensionVersion{1, 0}},
+ {"zkn", RISCVExtensionVersion{1, 0}},
+ {"zks", RISCVExtensionVersion{1, 0}},
+ {"zkt", RISCVExtensionVersion{1, 0}},
+ {"zk", RISCVExtensionVersion{1, 0}},
+
+ {"v", RISCVExtensionVersion{1, 0}},
+ {"zvl32b", RISCVExtensionVersion{1, 0}},
+ {"zvl64b", RISCVExtensionVersion{1, 0}},
+ {"zvl128b", RISCVExtensionVersion{1, 0}},
+ {"zvl256b", RISCVExtensionVersion{1, 0}},
+ {"zvl512b", RISCVExtensionVersion{1, 0}},
+ {"zvl1024b", RISCVExtensionVersion{1, 0}},
+ {"zvl2048b", RISCVExtensionVersion{1, 0}},
+ {"zvl4096b", RISCVExtensionVersion{1, 0}},
+ {"zvl8192b", RISCVExtensionVersion{1, 0}},
+ {"zvl16384b", RISCVExtensionVersion{1, 0}},
+ {"zvl32768b", RISCVExtensionVersion{1, 0}},
+ {"zvl65536b", RISCVExtensionVersion{1, 0}},
+ {"zve32x", RISCVExtensionVersion{1, 0}},
+ {"zve32f", RISCVExtensionVersion{1, 0}},
+ {"zve64x", RISCVExtensionVersion{1, 0}},
+ {"zve64f", RISCVExtensionVersion{1, 0}},
+ {"zve64d", RISCVExtensionVersion{1, 0}},
+};
+
+static const RISCVSupportedExtension SupportedExperimentalExtensions[] = {
{"zbe", RISCVExtensionVersion{0, 93}},
{"zbf", RISCVExtensionVersion{0, 93}},
{"zbm", RISCVExtensionVersion{0, 93}},
{"zbp", RISCVExtensionVersion{0, 93}},
{"zbr", RISCVExtensionVersion{0, 93}},
- {"zbs", RISCVExtensionVersion{1, 0}},
{"zbt", RISCVExtensionVersion{0, 93}},
-
- {"zvlsseg", RISCVExtensionVersion{0, 10}},
-
- {"zfhmin", RISCVExtensionVersion{0, 1}},
- {"zfh", RISCVExtensionVersion{0, 1}},
};
static bool stripExperimentalPrefix(StringRef &Ext) {
@@ -78,9 +110,9 @@ static bool stripExperimentalPrefix(StringRef &Ext) {
// NOTE: This function is NOT able to take empty strings or strings that only
// have version numbers and no extension name. It assumes the extension name
// is more than one character long.
-static size_t findFirstNonVersionCharacter(const StringRef &Ext) {
- if (Ext.size() == 0)
- llvm_unreachable("Already guarded by if-statement in ::parseArchString");
+static size_t findFirstNonVersionCharacter(StringRef Ext) {
+ assert(!Ext.empty() &&
+ "Already guarded by if-statement in ::parseArchString");
int Pos = Ext.size() - 1;
while (Pos > 0 && isDigit(Ext[Pos]))
@@ -276,16 +308,13 @@ bool RISCVISAInfo::compareExtension(const std::string &LHS,
void RISCVISAInfo::toFeatures(
std::vector<StringRef> &Features,
std::function<StringRef(const Twine &)> StrAlloc) const {
- for (auto &Ext : Exts) {
+ for (auto const &Ext : Exts) {
StringRef ExtName = Ext.first;
if (ExtName == "i")
continue;
- if (ExtName == "zvlsseg") {
- Features.push_back("+experimental-v");
- Features.push_back("+experimental-zvlsseg");
- } else if (isExperimentalExtension(ExtName)) {
+ if (isExperimentalExtension(ExtName)) {
Features.push_back(StrAlloc("+experimental-" + ExtName));
} else {
Features.push_back(StrAlloc("+" + ExtName));
@@ -434,6 +463,8 @@ RISCVISAInfo::parseFeatures(unsigned XLen,
ISAInfo->updateImplication();
ISAInfo->updateFLen();
+ ISAInfo->updateMinVLen();
+ ISAInfo->updateMaxELen();
if (Error Result = ISAInfo->checkDependency())
return std::move(Result);
@@ -657,6 +688,8 @@ RISCVISAInfo::parseArchString(StringRef Arch, bool EnableExperimentalExtension,
ISAInfo->updateImplication();
ISAInfo->updateFLen();
+ ISAInfo->updateMinVLen();
+ ISAInfo->updateMaxELen();
if (Error Result = ISAInfo->checkDependency())
return std::move(Result);
@@ -669,6 +702,12 @@ Error RISCVISAInfo::checkDependency() {
bool HasE = Exts.count("e") == 1;
bool HasD = Exts.count("d") == 1;
bool HasF = Exts.count("f") == 1;
+ bool HasZve32x = Exts.count("zve32x") == 1;
+ bool HasZve32f = Exts.count("zve32f") == 1;
+ bool HasZve64d = Exts.count("zve64d") == 1;
+ bool HasV = Exts.count("v") == 1;
+ bool HasVector = HasZve32x || HasV;
+ bool HasZvl = MinVLen != 0;
if (HasE && !IsRv32)
return createStringError(
@@ -683,6 +722,29 @@ Error RISCVISAInfo::checkDependency() {
return createStringError(errc::invalid_argument,
"d requires f extension to also be specified");
+ // FIXME: Consider Zfinx in the future
+ if (HasZve32f && !HasF)
+ return createStringError(
+ errc::invalid_argument,
+ "zve32f requires f extension to also be specified");
+
+ // FIXME: Consider Zdinx in the future
+ if (HasZve64d && !HasD)
+ return createStringError(
+ errc::invalid_argument,
+ "zve64d requires d extension to also be specified");
+
+ if (HasZvl && !HasVector)
+ return createStringError(
+ errc::invalid_argument,
+ "zvl*b requires v or zve* extension to also be specified");
+
+ // The Zve* extensions and the V extension cannot be specified together.
+ if (HasZve32x && HasV)
+ return createStringError(
+ errc::invalid_argument,
+ "It is illegal to specify the v extension with zve* extensions");
+
// Additional dependency checks.
// TODO: The 'q' extension requires rv64.
// TODO: It is illegal to specify 'e' extensions with 'f' and 'd'.
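
For context, a minimal sketch of how the new zve*/zvl* dependency errors surface to a caller. This is illustrative only and assumes the RISCVISAInfo::parseArchString entry point and header location used elsewhere in this patch; it is not part of the change itself.

// Illustrative only; assumes the RISCVISAInfo interface used in this patch.
#include "llvm/Support/Error.h"
#include "llvm/Support/RISCVISAInfo.h"
#include "llvm/Support/raw_ostream.h"

static void tryArch(llvm::StringRef Arch) {
  auto Info = llvm::RISCVISAInfo::parseArchString(
      Arch, /*EnableExperimentalExtension=*/true);
  if (!Info) {
    llvm::errs() << Arch << ": " << llvm::toString(Info.takeError()) << "\n";
    return;
  }
  llvm::outs() << Arch << " -> " << (*Info)->toString() << "\n";
}
// tryArch("rv32i_zve32f");  // rejected: zve32f requires f
// tryArch("rv64i_zvl128b"); // rejected: zvl*b requires v or zve*
// tryArch("rv64iv");        // accepted: v implies f, d and zvl128b
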
@@ -690,8 +752,27 @@ Error RISCVISAInfo::checkDependency() {
return Error::success();
}
-static const char *ImpliedExtsV[] = {"zvlsseg"};
+static const char *ImpliedExtsV[] = {"zvl128b", "f", "d"};
static const char *ImpliedExtsZfh[] = {"zfhmin"};
+static const char *ImpliedExtsZve64d[] = {"zve64f"};
+static const char *ImpliedExtsZve64f[] = {"zve64x", "zve32f"};
+static const char *ImpliedExtsZve64x[] = {"zve32x", "zvl64b"};
+static const char *ImpliedExtsZve32f[] = {"zve32x"};
+static const char *ImpliedExtsZve32x[] = {"zvl32b"};
+static const char *ImpliedExtsZvl65536b[] = {"zvl32768b"};
+static const char *ImpliedExtsZvl32768b[] = {"zvl16384b"};
+static const char *ImpliedExtsZvl16384b[] = {"zvl8192b"};
+static const char *ImpliedExtsZvl8192b[] = {"zvl4096b"};
+static const char *ImpliedExtsZvl4096b[] = {"zvl2048b"};
+static const char *ImpliedExtsZvl2048b[] = {"zvl1024b"};
+static const char *ImpliedExtsZvl1024b[] = {"zvl512b"};
+static const char *ImpliedExtsZvl512b[] = {"zvl256b"};
+static const char *ImpliedExtsZvl256b[] = {"zvl128b"};
+static const char *ImpliedExtsZvl128b[] = {"zvl64b"};
+static const char *ImpliedExtsZvl64b[] = {"zvl32b"};
+static const char *ImpliedExtsZk[] = {"zkn", "zkt", "zkr"};
+static const char *ImpliedExtsZkn[] = {"zbkb", "zbkc", "zbkx", "zkne", "zknd", "zknh"};
+static const char *ImpliedExtsZks[] = {"zbkb", "zbkc", "zbkx", "zksed", "zksh"};
struct ImpliedExtsEntry {
StringLiteral Name;
@@ -707,6 +788,25 @@ struct ImpliedExtsEntry {
static constexpr ImpliedExtsEntry ImpliedExts[] = {
{{"v"}, {ImpliedExtsV}},
{{"zfh"}, {ImpliedExtsZfh}},
+ {{"zk"}, {ImpliedExtsZk}},
+ {{"zkn"}, {ImpliedExtsZkn}},
+ {{"zks"}, {ImpliedExtsZks}},
+ {{"zve32f"}, {ImpliedExtsZve32f}},
+ {{"zve32x"}, {ImpliedExtsZve32x}},
+ {{"zve64d"}, {ImpliedExtsZve64d}},
+ {{"zve64f"}, {ImpliedExtsZve64f}},
+ {{"zve64x"}, {ImpliedExtsZve64x}},
+ {{"zvl1024b"}, {ImpliedExtsZvl1024b}},
+ {{"zvl128b"}, {ImpliedExtsZvl128b}},
+ {{"zvl16384b"}, {ImpliedExtsZvl16384b}},
+ {{"zvl2048b"}, {ImpliedExtsZvl2048b}},
+ {{"zvl256b"}, {ImpliedExtsZvl256b}},
+ {{"zvl32768b"}, {ImpliedExtsZvl32768b}},
+ {{"zvl4096b"}, {ImpliedExtsZvl4096b}},
+ {{"zvl512b"}, {ImpliedExtsZvl512b}},
+ {{"zvl64b"}, {ImpliedExtsZvl64b}},
+ {{"zvl65536b"}, {ImpliedExtsZvl65536b}},
+ {{"zvl8192b"}, {ImpliedExtsZvl8192b}},
};
void RISCVISAInfo::updateImplication() {
@@ -721,12 +821,25 @@ void RISCVISAInfo::updateImplication() {
}
assert(llvm::is_sorted(ImpliedExts) && "Table not sorted by Name");
- for (auto &Ext : Exts) {
- auto I = llvm::lower_bound(ImpliedExts, Ext.first);
- if (I != std::end(ImpliedExts) && I->Name == Ext.first) {
- for (auto &ImpliedExt : I->Exts) {
+
+ // Implications can be layered, so the worklist may need more than one pass;
+ // the loop exits once no further implications are added.
+ SmallSetVector<StringRef, 16> WorkList;
+ for (auto const &Ext : Exts)
+ WorkList.insert(Ext.first);
+
+ while (!WorkList.empty()) {
+ StringRef ExtName = WorkList.pop_back_val();
+ auto I = llvm::lower_bound(ImpliedExts, ExtName);
+ if (I != std::end(ImpliedExts) && I->Name == ExtName) {
+ for (const char *ImpliedExt : I->Exts) {
+ if (WorkList.count(ImpliedExt))
+ continue;
+ if (Exts.count(ImpliedExt))
+ continue;
auto Version = findDefaultVersion(ImpliedExt);
addExtension(ImpliedExt, Version->Major, Version->Minor);
+ WorkList.insert(ImpliedExt);
}
}
}
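
The worklist above computes a transitive closure over the implication table, since an implied extension can itself imply others (v -> zvl128b -> zvl64b -> ...). A stand-alone sketch of the same idea, using hypothetical names and plain standard-library containers rather than the RISCVISAInfo internals:

// Stand-alone sketch of the transitive closure performed above; the names and
// container choices are hypothetical stand-ins.
#include <map>
#include <set>
#include <string>
#include <vector>

std::set<std::string> closeOverImplications(
    std::set<std::string> Exts,
    const std::map<std::string, std::vector<std::string>> &Implies) {
  std::vector<std::string> WorkList(Exts.begin(), Exts.end());
  while (!WorkList.empty()) {
    std::string Ext = WorkList.back();
    WorkList.pop_back();
    auto It = Implies.find(Ext);
    if (It == Implies.end())
      continue;
    for (const std::string &Implied : It->second)
      if (Exts.insert(Implied).second) // only newly added entries are revisited
        WorkList.push_back(Implied);
  }
  return Exts;
}
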
@@ -741,6 +854,41 @@ void RISCVISAInfo::updateFLen() {
FLen = 32;
}
+void RISCVISAInfo::updateMinVLen() {
+ for (auto const &Ext : Exts) {
+ StringRef ExtName = Ext.first;
+ bool IsZvlExt = ExtName.consume_front("zvl") && ExtName.consume_back("b");
+ if (IsZvlExt) {
+ unsigned ZvlLen;
+ if (!ExtName.getAsInteger(10, ZvlLen))
+ MinVLen = std::max(MinVLen, ZvlLen);
+ }
+ }
+}
+
+void RISCVISAInfo::updateMaxELen() {
+ // Handle the EEW restriction imposed by the Zve* sub-extensions.
+ for (auto const &Ext : Exts) {
+ StringRef ExtName = Ext.first;
+ bool IsZveExt = ExtName.consume_front("zve");
+ if (IsZveExt) {
+ if (ExtName.back() == 'f')
+ MaxELenFp = std::max(MaxELenFp, 32u);
+ if (ExtName.back() == 'd')
+ MaxELenFp = std::max(MaxELenFp, 64u);
+ ExtName = ExtName.drop_back();
+ unsigned ZveELen = 0;
+ if (!ExtName.getAsInteger(10, ZveELen))
+ MaxELen = std::max(MaxELen, ZveELen);
+ }
+ if (ExtName == "v") {
+ MaxELenFp = 64;
+ MaxELen = 64;
+ return;
+ }
+ }
+}
+
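
A short worked example of what the two helpers above derive: for "rv64if_zve64f_zvl256b", updateMinVLen() settles on MinVLen = 256 (zvl256b wins over the implied smaller zvl*b entries), while updateMaxELen() derives MaxELen = 64 and MaxELenFp = 32 from zve64f and its implied zve* extensions. The helper below is a hypothetical stand-alone version of the zvl*b decoding, not part of the patch:

// Hypothetical stand-alone version of the zvl*b decoding used in
// updateMinVLen(); illustrative only.
#include "llvm/ADT/StringRef.h"

static unsigned decodeZvlLen(llvm::StringRef Name) {
  unsigned Len = 0;
  if (Name.consume_front("zvl") && Name.consume_back("b"))
    if (Name.getAsInteger(10, Len)) // getAsInteger returns true on failure
      Len = 0;
  return Len;
}
// decodeZvlLen("zvl256b") == 256, decodeZvlLen("zvl32b") == 32,
// decodeZvlLen("zbkb") == 0
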
std::string RISCVISAInfo::toString() const {
std::string Buffer;
raw_string_ostream Arch(Buffer);
@@ -748,7 +896,7 @@ std::string RISCVISAInfo::toString() const {
Arch << "rv" << XLen;
ListSeparator LS("_");
- for (auto &Ext : Exts) {
+ for (auto const &Ext : Exts) {
StringRef ExtName = Ext.first;
auto ExtInfo = Ext.second;
Arch << LS << ExtName;
@@ -757,3 +905,17 @@ std::string RISCVISAInfo::toString() const {
return Arch.str();
}
+
+std::vector<std::string> RISCVISAInfo::toFeatureVector() const {
+ std::vector<std::string> FeatureVector;
+ for (auto const &Ext : Exts) {
+ std::string ExtName = Ext.first;
+ if (ExtName == "i") // i is not recognized in clang -cc1
+ continue;
+ std::string Feature = isExperimentalExtension(ExtName)
+ ? "+experimental-" + ExtName
+ : "+" + ExtName;
+ FeatureVector.push_back(Feature);
+ }
+ return FeatureVector;
+}
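
A minimal usage sketch for the new toFeatureVector() accessor; the surrounding driver context is hypothetical and assumes the parseArchString API used in this patch.

// Illustrative only: printing the backend feature strings derived from an
// ISA string.
#include "llvm/Support/Error.h"
#include "llvm/Support/RISCVISAInfo.h"
#include "llvm/Support/raw_ostream.h"

void dumpFeatures(llvm::StringRef Arch) {
  auto Info = llvm::RISCVISAInfo::parseArchString(
      Arch, /*EnableExperimentalExtension=*/false);
  if (!Info) {
    llvm::consumeError(Info.takeError());
    return;
  }
  for (const std::string &Feature : (*Info)->toFeatureVector())
    llvm::outs() << Feature << "\n";
  // For "rv64iv" this prints "+v" plus the implied "+f", "+d", "+zvl128b",
  // "+zvl64b" and "+zvl32b"; "i" itself is skipped, as noted above.
}
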
diff --git a/llvm/lib/Support/ScopedPrinter.cpp b/llvm/lib/Support/ScopedPrinter.cpp
index ea90a24eaced..a434e50e8c1f 100644
--- a/llvm/lib/Support/ScopedPrinter.cpp
+++ b/llvm/lib/Support/ScopedPrinter.cpp
@@ -1,7 +1,6 @@
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/Format.h"
-#include <cctype>
using namespace llvm::support;
diff --git a/llvm/lib/Support/Signals.cpp b/llvm/lib/Support/Signals.cpp
index c018dc92bf40..5ce41c987029 100644
--- a/llvm/lib/Support/Signals.cpp
+++ b/llvm/lib/Support/Signals.cpp
@@ -28,6 +28,7 @@
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Mutex.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/StringSaver.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/Support/Signposts.cpp b/llvm/lib/Support/Signposts.cpp
index 58fafb26cdf3..074dddc81c80 100644
--- a/llvm/lib/Support/Signposts.cpp
+++ b/llvm/lib/Support/Signposts.cpp
@@ -7,7 +7,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Signposts.h"
-#include "llvm/Support/Timer.h"
#include "llvm/Config/config.h"
#if LLVM_SUPPORT_XCODE_SIGNPOSTS
diff --git a/llvm/lib/Support/SmallPtrSet.cpp b/llvm/lib/Support/SmallPtrSet.cpp
index f6e2dfb8a6c9..cbb87ea8717c 100644
--- a/llvm/lib/Support/SmallPtrSet.cpp
+++ b/llvm/lib/Support/SmallPtrSet.cpp
@@ -13,7 +13,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemAlloc.h"
#include <algorithm>
diff --git a/llvm/lib/Support/SmallVector.cpp b/llvm/lib/Support/SmallVector.cpp
index 2d7721e4e1fb..8cafbc7fad0d 100644
--- a/llvm/lib/Support/SmallVector.cpp
+++ b/llvm/lib/Support/SmallVector.cpp
@@ -12,6 +12,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Support/MemAlloc.h"
#include <cstdint>
#ifdef LLVM_ENABLE_EXCEPTIONS
#include <stdexcept>
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index 1939ed9e9547..137b37f2b1c3 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -15,7 +15,6 @@
#include "llvm/Support/SpecialCaseList.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Regex.h"
#include "llvm/Support/VirtualFileSystem.h"
diff --git a/llvm/lib/Support/StringMap.cpp b/llvm/lib/Support/StringMap.cpp
index f65d3846623c..012c785b4351 100644
--- a/llvm/lib/Support/StringMap.cpp
+++ b/llvm/lib/Support/StringMap.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/DJB.h"
#include "llvm/Support/MathExtras.h"
diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp
index 652303fdb6a0..3ed08ed38661 100644
--- a/llvm/lib/Support/StringRef.cpp
+++ b/llvm/lib/Support/StringRef.cpp
@@ -597,3 +597,11 @@ bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {
hash_code llvm::hash_value(StringRef S) {
return hash_combine_range(S.begin(), S.end());
}
+
+unsigned DenseMapInfo<StringRef, void>::getHashValue(StringRef Val) {
+ assert(Val.data() != getEmptyKey().data() &&
+ "Cannot hash the empty key!");
+ assert(Val.data() != getTombstoneKey().data() &&
+ "Cannot hash the tombstone key!");
+ return (unsigned)(hash_value(Val));
+}
diff --git a/llvm/lib/Support/SymbolRemappingReader.cpp b/llvm/lib/Support/SymbolRemappingReader.cpp
index 1caf0947216e..90997ab0a6ce 100644
--- a/llvm/lib/Support/SymbolRemappingReader.cpp
+++ b/llvm/lib/Support/SymbolRemappingReader.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
using namespace llvm;
diff --git a/llvm/lib/Support/TargetParser.cpp b/llvm/lib/Support/TargetParser.cpp
index bc60bdea5f62..0105cd2e8153 100644
--- a/llvm/lib/Support/TargetParser.cpp
+++ b/llvm/lib/Support/TargetParser.cpp
@@ -13,10 +13,8 @@
#include "llvm/Support/TargetParser.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/ADT/Triple.h"
using namespace llvm;
using namespace AMDGPU;
diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp
index 54ea84d4bd6d..9f92ae1c7a7c 100644
--- a/llvm/lib/Support/ThreadPool.cpp
+++ b/llvm/lib/Support/ThreadPool.cpp
@@ -13,8 +13,12 @@
#include "llvm/Support/ThreadPool.h"
#include "llvm/Config/llvm-config.h"
+
+#if LLVM_ENABLE_THREADS
#include "llvm/Support/Threading.h"
+#else
#include "llvm/Support/raw_ostream.h"
+#endif
using namespace llvm;
@@ -117,6 +121,10 @@ void ThreadPool::wait() {
}
}
+bool ThreadPool::isWorkerThread() const {
+ report_fatal_error("LLVM compiled without multithreading");
+}
+
ThreadPool::~ThreadPool() { wait(); }
#endif
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index 2b094a4983a0..9380fa01c84a 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -11,10 +11,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/TimeProfiler.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/Support/CommandLine.h"
#include "llvm/Support/JSON.h"
+#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Threading.h"
diff --git a/llvm/lib/Support/ToolOutputFile.cpp b/llvm/lib/Support/ToolOutputFile.cpp
index c192ce60f31c..c2ca97a59c62 100644
--- a/llvm/lib/Support/ToolOutputFile.cpp
+++ b/llvm/lib/Support/ToolOutputFile.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/ADT/Triple.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Signals.h"
using namespace llvm;
diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp
index 2819dc0c139a..20dea8c302a5 100644
--- a/llvm/lib/Support/Triple.cpp
+++ b/llvm/lib/Support/Triple.cpp
@@ -14,7 +14,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/SwapByteOrder.h"
-#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/ARMTargetParser.h"
#include "llvm/Support/VersionTuple.h"
#include <cassert>
#include <cstring>
@@ -663,12 +663,16 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
return Triple::ARMSubArch_v8_6a;
case ARM::ArchKind::ARMV8_7A:
return Triple::ARMSubArch_v8_7a;
+ case ARM::ArchKind::ARMV8_8A:
+ return Triple::ARMSubArch_v8_8a;
case ARM::ArchKind::ARMV9A:
return Triple::ARMSubArch_v9;
case ARM::ArchKind::ARMV9_1A:
return Triple::ARMSubArch_v9_1a;
case ARM::ArchKind::ARMV9_2A:
return Triple::ARMSubArch_v9_2a;
+ case ARM::ArchKind::ARMV9_3A:
+ return Triple::ARMSubArch_v9_3a;
case ARM::ArchKind::ARMV8R:
return Triple::ARMSubArch_v8r;
case ARM::ArchKind::ARMV8MBaseline:
diff --git a/llvm/lib/Support/TypeSize.cpp b/llvm/lib/Support/TypeSize.cpp
index abb81016a0ba..a80fde83e3bc 100644
--- a/llvm/lib/Support/TypeSize.cpp
+++ b/llvm/lib/Support/TypeSize.cpp
@@ -8,6 +8,7 @@
#include "llvm/Support/TypeSize.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/WithColor.h"
#include "DebugOptions.h"
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index f5cb5895d95d..788460d657fe 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -273,7 +273,7 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
// the program, and not the eventual binary file. Therefore, call realpath
// so this behaves the same on all platforms.
#if _POSIX_VERSION >= 200112 || defined(__GLIBC__)
- if (char *real_path = realpath(exe_path, NULL)) {
+ if (char *real_path = realpath(exe_path, nullptr)) {
std::string ret = std::string(real_path);
free(real_path);
return ret;
@@ -380,20 +380,22 @@ std::error_code current_path(SmallVectorImpl<char> &result) {
return std::error_code();
}
- result.reserve(PATH_MAX);
+ result.resize_for_overwrite(PATH_MAX);
while (true) {
- if (::getcwd(result.data(), result.capacity()) == nullptr) {
+ if (::getcwd(result.data(), result.size()) == nullptr) {
// See if there was a real error.
- if (errno != ENOMEM)
+ if (errno != ENOMEM) {
+ result.clear();
return std::error_code(errno, std::generic_category());
+ }
// Otherwise there just wasn't enough space.
- result.reserve(result.capacity() * 2);
+ result.resize_for_overwrite(result.capacity() * 2);
} else
break;
}
- result.set_size(strlen(result.data()));
+ result.truncate(strlen(result.data()));
return std::error_code();
}
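
The same buffer-handling pattern recurs in the Windows hunks below: size the SmallVector up front with resize_for_overwrite() so data()/size() describe writable storage, grow and retry when the OS call reports truncation, then truncate() to the real length. A generic sketch of the pattern; the getcwd wrapper below is illustrative, not part of the patch.

// Illustrative sketch of the resize_for_overwrite()/truncate() retry pattern
// used above and in the Windows hunks below.
#include "llvm/ADT/SmallVector.h"
#include <cerrno>
#include <cstring>
#include <system_error>
#include <unistd.h>

std::error_code getWorkingDir(llvm::SmallVectorImpl<char> &Out) {
  Out.resize_for_overwrite(256);
  while (::getcwd(Out.data(), Out.size()) == nullptr) {
    if (errno != ERANGE && errno != ENOMEM) { // a real error, not "too small"
      Out.clear();
      return std::error_code(errno, std::generic_category());
    }
    Out.resize_for_overwrite(Out.size() * 2); // grow and retry
  }
  Out.truncate(std::strlen(Out.data()));
  return std::error_code();
}
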
@@ -870,6 +872,17 @@ void mapped_file_region::unmapImpl() {
::munmap(Mapping, Size);
}
+void mapped_file_region::dontNeedImpl() {
+ assert(Mode == mapped_file_region::readonly);
+#if defined(__MVS__) || defined(_AIX)
+ // If we don't have madvise, or it isn't beneficial, treat this as a no-op.
+ return;
+#else
+ if (Mapping)
+ ::madvise(Mapping, Size, MADV_DONTNEED);
+#endif
+}
+
int mapped_file_region::alignment() {
return Process::getPageSizeEstimate();
}
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index bec4e8dbe06c..f15e301874c4 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -35,7 +35,6 @@
#include "llvm/Support/FileSystem/UniqueID.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/Process.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/YAMLParser.h"
@@ -46,9 +45,7 @@
#include <cstdint>
#include <iterator>
#include <limits>
-#include <map>
#include <memory>
-#include <mutex>
#include <string>
#include <system_error>
#include <utility>
@@ -574,6 +571,11 @@ public:
}
virtual ~InMemoryNode() = default;
+ /// Return the \p Status for this node. \p RequestedName should be the name
+ /// through which the caller referred to this node. It will override
+ /// \p Status::Name in the return value, to mimic the behavior of \p RealFile.
+ virtual Status getStatus(const Twine &RequestedName) const = 0;
+
/// Get the filename of this node (the name without the directory part).
StringRef getFileName() const { return FileName; }
InMemoryNodeKind getKind() const { return Kind; }
@@ -589,10 +591,7 @@ public:
: InMemoryNode(Stat.getName(), IME_File), Stat(std::move(Stat)),
Buffer(std::move(Buffer)) {}
- /// Return the \p Status for this node. \p RequestedName should be the name
- /// through which the caller referred to this node. It will override
- /// \p Status::Name in the return value, to mimic the behavior of \p RealFile.
- Status getStatus(const Twine &RequestedName) const {
+ Status getStatus(const Twine &RequestedName) const override {
return Status::copyWithNewName(Stat, RequestedName);
}
llvm::MemoryBuffer *getBuffer() const { return Buffer.get(); }
@@ -616,6 +615,10 @@ public:
: InMemoryNode(Path, IME_HardLink), ResolvedFile(ResolvedFile) {}
const InMemoryFile &getResolvedFile() const { return ResolvedFile; }
+ Status getStatus(const Twine &RequestedName) const override {
+ return ResolvedFile.getStatus(RequestedName);
+ }
+
std::string toString(unsigned Indent) const override {
return std::string(Indent, ' ') + "HardLink to -> " +
ResolvedFile.toString(0);
@@ -668,7 +671,7 @@ public:
/// Return the \p Status for this node. \p RequestedName should be the name
/// through which the caller referred to this node. It will override
/// \p Status::Name in the return value, to mimic the behavior of \p RealFile.
- Status getStatus(const Twine &RequestedName) const {
+ Status getStatus(const Twine &RequestedName) const override {
return Status::copyWithNewName(Stat, RequestedName);
}
@@ -704,17 +707,6 @@ public:
}
};
-namespace {
-Status getNodeStatus(const InMemoryNode *Node, const Twine &RequestedName) {
- if (auto Dir = dyn_cast<detail::InMemoryDirectory>(Node))
- return Dir->getStatus(RequestedName);
- if (auto File = dyn_cast<detail::InMemoryFile>(Node))
- return File->getStatus(RequestedName);
- if (auto Link = dyn_cast<detail::InMemoryHardLink>(Node))
- return Link->getResolvedFile().getStatus(RequestedName);
- llvm_unreachable("Unknown node type");
-}
-} // namespace
} // namespace detail
// The UniqueID of in-memory files is derived from path and content.
@@ -734,6 +726,16 @@ static sys::fs::UniqueID getDirectoryID(sys::fs::UniqueID Parent,
return getUniqueID(llvm::hash_combine(Parent.getFile(), Name));
}
+Status detail::NewInMemoryNodeInfo::makeStatus() const {
+ UniqueID UID =
+ (Type == sys::fs::file_type::directory_file)
+ ? getDirectoryID(DirUID, Name)
+ : getFileID(DirUID, Name, Buffer ? Buffer->getBuffer() : "");
+
+ return Status(Path, UID, llvm::sys::toTimePoint(ModificationTime), User,
+ Group, Buffer ? Buffer->getBufferSize() : 0, Type, Perms);
+}
+
InMemoryFileSystem::InMemoryFileSystem(bool UseNormalizedPaths)
: Root(new detail::InMemoryDirectory(
Status("", getDirectoryID(llvm::sys::fs::UniqueID(), ""),
@@ -754,7 +756,7 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime,
Optional<uint32_t> Group,
Optional<llvm::sys::fs::file_type> Type,
Optional<llvm::sys::fs::perms> Perms,
- const detail::InMemoryFile *HardLinkTarget) {
+ MakeNodeFn MakeNode) {
SmallString<128> Path;
P.toVector(Path);
@@ -775,7 +777,6 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime,
const auto ResolvedGroup = Group.getValueOr(0);
const auto ResolvedType = Type.getValueOr(sys::fs::file_type::regular_file);
const auto ResolvedPerms = Perms.getValueOr(sys::fs::all_all);
- assert(!(HardLinkTarget && Buffer) && "HardLink cannot have a buffer");
// Any intermediate directories we create should be accessible by
// the owner, even if Perms says otherwise for the final path.
const auto NewDirectoryPerms = ResolvedPerms | sys::fs::owner_all;
@@ -786,27 +787,10 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime,
if (!Node) {
if (I == E) {
// End of the path.
- std::unique_ptr<detail::InMemoryNode> Child;
- if (HardLinkTarget)
- Child.reset(new detail::InMemoryHardLink(P.str(), *HardLinkTarget));
- else {
- // Create a new file or directory.
- Status Stat(
- P.str(),
- (ResolvedType == sys::fs::file_type::directory_file)
- ? getDirectoryID(Dir->getUniqueID(), Name)
- : getFileID(Dir->getUniqueID(), Name, Buffer->getBuffer()),
- llvm::sys::toTimePoint(ModificationTime), ResolvedUser,
- ResolvedGroup, Buffer->getBufferSize(), ResolvedType,
- ResolvedPerms);
- if (ResolvedType == sys::fs::file_type::directory_file) {
- Child.reset(new detail::InMemoryDirectory(std::move(Stat)));
- } else {
- Child.reset(
- new detail::InMemoryFile(std::move(Stat), std::move(Buffer)));
- }
- }
- Dir->addChild(Name, std::move(Child));
+ Dir->addChild(
+ Name, MakeNode({Dir->getUniqueID(), Path, Name, ModificationTime,
+ std::move(Buffer), ResolvedUser, ResolvedGroup,
+ ResolvedType, ResolvedPerms}));
return true;
}
@@ -850,7 +834,15 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime,
Optional<llvm::sys::fs::file_type> Type,
Optional<llvm::sys::fs::perms> Perms) {
return addFile(P, ModificationTime, std::move(Buffer), User, Group, Type,
- Perms, /*HardLinkTarget=*/nullptr);
+ Perms,
+ [](detail::NewInMemoryNodeInfo NNI)
+ -> std::unique_ptr<detail::InMemoryNode> {
+ Status Stat = NNI.makeStatus();
+ if (Stat.getType() == sys::fs::file_type::directory_file)
+ return std::make_unique<detail::InMemoryDirectory>(Stat);
+ return std::make_unique<detail::InMemoryFile>(
+ Stat, std::move(NNI.Buffer));
+ });
}
bool InMemoryFileSystem::addFileNoOwn(const Twine &P, time_t ModificationTime,
@@ -861,7 +853,15 @@ bool InMemoryFileSystem::addFileNoOwn(const Twine &P, time_t ModificationTime,
Optional<llvm::sys::fs::perms> Perms) {
return addFile(P, ModificationTime, llvm::MemoryBuffer::getMemBuffer(Buffer),
std::move(User), std::move(Group), std::move(Type),
- std::move(Perms));
+ std::move(Perms),
+ [](detail::NewInMemoryNodeInfo NNI)
+ -> std::unique_ptr<detail::InMemoryNode> {
+ Status Stat = NNI.makeStatus();
+ if (Stat.getType() == sys::fs::file_type::directory_file)
+ return std::make_unique<detail::InMemoryDirectory>(Stat);
+ return std::make_unique<detail::InMemoryFile>(
+ Stat, std::move(NNI.Buffer));
+ });
}
static ErrorOr<const detail::InMemoryNode *>
@@ -916,14 +916,17 @@ bool InMemoryFileSystem::addHardLink(const Twine &FromPath,
// before. Resolved ToPath must be a File.
if (!ToNode || FromNode || !isa<detail::InMemoryFile>(*ToNode))
return false;
- return this->addFile(FromPath, 0, nullptr, None, None, None, None,
- cast<detail::InMemoryFile>(*ToNode));
+ return addFile(FromPath, 0, nullptr, None, None, None, None,
+ [&](detail::NewInMemoryNodeInfo NNI) {
+ return std::make_unique<detail::InMemoryHardLink>(
+ NNI.Path.str(), *cast<detail::InMemoryFile>(*ToNode));
+ });
}
llvm::ErrorOr<Status> InMemoryFileSystem::status(const Twine &Path) {
auto Node = lookupInMemoryNode(*this, Root.get(), Path);
if (Node)
- return detail::getNodeStatus(*Node, Path);
+ return (*Node)->getStatus(Path);
return Node.getError();
}
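
Two related simplifications land here: getStatus() becomes a virtual on InMemoryNode, so status() no longer needs a per-type switch, and addFile() takes a node-factory callback so each caller (regular file, non-owning buffer, hard link) decides which leaf node to construct. A stand-alone sketch of the factory-callback shape, with hypothetical types rather than the VFS classes:

// Stand-alone sketch of the node-factory callback threaded through addFile();
// Node/FileNode/LinkNode are hypothetical stand-ins.
#include <functional>
#include <memory>
#include <string>

struct Node { virtual ~Node() = default; };
struct FileNode : Node { std::string Contents; };
struct LinkNode : Node { const FileNode *Target = nullptr; };

using MakeNodeFn = std::function<std::unique_ptr<Node>(const std::string &Leaf)>;

// Path traversal is shared; only leaf creation differs per caller.
std::unique_ptr<Node> makeLeaf(const std::string &Leaf, MakeNodeFn MakeNode) {
  return MakeNode(Leaf);
}

// A regular-file caller:
//   makeLeaf("a.txt", [](const std::string &) {
//     return std::make_unique<FileNode>();
//   });
// A hard-link caller capturing its resolved target:
//   makeLeaf("b", [&](const std::string &) {
//     auto L = std::make_unique<LinkNode>();
//     L->Target = &ResolvedFile;
//     return L;
//   });
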
@@ -1649,10 +1652,19 @@ private:
sys::path::Style::windows_backslash)) {
path_style = sys::path::Style::windows_backslash;
} else {
- assert(NameValueNode && "Name presence should be checked earlier");
- error(NameValueNode,
+ // Relative VFS root entries are made absolute against the current working
+ // directory; the path style can then be determined from the result.
+ auto EC = sys::fs::make_absolute(Name);
+ if (EC) {
+ assert(NameValueNode && "Name presence should be checked earlier");
+ error(
+ NameValueNode,
"entry with relative path at the root level is not discoverable");
- return nullptr;
+ return nullptr;
+ }
+ path_style = sys::path::is_absolute(Name, sys::path::Style::posix)
+ ? sys::path::Style::posix
+ : sys::path::Style::windows_backslash;
}
}
diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc
index b15e71a9ce2a..5f1a364ea1a8 100644
--- a/llvm/lib/Support/Windows/Path.inc
+++ b/llvm/lib/Support/Windows/Path.inc
@@ -132,7 +132,8 @@ const file_t kInvalidFile = INVALID_HANDLE_VALUE;
std::string getMainExecutable(const char *argv0, void *MainExecAddr) {
SmallVector<wchar_t, MAX_PATH> PathName;
- DWORD Size = ::GetModuleFileNameW(NULL, PathName.data(), PathName.capacity());
+ PathName.resize_for_overwrite(PathName.capacity());
+ DWORD Size = ::GetModuleFileNameW(NULL, PathName.data(), PathName.size());
// A zero return value indicates a failure other than insufficient space.
if (Size == 0)
@@ -145,7 +146,7 @@ std::string getMainExecutable(const char *argv0, void *MainExecAddr) {
// On success, GetModuleFileNameW returns the number of characters written to
// the buffer not including the NULL terminator.
- PathName.set_size(Size);
+ PathName.truncate(Size);
// Convert the result from UTF-16 to UTF-8.
SmallVector<char, MAX_PATH> PathNameUTF8;
@@ -201,8 +202,8 @@ std::error_code current_path(SmallVectorImpl<char> &result) {
DWORD len = MAX_PATH;
do {
- cur_path.reserve(len);
- len = ::GetCurrentDirectoryW(cur_path.capacity(), cur_path.data());
+ cur_path.resize_for_overwrite(len);
+ len = ::GetCurrentDirectoryW(cur_path.size(), cur_path.data());
// A zero return value indicates a failure other than insufficient space.
if (len == 0)
@@ -210,11 +211,11 @@ std::error_code current_path(SmallVectorImpl<char> &result) {
// If there's insufficient space, the len returned is larger than the len
// given.
- } while (len > cur_path.capacity());
+ } while (len > cur_path.size());
// On success, GetCurrentDirectoryW returns the number of characters not
// including the null-terminator.
- cur_path.set_size(len);
+ cur_path.truncate(len);
if (std::error_code EC =
UTF16ToUTF8(cur_path.begin(), cur_path.size(), result))
@@ -328,7 +329,7 @@ static std::error_code is_local_internal(SmallVectorImpl<wchar_t> &Path,
// the null terminator, it will leave the output unterminated. Push a null
// terminator onto the end to ensure that this never happens.
VolumePath.push_back(L'\0');
- VolumePath.set_size(wcslen(VolumePath.data()));
+ VolumePath.truncate(wcslen(VolumePath.data()));
const wchar_t *P = VolumePath.data();
UINT Type = ::GetDriveTypeW(P);
@@ -364,18 +365,19 @@ std::error_code is_local(const Twine &path, bool &result) {
static std::error_code realPathFromHandle(HANDLE H,
SmallVectorImpl<wchar_t> &Buffer) {
+ Buffer.resize_for_overwrite(Buffer.capacity());
DWORD CountChars = ::GetFinalPathNameByHandleW(
H, Buffer.begin(), Buffer.capacity(), FILE_NAME_NORMALIZED);
if (CountChars && CountChars >= Buffer.capacity()) {
// The buffer wasn't big enough, try again. In this case the return value
// *does* indicate the size of the null terminator.
- Buffer.reserve(CountChars);
+ Buffer.resize_for_overwrite(CountChars);
CountChars = ::GetFinalPathNameByHandleW(
- H, Buffer.begin(), Buffer.capacity(), FILE_NAME_NORMALIZED);
+ H, Buffer.begin(), Buffer.size(), FILE_NAME_NORMALIZED);
}
+ Buffer.truncate(CountChars);
if (CountChars == 0)
return mapWindowsError(GetLastError());
- Buffer.set_size(CountChars);
return std::error_code();
}
@@ -959,6 +961,8 @@ void mapped_file_region::unmapImpl() {
}
}
+void mapped_file_region::dontNeedImpl() {}
+
int mapped_file_region::alignment() {
SYSTEM_INFO SysInfo;
::GetSystemInfo(&SysInfo);
@@ -1448,14 +1452,14 @@ static bool getTempDirEnvVar(const wchar_t *Var, SmallVectorImpl<char> &Res) {
SmallVector<wchar_t, 1024> Buf;
size_t Size = 1024;
do {
- Buf.reserve(Size);
- Size = GetEnvironmentVariableW(Var, Buf.data(), Buf.capacity());
+ Buf.resize_for_overwrite(Size);
+ Size = GetEnvironmentVariableW(Var, Buf.data(), Buf.size());
if (Size == 0)
return false;
// Try again with larger buffer.
- } while (Size > Buf.capacity());
- Buf.set_size(Size);
+ } while (Size > Buf.size());
+ Buf.truncate(Size);
return !windows::UTF16ToUTF8(Buf.data(), Size, Res);
}
@@ -1504,7 +1508,7 @@ std::error_code CodePageToUTF16(unsigned codepage,
}
utf16.reserve(len + 1);
- utf16.set_size(len);
+ utf16.resize_for_overwrite(len);
len = ::MultiByteToWideChar(codepage, MB_ERR_INVALID_CHARS, original.begin(),
original.size(), utf16.begin(), utf16.size());
@@ -1544,8 +1548,8 @@ std::error_code UTF16ToCodePage(unsigned codepage, const wchar_t *utf16,
return mapWindowsError(::GetLastError());
}
- converted.reserve(len);
- converted.set_size(len);
+ converted.reserve(len + 1);
+ converted.resize_for_overwrite(len);
// Now do the actual conversion.
len = ::WideCharToMultiByte(codepage, 0, utf16, utf16_len, converted.data(),
diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc
index 6732063b562e..dfaab1613de1 100644
--- a/llvm/lib/Support/Windows/Process.inc
+++ b/llvm/lib/Support/Windows/Process.inc
@@ -129,16 +129,16 @@ Optional<std::string> Process::GetEnv(StringRef Name) {
SmallVector<wchar_t, MAX_PATH> Buf;
size_t Size = MAX_PATH;
do {
- Buf.reserve(Size);
+ Buf.resize_for_overwrite(Size);
SetLastError(NO_ERROR);
Size =
- GetEnvironmentVariableW(NameUTF16.data(), Buf.data(), Buf.capacity());
+ GetEnvironmentVariableW(NameUTF16.data(), Buf.data(), Buf.size());
if (Size == 0 && GetLastError() == ERROR_ENVVAR_NOT_FOUND)
return None;
// Try again with larger buffer.
- } while (Size > Buf.capacity());
- Buf.set_size(Size);
+ } while (Size > Buf.size());
+ Buf.truncate(Size);
// Convert the result from UTF-16 to UTF-8.
SmallVector<char, MAX_PATH> Res;
diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc
index a9cf2db7ec72..ee633411584f 100644
--- a/llvm/lib/Support/Windows/Program.inc
+++ b/llvm/lib/Support/Windows/Program.inc
@@ -72,7 +72,7 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name,
SmallVector<wchar_t, MAX_PATH> U16Result;
DWORD Len = MAX_PATH;
do {
- U16Result.reserve(Len);
+ U16Result.resize_for_overwrite(Len);
// Let's attach the extension manually. That is needed for files with a dot
// in the name, like aaa.bbb: SearchPathW will not add the extension from its
// argument to such files because it thinks they already have one.
@@ -82,13 +82,13 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name,
return EC;
Len = ::SearchPathW(Path, c_str(U16NameExt), nullptr,
- U16Result.capacity(), U16Result.data(), nullptr);
- } while (Len > U16Result.capacity());
+ U16Result.size(), U16Result.data(), nullptr);
+ } while (Len > U16Result.size());
if (Len == 0)
continue;
- U16Result.set_size(Len);
+ U16Result.truncate(Len);
if (std::error_code EC =
windows::UTF16ToUTF8(U16Result.data(), U16Result.size(), U8Result))
diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp
index ab49ac548f89..10f9692d217e 100644
--- a/llvm/lib/Support/X86TargetParser.cpp
+++ b/llvm/lib/Support/X86TargetParser.cpp
@@ -12,7 +12,6 @@
#include "llvm/Support/X86TargetParser.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Triple.h"
#include <numeric>
using namespace llvm;
diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp
index 2adf37a511d1..200261d3ed5c 100644
--- a/llvm/lib/Support/YAMLParser.cpp
+++ b/llvm/lib/Support/YAMLParser.cpp
@@ -27,7 +27,6 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/Unicode.h"
#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
diff --git a/llvm/lib/Support/YAMLTraits.cpp b/llvm/lib/Support/YAMLTraits.cpp
index aa6163a76161..8cdd03149bcf 100644
--- a/llvm/lib/Support/YAMLTraits.cpp
+++ b/llvm/lib/Support/YAMLTraits.cpp
@@ -18,13 +18,12 @@
#include "llvm/Support/Format.h"
#include "llvm/Support/LineIterator.h"
#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Unicode.h"
+#include "llvm/Support/VersionTuple.h"
#include "llvm/Support/YAMLParser.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
-#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>
@@ -300,7 +299,7 @@ void Input::endEnumScalar() {
bool Input::beginBitSetScalar(bool &DoClear) {
BitValuesUsed.clear();
if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) {
- BitValuesUsed.insert(BitValuesUsed.begin(), SQ->Entries.size(), false);
+ BitValuesUsed.resize(SQ->Entries.size());
} else {
setError(CurrentNode, "expected sequence of bit values");
}
@@ -527,8 +526,9 @@ std::vector<StringRef> Output::keys() {
}
bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault,
- bool &UseDefault, void *&) {
+ bool &UseDefault, void *&SaveInfo) {
UseDefault = false;
+ SaveInfo = nullptr;
if (Required || !SameAsDefault || WriteDefaultValues) {
auto State = StateStack.back();
if (State == inFlowMapFirstKey || State == inFlowMapOtherKey) {
@@ -599,7 +599,8 @@ void Output::endSequence() {
StateStack.pop_back();
}
-bool Output::preflightElement(unsigned, void *&) {
+bool Output::preflightElement(unsigned, void *&SaveInfo) {
+ SaveInfo = nullptr;
return true;
}
@@ -627,7 +628,7 @@ void Output::endFlowSequence() {
outputUpToEndOfLine(" ]");
}
-bool Output::preflightFlowElement(unsigned, void *&) {
+bool Output::preflightFlowElement(unsigned, void *&SaveInfo) {
if (NeedFlowSequenceComma)
output(", ");
if (WrapColumn && Column > WrapColumn) {
@@ -637,6 +638,7 @@ bool Output::preflightFlowElement(unsigned, void *&) {
Column = ColumnAtFlowStart;
output(" ");
}
+ SaveInfo = nullptr;
return true;
}
diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp
index 4590a3d19b0d..e4b747b68bea 100644
--- a/llvm/lib/Support/raw_ostream.cpp
+++ b/llvm/lib/Support/raw_ostream.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Config/config.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Duration.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Format.h"
@@ -24,10 +25,8 @@
#include "llvm/Support/Process.h"
#include "llvm/Support/Program.h"
#include <algorithm>
-#include <cctype>
#include <cerrno>
#include <cstdio>
-#include <iterator>
#include <sys/stat.h>
// <fcntl.h> may provide O_BINARY.
@@ -643,13 +642,14 @@ raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered,
// Get the starting position.
off_t loc = ::lseek(FD, 0, SEEK_CUR);
-#ifdef _WIN32
- // MSVCRT's _lseek(SEEK_CUR) doesn't return -1 for pipes.
sys::fs::file_status Status;
std::error_code EC = status(FD, Status);
- SupportsSeeking = !EC && Status.type() == sys::fs::file_type::regular_file;
+ IsRegularFile = Status.type() == sys::fs::file_type::regular_file;
+#ifdef _WIN32
+ // MSVCRT's _lseek(SEEK_CUR) doesn't return -1 for pipes.
+ SupportsSeeking = !EC && IsRegularFile;
#else
- SupportsSeeking = loc != (off_t)-1;
+ SupportsSeeking = !EC && loc != (off_t)-1;
#endif
if (!SupportsSeeking)
pos = 0;
@@ -869,8 +869,8 @@ Expected<sys::fs::FileLocker> raw_fd_ostream::lock() {
}
Expected<sys::fs::FileLocker>
-raw_fd_ostream::tryLockFor(std::chrono::milliseconds Timeout) {
- std::error_code EC = sys::fs::tryLockFile(FD, Timeout);
+raw_fd_ostream::tryLockFor(const Duration &Timeout) {
+ std::error_code EC = sys::fs::tryLockFile(FD, Timeout.getDuration());
if (!EC)
return sys::fs::FileLocker(FD);
return errorCodeToError(EC);
@@ -914,8 +914,7 @@ raw_fd_stream::raw_fd_stream(StringRef Filename, std::error_code &EC)
if (EC)
return;
- // Do not support non-seekable files.
- if (!supportsSeeking())
+ if (!isRegularFile())
EC = std::make_error_code(std::errc::invalid_argument);
}
@@ -937,10 +936,6 @@ bool raw_fd_stream::classof(const raw_ostream *OS) {
// raw_string_ostream
//===----------------------------------------------------------------------===//
-raw_string_ostream::~raw_string_ostream() {
- flush();
-}
-
void raw_string_ostream::write_impl(const char *Ptr, size_t Size) {
OS.append(Ptr, Size);
}
diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp
index 6ccca4d69f40..3709a375ed1b 100644
--- a/llvm/lib/TableGen/TGParser.cpp
+++ b/llvm/lib/TableGen/TGParser.cpp
@@ -3203,7 +3203,8 @@ bool TGParser::ParseIf(MultiClass *CurMultiClass) {
// iteration variable being assigned.
ListInit *EmptyList = ListInit::get({}, BitRecTy::get());
- ListInit *SingletonList = ListInit::get({BitInit::get(1)}, BitRecTy::get());
+ ListInit *SingletonList =
+ ListInit::get({BitInit::get(true)}, BitRecTy::get());
RecTy *BitListTy = ListRecTy::get(BitRecTy::get());
// The foreach containing the then-clause selects SingletonList if
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index b0dd30c13137..4d1464901777 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -26,7 +26,6 @@ class AArch64Subtarget;
class AArch64TargetMachine;
class FunctionPass;
class InstructionSelector;
-class MachineFunctionPass;
FunctionPass *createAArch64DeadRegisterDefinitions();
FunctionPass *createAArch64RedundantCopyEliminationPass();
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index cb17fd94c335..b87468d5c8de 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -416,6 +416,12 @@ def FeatureHCX : SubtargetFeature<
def FeatureLS64 : SubtargetFeature<"ls64", "HasLS64",
"true", "Enable Armv8.7-A LD64B/ST64B Accelerator Extension">;
+def FeatureHBC : SubtargetFeature<"hbc", "HasHBC",
+ "true", "Enable Armv8.8-A Hinted Conditional Branches Extension">;
+
+def FeatureMOPS : SubtargetFeature<"mops", "HasMOPS",
+ "true", "Enable Armv8.8-A memcpy and memset acceleration instructions">;
+
def FeatureBRBE : SubtargetFeature<"brbe", "HasBRBE",
"true", "Enable Branch Record Buffer Extension">;
@@ -497,6 +503,10 @@ def HasV8_7aOps : SubtargetFeature<
"v8.7a", "HasV8_7aOps", "true", "Support ARM v8.7a instructions",
[HasV8_6aOps, FeatureXS, FeatureWFxT, FeatureHCX]>;
+def HasV8_8aOps : SubtargetFeature<
+ "v8.8a", "HasV8_8aOps", "true", "Support ARM v8.8a instructions",
+ [HasV8_7aOps, FeatureHBC, FeatureMOPS]>;
+
def HasV9_0aOps : SubtargetFeature<
"v9a", "HasV9_0aOps", "true", "Support ARM v9a instructions",
[HasV8_5aOps, FeatureSVE2]>;
@@ -509,21 +519,22 @@ def HasV9_2aOps : SubtargetFeature<
"v9.2a", "HasV9_2aOps", "true", "Support ARM v9.2a instructions",
[HasV8_7aOps, HasV9_1aOps]>;
+def HasV9_3aOps : SubtargetFeature<
+ "v9.3a", "HasV9_3aOps", "true", "Support ARM v9.3a instructions",
+ [HasV8_8aOps, HasV9_2aOps]>;
+
def HasV8_0rOps : SubtargetFeature<
"v8r", "HasV8_0rOps", "true", "Support ARM v8r instructions",
[//v8.1
FeatureCRC, FeaturePAN, FeatureRDM, FeatureLSE, FeatureCONTEXTIDREL2,
//v8.2
- FeaturePerfMon, FeatureRAS, FeaturePsUAO, FeatureSM4,
- FeatureSHA3, FeatureCCPP, FeatureFullFP16, FeaturePAN_RWV,
+ FeatureRAS, FeaturePsUAO, FeatureCCPP, FeaturePAN_RWV,
//v8.3
FeatureComplxNum, FeatureCCIDX, FeatureJS,
FeaturePAuth, FeatureRCPC,
//v8.4
- FeatureDotProd, FeatureFP16FML, FeatureTRACEV8_4,
- FeatureTLB_RMI, FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO,
- //v8.5
- FeatureSSBS, FeaturePredRes, FeatureSB, FeatureSpecRestrict]>;
+ FeatureDotProd, FeatureTRACEV8_4, FeatureTLB_RMI,
+ FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO]>;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -955,7 +966,9 @@ def ProcessorFeatures {
list<SubtargetFeature> A710 = [HasV9_0aOps, FeatureNEON, FeaturePerfMon,
FeatureETE, FeatureMTE, FeatureFP16FML,
FeatureSVE2BitPerm, FeatureBF16, FeatureMatMulInt8];
- list<SubtargetFeature> R82 = [HasV8_0rOps];
+ list<SubtargetFeature> R82 = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16,
+ FeatureFP16FML, FeatureSSBS, FeaturePredRes,
+ FeatureSB, FeatureSpecRestrict];
list<SubtargetFeature> X1 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8,
FeatureNEON, FeatureRCPC, FeaturePerfMon,
FeatureSPE, FeatureFullFP16, FeatureDotProd];
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index c90601443934..f26151536a58 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -468,7 +468,7 @@ def CSR_Darwin_AArch64_TLS
// CSR_Darwin_AArch64_CXX_TLS should be a subset of CSR_Darwin_AArch64_TLS.
def CSR_Darwin_AArch64_CXX_TLS
: CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS,
- (sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
+ (sub (sequence "X%u", 1, 28), X9, X15, X16, X17, X18, X19),
(sequence "D%u", 0, 31))>;
// CSRs that are handled by prologue, epilogue.
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index ee6e670fe3cd..109b739528bf 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -443,7 +443,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask;
bool FalseZero = FalseLanes == AArch64::FalseLanesZero;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
if (DType == AArch64::DestructiveBinary)
@@ -989,7 +989,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
.addReg(DstReg, RegState::Kill)
.addReg(DstReg, DstFlags | RegState::Implicit);
} else {
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
MIB2 = BuildMI(MBB, MBBI, DL, TII->get(AArch64::LDRXui))
.add(MI.getOperand(0))
.addUse(DstReg, RegState::Kill);
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 3dc694df509d..c67fa62c7a92 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -355,7 +355,7 @@ unsigned AArch64FastISel::fastMaterializeAlloca(const AllocaInst *AI) {
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end()) {
- unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
+ Register ResultReg = createResultReg(&AArch64::GPR64spRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
ResultReg)
.addFrameIndex(SI->second)
@@ -378,7 +378,7 @@ unsigned AArch64FastISel::materializeInt(const ConstantInt *CI, MVT VT) {
const TargetRegisterClass *RC = (VT == MVT::i64) ? &AArch64::GPR64RegClass
: &AArch64::GPR32RegClass;
unsigned ZeroReg = (VT == MVT::i64) ? AArch64::XZR : AArch64::WZR;
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
ResultReg).addReg(ZeroReg, getKillRegState(true));
return ResultReg;
@@ -410,11 +410,11 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
const TargetRegisterClass *RC = Is64Bit ?
&AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
- unsigned TmpReg = createResultReg(RC);
+ Register TmpReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc1), TmpReg)
.addImm(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(TmpReg, getKillRegState(true));
@@ -427,12 +427,12 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
Align Alignment = DL.getPrefTypeAlign(CFP->getType());
unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Alignment);
- unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+ Register ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE);
unsigned Opc = Is64Bit ? AArch64::LDRDui : AArch64::LDRSui;
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(ADRPReg)
.addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
@@ -455,7 +455,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
if (!DestEVT.isSimple())
return 0;
- unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+ Register ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
unsigned ResultReg;
if (OpFlags & AArch64II::MO_GOT) {
@@ -482,7 +482,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
// LDRWui produces a 32-bit register, but pointers in-register are 64-bits
// so we must extend the result on ILP32.
- unsigned Result64 = createResultReg(&AArch64::GPR64RegClass);
+ Register Result64 = createResultReg(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::SUBREG_TO_REG))
.addDef(Result64)
@@ -751,7 +751,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
if (const auto *C = dyn_cast<ConstantInt>(RHS))
if (C->getValue() == 0xffffffff) {
Addr.setExtendType(AArch64_AM::UXTW);
- unsigned Reg = getRegForValue(LHS);
+ Register Reg = getRegForValue(LHS);
if (!Reg)
return false;
Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, AArch64::sub_32);
@@ -760,7 +760,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
}
}
- unsigned Reg = getRegForValue(Src);
+ Register Reg = getRegForValue(Src);
if (!Reg)
return false;
Addr.setOffsetReg(Reg);
@@ -821,7 +821,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
}
}
- unsigned Reg = getRegForValue(Src);
+ Register Reg = getRegForValue(Src);
if (!Reg)
return false;
Addr.setOffsetReg(Reg);
@@ -847,7 +847,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
Addr.setExtendType(AArch64_AM::LSL);
Addr.setExtendType(AArch64_AM::UXTW);
- unsigned Reg = getRegForValue(LHS);
+ Register Reg = getRegForValue(LHS);
if (!Reg)
return false;
Reg = fastEmitInst_extractsubreg(MVT::i32, Reg, AArch64::sub_32);
@@ -879,7 +879,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
break;
Addr.setShift(0);
- unsigned Reg = getRegForValue(Src);
+ Register Reg = getRegForValue(Src);
if (!Reg)
return false;
Addr.setOffsetReg(Reg);
@@ -888,7 +888,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
} // end switch
if (Addr.isRegBase() && !Addr.getReg()) {
- unsigned Reg = getRegForValue(Obj);
+ Register Reg = getRegForValue(Obj);
if (!Reg)
return false;
Addr.setReg(Reg);
@@ -896,7 +896,7 @@ bool AArch64FastISel::computeAddress(const Value *Obj, Address &Addr, Type *Ty)
}
if (!Addr.getOffsetReg()) {
- unsigned Reg = getRegForValue(Obj);
+ Register Reg = getRegForValue(Obj);
if (!Reg)
return false;
Addr.setOffsetReg(Reg);
@@ -1034,7 +1034,7 @@ bool AArch64FastISel::simplifyAddress(Address &Addr, MVT VT) {
// continue. This should almost never happen.
if ((ImmediateOffsetNeedsLowering || Addr.getOffsetReg()) && Addr.isFIBase())
{
- unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
+ Register ResultReg = createResultReg(&AArch64::GPR64spRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
ResultReg)
.addFrameIndex(Addr.getFI())
@@ -1178,7 +1178,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
SI->getOpcode() == Instruction::AShr )
std::swap(LHS, RHS);
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
if (!LHSReg)
return 0;
@@ -1207,13 +1207,13 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
if (const auto *SI = dyn_cast<BinaryOperator>(RHS))
if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1)))
if ((SI->getOpcode() == Instruction::Shl) && (C->getZExtValue() < 4)) {
- unsigned RHSReg = getRegForValue(SI->getOperand(0));
+ Register RHSReg = getRegForValue(SI->getOperand(0));
if (!RHSReg)
return 0;
return emitAddSub_rx(UseAdd, RetVT, LHSReg, RHSReg, ExtendType,
C->getZExtValue(), SetFlags, WantResult);
}
- unsigned RHSReg = getRegForValue(RHS);
+ Register RHSReg = getRegForValue(RHS);
if (!RHSReg)
return 0;
return emitAddSub_rx(UseAdd, RetVT, LHSReg, RHSReg, ExtendType, 0,
@@ -1232,7 +1232,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt.");
uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2();
- unsigned RHSReg = getRegForValue(MulLHS);
+ Register RHSReg = getRegForValue(MulLHS);
if (!RHSReg)
return 0;
ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, RHSReg, AArch64_AM::LSL,
@@ -1255,7 +1255,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
}
uint64_t ShiftVal = C->getZExtValue();
if (ShiftType != AArch64_AM::InvalidShiftExtend) {
- unsigned RHSReg = getRegForValue(SI->getOperand(0));
+ Register RHSReg = getRegForValue(SI->getOperand(0));
if (!RHSReg)
return 0;
ResultReg = emitAddSub_rs(UseAdd, RetVT, LHSReg, RHSReg, ShiftType,
@@ -1267,7 +1267,7 @@ unsigned AArch64FastISel::emitAddSub(bool UseAdd, MVT RetVT, const Value *LHS,
}
}
- unsigned RHSReg = getRegForValue(RHS);
+ Register RHSReg = getRegForValue(RHS);
if (!RHSReg)
return 0;
@@ -1489,7 +1489,7 @@ bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) {
if (CFP->isZero() && !CFP->isNegative())
UseImm = true;
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
if (!LHSReg)
return false;
@@ -1500,7 +1500,7 @@ bool AArch64FastISel::emitFCmp(MVT RetVT, const Value *LHS, const Value *RHS) {
return true;
}
- unsigned RHSReg = getRegForValue(RHS);
+ Register RHSReg = getRegForValue(RHS);
if (!RHSReg)
return false;
@@ -1577,7 +1577,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
if (isa<ConstantInt>(SI->getOperand(1)))
std::swap(LHS, RHS);
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
if (!LHSReg)
return 0;
@@ -1602,7 +1602,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
assert(isa<ConstantInt>(MulRHS) && "Expected a ConstantInt.");
uint64_t ShiftVal = cast<ConstantInt>(MulRHS)->getValue().logBase2();
- unsigned RHSReg = getRegForValue(MulLHS);
+ Register RHSReg = getRegForValue(MulLHS);
if (!RHSReg)
return 0;
ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, RHSReg, ShiftVal);
@@ -1616,7 +1616,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
if (const auto *SI = dyn_cast<ShlOperator>(RHS))
if (const auto *C = dyn_cast<ConstantInt>(SI->getOperand(1))) {
uint64_t ShiftVal = C->getZExtValue();
- unsigned RHSReg = getRegForValue(SI->getOperand(0));
+ Register RHSReg = getRegForValue(SI->getOperand(0));
if (!RHSReg)
return 0;
ResultReg = emitLogicalOp_rs(ISDOpc, RetVT, LHSReg, RHSReg, ShiftVal);
@@ -1625,7 +1625,7 @@ unsigned AArch64FastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
}
}
- unsigned RHSReg = getRegForValue(RHS);
+ Register RHSReg = getRegForValue(RHS);
if (!RHSReg)
return 0;
@@ -1673,7 +1673,7 @@ unsigned AArch64FastISel::emitLogicalOp_ri(unsigned ISDOpc, MVT RetVT,
if (!AArch64_AM::isLogicalImmediate(Imm, RegSize))
return 0;
- unsigned ResultReg =
+ Register ResultReg =
fastEmitInst_ri(Opc, RC, LHSReg,
AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
if (RetVT >= MVT::i8 && RetVT <= MVT::i16 && ISDOpc != ISD::AND) {
@@ -1715,7 +1715,7 @@ unsigned AArch64FastISel::emitLogicalOp_rs(unsigned ISDOpc, MVT RetVT,
RC = &AArch64::GPR64RegClass;
break;
}
- unsigned ResultReg =
+ Register ResultReg =
fastEmitInst_rri(Opc, RC, LHSReg, RHSReg,
AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftImm));
if (RetVT >= MVT::i8 && RetVT <= MVT::i16) {
@@ -1841,7 +1841,7 @@ unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr,
}
// Create the base instruction, then add the operands.
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg);
addLoadStoreOperands(Addr, MIB, MachineMemOperand::MOLoad, ScaleFactor, MMO);
@@ -1856,7 +1856,7 @@ unsigned AArch64FastISel::emitLoad(MVT VT, MVT RetVT, Address Addr,
// For zero-extending loads to 64bit we emit a 32bit load and then convert
// the 32bit reg to a 64bit reg.
if (WantZExt && RetVT == MVT::i64 && VT <= MVT::i32) {
- unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+ Register Reg64 = createResultReg(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), Reg64)
.addImm(0)
@@ -1991,7 +1991,7 @@ bool AArch64FastISel::selectLoad(const Instruction *I) {
// The integer extend hasn't been emitted yet. FastISel or SelectionDAG
// could select it. Emit a copy to subreg if necessary. FastISel will remove
// it when it selects the integer extend.
- unsigned Reg = lookUpRegForValue(IntExtVal);
+ Register Reg = lookUpRegForValue(IntExtVal);
auto *MI = MRI.getUniqueVRegDef(Reg);
if (!MI) {
if (RetVT == MVT::i64 && VT <= MVT::i32) {
@@ -2174,7 +2174,7 @@ bool AArch64FastISel::selectStore(const Instruction *I) {
// The non-atomic instructions are sufficient for relaxed stores.
if (isReleaseOrStronger(Ord)) {
// The STLR addressing mode only supports a base reg; pass that directly.
- unsigned AddrReg = getRegForValue(PtrV);
+ Register AddrReg = getRegForValue(PtrV);
return emitStoreRelease(VT, SrcReg, AddrReg,
createMachineMemOperandFor(I));
}
@@ -2339,7 +2339,7 @@ bool AArch64FastISel::emitCompareAndBranch(const BranchInst *BI) {
unsigned Opc = OpcTable[IsBitTest][IsCmpNE][Is64Bit];
const MCInstrDesc &II = TII.get(Opc);
- unsigned SrcReg = getRegForValue(LHS);
+ Register SrcReg = getRegForValue(LHS);
if (!SrcReg)
return false;
@@ -2454,7 +2454,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
if (foldXALUIntrinsic(CC, I, BI->getCondition())) {
// Fake request the condition, otherwise the intrinsic might be completely
// optimized away.
- unsigned CondReg = getRegForValue(BI->getCondition());
+ Register CondReg = getRegForValue(BI->getCondition());
if (!CondReg)
return false;
@@ -2468,7 +2468,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
}
}
- unsigned CondReg = getRegForValue(BI->getCondition());
+ Register CondReg = getRegForValue(BI->getCondition());
if (CondReg == 0)
return false;
@@ -2480,7 +2480,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
}
const MCInstrDesc &II = TII.get(Opcode);
- unsigned ConstrainedCondReg
+ Register ConstrainedCondReg
= constrainOperandRegClass(II, CondReg, II.getNumDefs());
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addReg(ConstrainedCondReg)
@@ -2493,7 +2493,7 @@ bool AArch64FastISel::selectBranch(const Instruction *I) {
bool AArch64FastISel::selectIndirectBr(const Instruction *I) {
const IndirectBrInst *BI = cast<IndirectBrInst>(I);
- unsigned AddrReg = getRegForValue(BI->getOperand(0));
+ Register AddrReg = getRegForValue(BI->getOperand(0));
if (AddrReg == 0)
return false;
@@ -2563,7 +2563,7 @@ bool AArch64FastISel::selectCmp(const Instruction *I) {
}
if (CondCodes) {
- unsigned TmpReg1 = createResultReg(&AArch64::GPR32RegClass);
+ Register TmpReg1 = createResultReg(&AArch64::GPR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
TmpReg1)
.addReg(AArch64::WZR, getKillRegState(true))
@@ -2630,18 +2630,18 @@ bool AArch64FastISel::optimizeSelect(const SelectInst *SI) {
if (!Opc)
return false;
- unsigned Src1Reg = getRegForValue(Src1Val);
+ Register Src1Reg = getRegForValue(Src1Val);
if (!Src1Reg)
return false;
- unsigned Src2Reg = getRegForValue(Src2Val);
+ Register Src2Reg = getRegForValue(Src2Val);
if (!Src2Reg)
return false;
if (NeedExtraOp)
Src1Reg = emitLogicalOp_ri(ISD::XOR, MVT::i32, Src1Reg, 1);
- unsigned ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32RegClass, Src1Reg,
+ Register ResultReg = fastEmitInst_rr(Opc, &AArch64::GPR32RegClass, Src1Reg,
Src2Reg);
updateValueMap(SI, ResultReg);
return true;
@@ -2690,7 +2690,7 @@ bool AArch64FastISel::selectSelect(const Instruction *I) {
// Try to pickup the flags, so we don't have to emit another compare.
if (foldXALUIntrinsic(CC, I, Cond)) {
// Fake request the condition to force emission of the XALU intrinsic.
- unsigned CondReg = getRegForValue(Cond);
+ Register CondReg = getRegForValue(Cond);
if (!CondReg)
return false;
} else if (isa<CmpInst>(Cond) && cast<CmpInst>(Cond)->hasOneUse() &&
@@ -2711,7 +2711,7 @@ bool AArch64FastISel::selectSelect(const Instruction *I) {
}
if (FoldSelect) {
- unsigned SrcReg = getRegForValue(FoldSelect);
+ Register SrcReg = getRegForValue(FoldSelect);
if (!SrcReg)
return false;
@@ -2739,7 +2739,7 @@ bool AArch64FastISel::selectSelect(const Instruction *I) {
}
assert((CC != AArch64CC::AL) && "Unexpected condition code.");
} else {
- unsigned CondReg = getRegForValue(Cond);
+ Register CondReg = getRegForValue(Cond);
if (!CondReg)
return false;
@@ -2753,8 +2753,8 @@ bool AArch64FastISel::selectSelect(const Instruction *I) {
.addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
}
- unsigned Src1Reg = getRegForValue(SI->getTrueValue());
- unsigned Src2Reg = getRegForValue(SI->getFalseValue());
+ Register Src1Reg = getRegForValue(SI->getTrueValue());
+ Register Src2Reg = getRegForValue(SI->getFalseValue());
if (!Src1Reg || !Src2Reg)
return false;
@@ -2762,7 +2762,7 @@ bool AArch64FastISel::selectSelect(const Instruction *I) {
if (ExtraCC != AArch64CC::AL)
Src2Reg = fastEmitInst_rri(Opc, RC, Src1Reg, Src2Reg, ExtraCC);
- unsigned ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src2Reg, CC);
+ Register ResultReg = fastEmitInst_rri(Opc, RC, Src1Reg, Src2Reg, CC);
updateValueMap(I, ResultReg);
return true;
}
@@ -2772,11 +2772,11 @@ bool AArch64FastISel::selectFPExt(const Instruction *I) {
if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy())
return false;
- unsigned Op = getRegForValue(V);
+ Register Op = getRegForValue(V);
if (Op == 0)
return false;
- unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass);
+ Register ResultReg = createResultReg(&AArch64::FPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr),
ResultReg).addReg(Op);
updateValueMap(I, ResultReg);
@@ -2788,11 +2788,11 @@ bool AArch64FastISel::selectFPTrunc(const Instruction *I) {
if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy())
return false;
- unsigned Op = getRegForValue(V);
+ Register Op = getRegForValue(V);
if (Op == 0)
return false;
- unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass);
+ Register ResultReg = createResultReg(&AArch64::FPR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr),
ResultReg).addReg(Op);
updateValueMap(I, ResultReg);
@@ -2805,7 +2805,7 @@ bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) {
if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
return false;
- unsigned SrcReg = getRegForValue(I->getOperand(0));
+ Register SrcReg = getRegForValue(I->getOperand(0));
if (SrcReg == 0)
return false;
@@ -2825,7 +2825,7 @@ bool AArch64FastISel::selectFPToInt(const Instruction *I, bool Signed) {
else
Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUXSr;
}
- unsigned ResultReg = createResultReg(
+ Register ResultReg = createResultReg(
DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(SrcReg);
@@ -2844,7 +2844,7 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
assert((DestVT == MVT::f32 || DestVT == MVT::f64) &&
"Unexpected value type.");
- unsigned SrcReg = getRegForValue(I->getOperand(0));
+ Register SrcReg = getRegForValue(I->getOperand(0));
if (!SrcReg)
return false;
@@ -2871,7 +2871,7 @@ bool AArch64FastISel::selectIntToFP(const Instruction *I, bool Signed) {
Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri;
}
- unsigned ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg);
+ Register ResultReg = fastEmitInst_r(Opc, TLI.getRegClassFor(DestVT), SrcReg);
updateValueMap(I, ResultReg);
return true;
}
@@ -2975,11 +2975,11 @@ bool AArch64FastISel::fastLowerArguments() {
} else
llvm_unreachable("Unexpected value type.");
- unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+ Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
// Without this, EmitLiveInCopies may eliminate the livein if its only
// use is a bitcast (which isn't turned into an instruction).
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(DstReg, getKillRegState(true));
@@ -3009,7 +3009,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
const Value *ArgVal = CLI.OutVals[VA.getValNo()];
MVT ArgVT = OutVTs[VA.getValNo()];
- unsigned ArgReg = getRegForValue(ArgVal);
+ Register ArgReg = getRegForValue(ArgVal);
if (!ArgReg)
return false;
@@ -3104,7 +3104,7 @@ bool AArch64FastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
if (CopyVT.isVector() && !Subtarget->isLittleEndian())
return false;
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(RVLocs[0].getLocReg());
@@ -3209,14 +3209,14 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
else if (Addr.getGlobalValue())
MIB.addGlobalAddress(Addr.getGlobalValue(), 0, 0);
else if (Addr.getReg()) {
- unsigned Reg = constrainOperandRegClass(II, Addr.getReg(), 0);
+ Register Reg = constrainOperandRegClass(II, Addr.getReg(), 0);
MIB.addReg(Reg);
} else
return false;
} else {
unsigned CallReg = 0;
if (Symbol) {
- unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+ Register ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
ADRPReg)
.addSym(Symbol, AArch64II::MO_GOT | AArch64II::MO_PAGE);
@@ -3438,7 +3438,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// SP = FP + Fixed Object + 16
int FI = MFI.CreateFixedObject(4, 0, false);
- unsigned ResultReg = createResultReg(&AArch64::GPR64spRegClass);
+ Register ResultReg = createResultReg(&AArch64::GPR64spRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::ADDXri), ResultReg)
.addFrameIndex(FI)
@@ -3568,10 +3568,10 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
Opc = AArch64::FABSDr;
break;
}
- unsigned SrcReg = getRegForValue(II->getOperand(0));
+ Register SrcReg = getRegForValue(II->getOperand(0));
if (!SrcReg)
return false;
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(SrcReg);
updateValueMap(II, ResultReg);
@@ -3593,7 +3593,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (!isTypeLegal(RetTy, VT))
return false;
- unsigned Op0Reg = getRegForValue(II->getOperand(0));
+ Register Op0Reg = getRegForValue(II->getOperand(0));
if (!Op0Reg)
return false;
@@ -3671,17 +3671,17 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
break;
case Intrinsic::smul_with_overflow: {
CC = AArch64CC::NE;
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
if (!LHSReg)
return false;
- unsigned RHSReg = getRegForValue(RHS);
+ Register RHSReg = getRegForValue(RHS);
if (!RHSReg)
return false;
if (VT == MVT::i32) {
MulReg = emitSMULL_rr(MVT::i64, LHSReg, RHSReg);
- unsigned MulSubReg =
+ Register MulSubReg =
fastEmitInst_extractsubreg(VT, MulReg, AArch64::sub_32);
// cmp xreg, wreg, sxtw
emitAddSub_rx(/*UseAdd=*/false, MVT::i64, MulReg, MulSubReg,
@@ -3701,11 +3701,11 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
}
case Intrinsic::umul_with_overflow: {
CC = AArch64CC::NE;
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
if (!LHSReg)
return false;
- unsigned RHSReg = getRegForValue(RHS);
+ Register RHSReg = getRegForValue(RHS);
if (!RHSReg)
return false;
@@ -3799,7 +3799,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
if (!VA.isRegLoc())
return false;
- unsigned Reg = getRegForValue(RV);
+ Register Reg = getRegForValue(RV);
if (Reg == 0)
return false;
@@ -3879,7 +3879,7 @@ bool AArch64FastISel::selectTrunc(const Instruction *I) {
DestVT != MVT::i1)
return false;
- unsigned SrcReg = getRegForValue(Op);
+ Register SrcReg = getRegForValue(Op);
if (!SrcReg)
return false;
@@ -3906,7 +3906,7 @@ bool AArch64FastISel::selectTrunc(const Instruction *I) {
break;
}
// Issue an extract_subreg to get the lower 32-bits.
- unsigned Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg,
+ Register Reg32 = fastEmitInst_extractsubreg(MVT::i32, SrcReg,
AArch64::sub_32);
// Create the AND instruction which performs the actual truncation.
ResultReg = emitAnd_ri(MVT::i32, Reg32, Mask);
@@ -4007,7 +4007,7 @@ unsigned AArch64FastISel::emitLSL_rr(MVT RetVT, unsigned Op0Reg,
if (NeedTrunc)
Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Mask);
- unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg);
+ Register ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg);
if (NeedTrunc)
ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask);
return ResultReg;
@@ -4033,7 +4033,7 @@ unsigned AArch64FastISel::emitLSL_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
// Just emit a copy for "zero" shifts.
if (Shift == 0) {
if (RetVT == SrcVT) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(Op0);
@@ -4110,7 +4110,7 @@ unsigned AArch64FastISel::emitLSR_rr(MVT RetVT, unsigned Op0Reg,
Op0Reg = emitAnd_ri(MVT::i32, Op0Reg, Mask);
Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Mask);
}
- unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg);
+ Register ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg);
if (NeedTrunc)
ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask);
return ResultReg;
@@ -4136,7 +4136,7 @@ unsigned AArch64FastISel::emitLSR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
// Just emit a copy for "zero" shifts.
if (Shift == 0) {
if (RetVT == SrcVT) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(Op0);
@@ -4226,7 +4226,7 @@ unsigned AArch64FastISel::emitASR_rr(MVT RetVT, unsigned Op0Reg,
Op0Reg = emitIntExt(RetVT, Op0Reg, MVT::i32, /*isZExt=*/false);
Op1Reg = emitAnd_ri(MVT::i32, Op1Reg, Mask);
}
- unsigned ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg);
+ Register ResultReg = fastEmitInst_rr(Opc, RC, Op0Reg, Op1Reg);
if (NeedTrunc)
ResultReg = emitAnd_ri(MVT::i32, ResultReg, Mask);
return ResultReg;
@@ -4252,7 +4252,7 @@ unsigned AArch64FastISel::emitASR_ri(MVT RetVT, MVT SrcVT, unsigned Op0,
// Just emit a copy for "zero" shifts.
if (Shift == 0) {
if (RetVT == SrcVT) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(Op0);
@@ -4428,7 +4428,7 @@ bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT,
return false;
// Check if the load instruction has already been selected.
- unsigned Reg = lookUpRegForValue(LI);
+ Register Reg = lookUpRegForValue(LI);
if (!Reg)
return false;
@@ -4456,7 +4456,7 @@ bool AArch64FastISel::optimizeIntExtLoad(const Instruction *I, MVT RetVT,
}
if (IsZExt) {
- unsigned Reg64 = createResultReg(&AArch64::GPR64RegClass);
+ Register Reg64 = createResultReg(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), Reg64)
.addImm(0)
@@ -4490,7 +4490,7 @@ bool AArch64FastISel::selectIntExt(const Instruction *I) {
if (optimizeIntExtLoad(I, RetVT, SrcVT))
return true;
- unsigned SrcReg = getRegForValue(I->getOperand(0));
+ Register SrcReg = getRegForValue(I->getOperand(0));
if (!SrcReg)
return false;
@@ -4499,7 +4499,7 @@ bool AArch64FastISel::selectIntExt(const Instruction *I) {
if (const auto *Arg = dyn_cast<Argument>(I->getOperand(0))) {
if ((IsZExt && Arg->hasZExtAttr()) || (!IsZExt && Arg->hasSExtAttr())) {
if (RetVT == MVT::i64 && SrcVT != MVT::i64) {
- unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+ Register ResultReg = createResultReg(&AArch64::GPR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AArch64::SUBREG_TO_REG), ResultReg)
.addImm(0)
@@ -4543,21 +4543,21 @@ bool AArch64FastISel::selectRem(const Instruction *I, unsigned ISDOpcode) {
break;
}
unsigned MSubOpc = Is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
- unsigned Src0Reg = getRegForValue(I->getOperand(0));
+ Register Src0Reg = getRegForValue(I->getOperand(0));
if (!Src0Reg)
return false;
- unsigned Src1Reg = getRegForValue(I->getOperand(1));
+ Register Src1Reg = getRegForValue(I->getOperand(1));
if (!Src1Reg)
return false;
const TargetRegisterClass *RC =
(DestVT == MVT::i64) ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
- unsigned QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, Src1Reg);
+ Register QuotReg = fastEmitInst_rr(DivOpc, RC, Src0Reg, Src1Reg);
assert(QuotReg && "Unexpected DIV instruction emission failure.");
// The remainder is computed as numerator - (quotient * denominator) using the
// MSUB instruction.
- unsigned ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, Src1Reg, Src0Reg);
+ Register ResultReg = fastEmitInst_rrr(MSubOpc, RC, QuotReg, Src1Reg, Src0Reg);
updateValueMap(I, ResultReg);
return true;
}
@@ -4602,7 +4602,7 @@ bool AArch64FastISel::selectMul(const Instruction *I) {
}
}
- unsigned Src0Reg = getRegForValue(Src0);
+ Register Src0Reg = getRegForValue(Src0);
if (!Src0Reg)
return false;
@@ -4615,11 +4615,11 @@ bool AArch64FastISel::selectMul(const Instruction *I) {
}
}
- unsigned Src0Reg = getRegForValue(I->getOperand(0));
+ Register Src0Reg = getRegForValue(I->getOperand(0));
if (!Src0Reg)
return false;
- unsigned Src1Reg = getRegForValue(I->getOperand(1));
+ Register Src1Reg = getRegForValue(I->getOperand(1));
if (!Src1Reg)
return false;
@@ -4666,7 +4666,7 @@ bool AArch64FastISel::selectShift(const Instruction *I) {
}
}
- unsigned Op0Reg = getRegForValue(Op0);
+ Register Op0Reg = getRegForValue(Op0);
if (!Op0Reg)
return false;
@@ -4689,11 +4689,11 @@ bool AArch64FastISel::selectShift(const Instruction *I) {
return true;
}
- unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ Register Op0Reg = getRegForValue(I->getOperand(0));
if (!Op0Reg)
return false;
- unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ Register Op1Reg = getRegForValue(I->getOperand(1));
if (!Op1Reg)
return false;
@@ -4746,11 +4746,11 @@ bool AArch64FastISel::selectBitCast(const Instruction *I) {
case MVT::f32: RC = &AArch64::FPR32RegClass; break;
case MVT::f64: RC = &AArch64::FPR64RegClass; break;
}
- unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ Register Op0Reg = getRegForValue(I->getOperand(0));
if (!Op0Reg)
return false;
- unsigned ResultReg = fastEmitInst_r(Opc, RC, Op0Reg);
+ Register ResultReg = fastEmitInst_r(Opc, RC, Op0Reg);
if (!ResultReg)
return false;
@@ -4810,7 +4810,7 @@ bool AArch64FastISel::selectSDiv(const Instruction *I) {
return selectBinaryOp(I, ISD::SDIV);
unsigned Lg2 = C.countTrailingZeros();
- unsigned Src0Reg = getRegForValue(I->getOperand(0));
+ Register Src0Reg = getRegForValue(I->getOperand(0));
if (!Src0Reg)
return false;
@@ -4840,7 +4840,7 @@ bool AArch64FastISel::selectSDiv(const Instruction *I) {
SelectOpc = AArch64::CSELWr;
RC = &AArch64::GPR32RegClass;
}
- unsigned SelectReg = fastEmitInst_rri(SelectOpc, RC, AddReg, Src0Reg,
+ Register SelectReg = fastEmitInst_rri(SelectOpc, RC, AddReg, Src0Reg,
AArch64CC::LT);
if (!SelectReg)
return false;
@@ -4866,7 +4866,7 @@ bool AArch64FastISel::selectSDiv(const Instruction *I) {
/// have to duplicate it for AArch64, because otherwise we would fail during the
/// sign-extend emission.
unsigned AArch64FastISel::getRegForGEPIndex(const Value *Idx) {
- unsigned IdxN = getRegForValue(Idx);
+ Register IdxN = getRegForValue(Idx);
if (IdxN == 0)
// Unhandled operand. Halt "fast" selection and bail.
return 0;
@@ -4889,7 +4889,7 @@ bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
if (Subtarget->isTargetILP32())
return false;
- unsigned N = getRegForValue(I->getOperand(0));
+ Register N = getRegForValue(I->getOperand(0));
if (!N)
return false;
@@ -4983,16 +4983,16 @@ bool AArch64FastISel::selectAtomicCmpXchg(const AtomicCmpXchgInst *I) {
const MCInstrDesc &II = TII.get(Opc);
- const unsigned AddrReg = constrainOperandRegClass(
+ const Register AddrReg = constrainOperandRegClass(
II, getRegForValue(I->getPointerOperand()), II.getNumDefs());
- const unsigned DesiredReg = constrainOperandRegClass(
+ const Register DesiredReg = constrainOperandRegClass(
II, getRegForValue(I->getCompareOperand()), II.getNumDefs() + 1);
- const unsigned NewReg = constrainOperandRegClass(
+ const Register NewReg = constrainOperandRegClass(
II, getRegForValue(I->getNewValOperand()), II.getNumDefs() + 2);
- const unsigned ResultReg1 = createResultReg(ResRC);
- const unsigned ResultReg2 = createResultReg(&AArch64::GPR32RegClass);
- const unsigned ScratchReg = createResultReg(&AArch64::GPR32RegClass);
+ const Register ResultReg1 = createResultReg(ResRC);
+ const Register ResultReg2 = createResultReg(&AArch64::GPR32RegClass);
+ const Register ScratchReg = createResultReg(&AArch64::GPR32RegClass);
// FIXME: MachineMemOperand doesn't support cmpxchg yet.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
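
The AArch64FastISel hunks above mostly swap unsigned virtual-register variables for Register. A minimal sketch of why that substitution is source-compatible, using a hypothetical stand-in type rather than the real llvm::Register class from llvm/CodeGen/Register.h: an implicit conversion back to unsigned keeps both the "Reg == 0" comparisons and the "!Reg" boolean tests in these hunks compiling unchanged.

// --- illustrative sketch, not part of the patch and not the real llvm::Register ---
#include <cassert>

class RegisterLike {
  unsigned Reg = 0;
public:
  constexpr RegisterLike(unsigned R = 0) : Reg(R) {}
  // The implicit conversion back to unsigned is what keeps existing
  // comparisons such as "Reg == 0" and calls taking unsigned compiling.
  constexpr operator unsigned() const { return Reg; }
};

static RegisterLike getRegForValueStub(bool Succeed) {
  return Succeed ? RegisterLike(42) : RegisterLike(0); // 0 means "no register"
}

int main() {
  RegisterLike Reg = getRegForValueStub(true);
  if (!Reg)     // boolean test, via the conversion to unsigned
    return 1;
  if (Reg == 0) // explicit comparison with 0 also still works
    return 1;
  assert(unsigned(Reg) == 42);
  return 0;
}
// --- end sketch ---
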
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 638e45b30d99..a4d20735e2b1 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -547,7 +547,7 @@ void AArch64FrameLowering::emitCalleeSavedFrameMoves(
return;
for (const auto &Info : CSI) {
- unsigned Reg = Info.getReg();
+ Register Reg = Info.getReg();
// Not all unwinders may know about SVE registers, so assume the lowest
// common denominator.
@@ -1653,8 +1653,7 @@ static void InsertReturnAddressAuth(MachineFunction &MF,
// The AUTIASP instruction assembles to a hint instruction before v8.3a so
// this instruction can safely be used for any v8a architecture.
// From v8.3a onwards there are optimised authenticate LR and return
- // instructions, namely RETA{A,B}, that can be used instead. In this case the
- // DW_CFA_AARCH64_negate_ra_state can't be emitted.
+ // instructions, namely RETA{A,B}, that can be used instead.
if (Subtarget.hasPAuth() && MBBI != MBB.end() &&
MBBI->getOpcode() == AArch64::RET_ReallyLR) {
BuildMI(MBB, MBBI, DL,
@@ -1666,12 +1665,6 @@ static void InsertReturnAddressAuth(MachineFunction &MF,
MBB, MBBI, DL,
TII->get(MFI.shouldSignWithBKey() ? AArch64::AUTIBSP : AArch64::AUTIASP))
.setMIFlag(MachineInstr::FrameDestroy);
-
- unsigned CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameDestroy);
}
}
@@ -2292,7 +2285,7 @@ static void computeCalleeSaveRegisterPairs(
// MachO's compact unwind format relies on all registers being stored in
// pairs.
assert((!produceCompactUnwindFrame(MF) ||
- CC == CallingConv::PreserveMost ||
+ CC == CallingConv::PreserveMost || CC == CallingConv::CXX_FAST_TLS ||
(Count & 1) == 0) &&
"Odd number of callee-saved regs to spill!");
int ByteOffset = AFI->getCalleeSavedStackSize();
@@ -2331,7 +2324,7 @@ static void computeCalleeSaveRegisterPairs(
// Add the next reg to the pair if it is in the same register class.
if (unsigned(i + RegInc) < Count) {
- unsigned NextReg = CSI[i + RegInc].getReg();
+ Register NextReg = CSI[i + RegInc].getReg();
bool IsFirst = i == FirstReg;
switch (RPI.Type) {
case RegPairInfo::GPR:
@@ -2387,7 +2380,7 @@ static void computeCalleeSaveRegisterPairs(
// MachO's compact unwind format relies on all registers being stored in
// adjacent register pairs.
assert((!produceCompactUnwindFrame(MF) ||
- CC == CallingConv::PreserveMost ||
+ CC == CallingConv::PreserveMost || CC == CallingConv::CXX_FAST_TLS ||
(RPI.isPaired() &&
((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
RPI.Reg1 + 1 == RPI.Reg2))) &&
@@ -3135,7 +3128,7 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
DebugLoc DL;
RS->enterBasicBlockEnd(MBB);
RS->backward(std::prev(MBBI));
- unsigned DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
+ Register DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
assert(DstReg && "There must be a free register after frame setup");
BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index e6d997f91b47..31f57cbc49f2 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -26,9 +26,8 @@ public:
: TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16),
true /*StackRealignable*/) {}
- void
- emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) const override;
+ void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index fe9b2f8883b9..899f069abdd4 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5147,5 +5147,5 @@ bool AArch64DAGToDAGISel::SelectAllActivePredicate(SDValue N) {
const AArch64TargetLowering *TLI =
static_cast<const AArch64TargetLowering *>(getTargetLowering());
- return TLI->isAllActivePredicate(N);
+ return TLI->isAllActivePredicate(*CurDAG, N);
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e141179fb5c8..a26bbc77f248 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -962,6 +962,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setMinFunctionAlignment(Align(4));
// Set preferred alignments.
setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
+ setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));
// Only change the limit for entries in a jump table if specified by
@@ -1205,6 +1206,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
+ setOperationAction(ISD::ABDS, VT, Custom);
+ setOperationAction(ISD::ABDU, VT, Custom);
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
@@ -1245,6 +1248,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
// There are no legal MVT::nxv16f## based types.
if (VT != MVT::nxv16i1) {
@@ -1831,6 +1835,28 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode(
Known = KnownBits::commonBits(Known, Known2);
break;
}
+ case AArch64ISD::BICi: {
+ // Compute the bit cleared value.
+ uint64_t Mask =
+ ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
+ Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+ Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
+ break;
+ }
+ case AArch64ISD::VLSHR: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+ Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+ Known = KnownBits::lshr(Known, Known2);
+ break;
+ }
+ case AArch64ISD::VASHR: {
+ KnownBits Known2;
+ Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+ Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
+ Known = KnownBits::ashr(Known, Known2);
+ break;
+ }
case AArch64ISD::LOADgot:
case AArch64ISD::ADDlow: {
if (!Subtarget->isTargetILP32())
@@ -1971,6 +1997,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::CSINC)
MAKE_CASE(AArch64ISD::THREAD_POINTER)
MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
+ MAKE_CASE(AArch64ISD::ABDS_PRED)
+ MAKE_CASE(AArch64ISD::ABDU_PRED)
MAKE_CASE(AArch64ISD::ADD_PRED)
MAKE_CASE(AArch64ISD::MUL_PRED)
MAKE_CASE(AArch64ISD::MULHS_PRED)
@@ -2173,6 +2201,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::INSR)
MAKE_CASE(AArch64ISD::PTEST)
MAKE_CASE(AArch64ISD::PTRUE)
+ MAKE_CASE(AArch64ISD::PFALSE)
MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
@@ -5173,6 +5202,10 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerFixedLengthVectorSelectToSVE(Op, DAG);
case ISD::ABS:
return LowerABS(Op, DAG);
+ case ISD::ABDS:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
+ case ISD::ABDU:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
case ISD::BITREVERSE:
return LowerBitreverse(Op, DAG);
case ISD::BSWAP:
@@ -5380,7 +5413,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
// Transform the arguments in physical registers into virtual ones.
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
// If this is an 8, 16 or 32-bit value, it is really passed promoted
@@ -5542,7 +5575,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
// Conservatively forward X8, since it might be used for aggregate return.
if (!CCInfo.isAllocated(AArch64::X8)) {
- unsigned X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
+ Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
}
}
@@ -5626,7 +5659,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
- unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
+ Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
SDValue Store =
DAG.getStore(Val.getValue(1), DL, Val, FIN,
@@ -5656,7 +5689,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
- unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
+ Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
@@ -7256,6 +7289,9 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
return getSVESafeBitCast(VT, IntResult, DAG);
}
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
if (SrcVT.bitsLT(VT))
In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
else if (SrcVT.bitsGT(VT))
@@ -7795,10 +7831,37 @@ SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
SelectionDAG &DAG) const {
EVT Ty = Op.getValueType();
auto Idx = Op.getConstantOperandAPInt(2);
+ int64_t IdxVal = Idx.getSExtValue();
+ assert(Ty.isScalableVector() &&
+ "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
+
+ // We can use the splice instruction for certain index values where we are
+ // able to efficiently generate the correct predicate. The index will be
+ // inverted and used directly as the input to the ptrue instruction, i.e.
+ // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
+ // splice predicate. However, we can only do this if we can guarantee that
+ // there are enough elements in the vector, hence we check the index <= min
+ // number of elements.
+ Optional<unsigned> PredPattern;
+ if (Ty.isScalableVector() && IdxVal < 0 &&
+ (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
+ None) {
+ SDLoc DL(Op);
+
+ // Create a predicate where all but the last -IdxVal elements are false.
+ EVT PredVT = Ty.changeVectorElementType(MVT::i1);
+ SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
+ Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
+
+ // Now splice the two inputs together using the predicate.
+ return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
+ Op.getOperand(1));
+ }
// This will select to an EXT instruction, which has a maximum immediate
// value of 255, hence 2048-bits is the maximum value we can lower.
- if (Idx.sge(-1) && Idx.slt(2048 / Ty.getVectorElementType().getSizeInBits()))
+ if (IdxVal >= 0 &&
+ IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
return Op;
return SDValue();
@@ -8227,7 +8290,7 @@ SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
} else {
// Return LR, which contains the return address. Mark it an implicit
// live-in.
- unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
+ Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
@@ -9631,14 +9694,12 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
MVT CastVT;
if (getScaledOffsetDup(V, Lane, CastVT)) {
V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
- } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ V.getOperand(0).getValueType().is128BitVector()) {
// The lane is incremented by the index of the extract.
// Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
- auto VecVT = V.getOperand(0).getValueType();
- if (VecVT.isFixedLengthVector() && VecVT.getFixedSizeInBits() <= 128) {
- Lane += V.getConstantOperandVal(1);
- V = V.getOperand(0);
- }
+ Lane += V.getConstantOperandVal(1);
+ V = V.getOperand(0);
} else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
// The lane is decremented if we are splatting from the 2nd operand.
// Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
@@ -9925,7 +9986,7 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
// lowering code.
if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
if (ConstVal->isZero())
- return SDValue(DAG.getMachineNode(AArch64::PFALSE, dl, VT), 0);
+ return DAG.getNode(AArch64ISD::PFALSE, dl, VT);
if (ConstVal->isOne())
return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
}
@@ -10978,6 +11039,28 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
if (!isTypeLegal(VT))
return SDValue();
+ // Break down insert_subvector into simpler parts.
+ if (VT.getVectorElementType() == MVT::i1) {
+ unsigned NumElts = VT.getVectorMinNumElements();
+ EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
+
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
+ DAG.getVectorIdxConstant(0, DL));
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
+ DAG.getVectorIdxConstant(NumElts / 2, DL));
+ if (Idx < (NumElts / 2)) {
+ SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
+ DAG.getVectorIdxConstant(Idx, DL));
+ return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi);
+ } else {
+ SDValue NewHi =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
+ DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
+ return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi);
+ }
+ }
+
// Ensure the subvector is half the size of the main vector.
if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
return SDValue();
@@ -11012,10 +11095,10 @@ SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
if (Vec0.isUndef())
return Op;
- unsigned int PredPattern =
+ Optional<unsigned> PredPattern =
getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
auto PredTy = VT.changeVectorElementType(MVT::i1);
- SDValue PTrue = getPTrue(DAG, DL, PredTy, PredPattern);
+ SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
}
@@ -11730,10 +11813,10 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::aarch64_ldxr: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.memVT = MVT::getVT(PtrTy->getPointerElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+ Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
@@ -11741,10 +11824,10 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::aarch64_stxr: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.memVT = MVT::getVT(PtrTy->getPointerElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+ Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
@@ -11772,7 +11855,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(I.getType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+ Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
return true;
}
@@ -11782,7 +11865,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(I.getOperand(0)->getType());
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
- Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+ Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
return true;
}
@@ -12320,7 +12403,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
Value *PTrue = nullptr;
if (UseScalable) {
- unsigned PgPattern =
+ Optional<unsigned> PgPattern =
getSVEPredPatternFromNumElements(FVTy->getNumElements());
if (Subtarget->getMinSVEVectorSizeInBits() ==
Subtarget->getMaxSVEVectorSizeInBits() &&
@@ -12328,7 +12411,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
PgPattern = AArch64SVEPredPattern::all;
auto *PTruePat =
- ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), PgPattern);
+ ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
{PTruePat});
}
@@ -12500,7 +12583,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
Value *PTrue = nullptr;
if (UseScalable) {
- unsigned PgPattern =
+ Optional<unsigned> PgPattern =
getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
if (Subtarget->getMinSVEVectorSizeInBits() ==
Subtarget->getMaxSVEVectorSizeInBits() &&
@@ -12509,7 +12592,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
PgPattern = AArch64SVEPredPattern::all;
auto *PTruePat =
- ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), PgPattern);
+ ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
{PTruePat});
}
@@ -12901,7 +12984,7 @@ bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
return false;
- return (Index == 0 || Index == ResVT.getVectorNumElements());
+ return (Index == 0 || Index == ResVT.getVectorMinNumElements());
}
/// Turn vector tests of the signbit in the form of:
@@ -14261,6 +14344,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
static SDValue
performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
+ SDLoc DL(N);
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
uint64_t IdxVal = N->getConstantOperandVal(2);
@@ -14286,7 +14370,6 @@ performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// Fold insert_subvector -> concat_vectors
// insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
// insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
- SDLoc DL(N);
SDValue Lo, Hi;
if (IdxVal == 0) {
Lo = SubVec;
@@ -15004,7 +15087,15 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
Zero);
}
-static bool isAllActivePredicate(SDValue N) {
+static bool isAllInactivePredicate(SDValue N) {
+ // Look through cast.
+ while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
+ N = N.getOperand(0);
+
+ return N.getOpcode() == AArch64ISD::PFALSE;
+}
+
+static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
unsigned NumElts = N.getValueType().getVectorMinNumElements();
// Look through cast.
@@ -15023,6 +15114,21 @@ static bool isAllActivePredicate(SDValue N) {
N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
return N.getValueType().getVectorMinNumElements() >= NumElts;
+ // If we're compiling for a specific vector-length, we can check if the
+ // pattern's VL equals that of the scalable vector at runtime.
+ if (N.getOpcode() == AArch64ISD::PTRUE) {
+ const auto &Subtarget =
+ static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
+ unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
+ unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
+ if (MaxSVESize && MinSVESize == MaxSVESize) {
+ unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
+ unsigned PatNumElts =
+ getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
+ return PatNumElts == (NumElts * VScale);
+ }
+ }
+
return false;
}
@@ -15039,7 +15145,7 @@ static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
// ISD way to specify an all active predicate.
- if (isAllActivePredicate(Pg)) {
+ if (isAllActivePredicate(DAG, Pg)) {
if (UnpredOp)
return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
@@ -15870,7 +15976,7 @@ static SDValue performPostLD1Combine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
- if (VT.isScalableVector())
+ if (!VT.is128BitVector() && !VT.is64BitVector())
return SDValue();
unsigned LoadIdx = IsLaneOp ? 1 : 0;
@@ -16710,6 +16816,12 @@ static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
EVT CCVT = N0.getValueType();
+ if (isAllActivePredicate(DAG, N0))
+ return N->getOperand(1);
+
+ if (isAllInactivePredicate(N0))
+ return N->getOperand(2);
+
// Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
// into (OR (ASR lhs, N-1), 1), which requires less instructions for the
// supported types.
@@ -18753,7 +18865,7 @@ static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
"Expected legal fixed length vector!");
- unsigned PgPattern =
+ Optional<unsigned> PgPattern =
getSVEPredPatternFromNumElements(VT.getVectorNumElements());
assert(PgPattern && "Unexpected element count for SVE predicate");
@@ -18789,7 +18901,7 @@ static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
break;
}
- return getPTrue(DAG, DL, MaskVT, PgPattern);
+ return getPTrue(DAG, DL, MaskVT, *PgPattern);
}
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
@@ -19281,7 +19393,12 @@ SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
default:
return SDValue();
case ISD::VECREDUCE_OR:
- return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
+ if (isAllActivePredicate(DAG, Pg))
+ // The predicate can be 'Op' because
+ // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
+ return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
+ else
+ return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
case ISD::VECREDUCE_AND: {
Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
@@ -19725,8 +19842,9 @@ SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
return Op;
}
-bool AArch64TargetLowering::isAllActivePredicate(SDValue N) const {
- return ::isAllActivePredicate(N);
+bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
+ SDValue N) const {
+ return ::isAllActivePredicate(DAG, N);
}
EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
@@ -19777,7 +19895,7 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
}
-bool AArch64TargetLowering::isConstantUnsignedBitfieldExtactLegal(
+bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal(
unsigned Opc, LLT Ty1, LLT Ty2) const {
return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
}
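
One of the computeKnownBitsForTargetNode additions in the AArch64ISelLowering.cpp hunks above handles AArch64ISD::BICi, the vector bit-clear with an immediate: the result is Src & ~(Imm << Shift), so every bit inside (Imm << Shift) becomes known-zero and a known-one bit survives only outside that mask. A plain-integer sketch of that rule, independent of the llvm::KnownBits API the patch actually uses:

// --- illustrative sketch, not part of the patch ---
#include <cassert>
#include <cstdint>

// Simplified stand-in for per-element known bits; the real code tracks
// llvm::KnownBits over the vector element type.
struct KnownZeroOne {
  uint64_t Zero = 0; // bits known to be 0
  uint64_t One = 0;  // bits known to be 1
};

static KnownZeroOne knownBitsAfterBic(KnownZeroOne In, uint64_t Imm,
                                      uint64_t Shift) {
  // BICi computes Src & ~(Imm << Shift): bits covered by (Imm << Shift)
  // become known-zero, and known-one bits survive only outside that mask.
  uint64_t Cleared = Imm << Shift;
  KnownZeroOne Out;
  Out.Zero = In.Zero | Cleared;
  Out.One = In.One & ~Cleared;
  return Out;
}

int main() {
  KnownZeroOne In; // nothing known about the source value
  KnownZeroOne K = knownBitsAfterBic(In, /*Imm=*/0xFF, /*Shift=*/8);
  assert(K.Zero == 0xFF00u); // bits 8..15 are now known to be zero
  assert(K.One == 0);
  return 0;
}
// --- end sketch ---
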
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 367ba3039a0c..ca6c70297c0b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -77,14 +77,16 @@ enum NodeType : unsigned {
SBC, // adc, sbc instructions
// Predicated instructions where inactive lanes produce undefined results.
+ ABDS_PRED,
+ ABDU_PRED,
ADD_PRED,
FADD_PRED,
FDIV_PRED,
FMA_PRED,
- FMAXNM_PRED,
- FMINNM_PRED,
FMAX_PRED,
+ FMAXNM_PRED,
FMIN_PRED,
+ FMINNM_PRED,
FMUL_PRED,
FSUB_PRED,
MUL_PRED,
@@ -321,6 +323,7 @@ enum NodeType : unsigned {
INSR,
PTEST,
PTRUE,
+ PFALSE,
BITREVERSE_MERGE_PASSTHRU,
BSWAP_MERGE_PASSTHRU,
@@ -487,7 +490,6 @@ const unsigned RoundingBitsPos = 22;
} // namespace AArch64
class AArch64Subtarget;
-class AArch64TargetMachine;
class AArch64TargetLowering : public TargetLowering {
public:
@@ -842,7 +844,7 @@ public:
return 128;
}
- bool isAllActivePredicate(SDValue N) const;
+ bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const;
EVT getPromotedVTForPredicate(EVT VT) const;
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty,
@@ -1137,8 +1139,8 @@ private:
// with BITCAST used otherwise.
SDValue getSVESafeBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) const;
- bool isConstantUnsignedBitfieldExtactLegal(unsigned Opc, LLT Ty1,
- LLT Ty2) const override;
+ bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1,
+ LLT Ty2) const override;
};
namespace AArch64 {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index 84573dac7e41..b220929514f9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -102,6 +102,34 @@ def : Pat<(relaxed_load<atomic_load_64>
(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
(LDURXi GPR64sp:$Rn, simm9:$offset)>;
+// FP 32-bit loads
+def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend32:$extend))))),
+ (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>;
+def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend32:$extend))))),
+ (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
+def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_32> (am_indexed32 GPR64sp:$Rn,
+ uimm12s8:$offset))))),
+ (LDRSui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(f32 (bitconvert (i32 (relaxed_load<atomic_load_32>
+ (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
+ (LDURSi GPR64sp:$Rn, simm9:$offset)>;
+
+// FP 64-bit loads
+def : Pat<(f64 (bitconvert (i64 (relaxed_load<atomic_load_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend))))),
+ (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+def : Pat<(f64 (bitconvert (i64 (relaxed_load<atomic_load_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend))))),
+ (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+def : Pat<(f64 (bitconvert (i64 (relaxed_load<atomic_load_64> (am_indexed64 GPR64sp:$Rn,
+ uimm12s8:$offset))))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(f64 (bitconvert (i64 (relaxed_load<atomic_load_64>
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset))))),
+ (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+
//===----------------------------------
// Atomic stores
//===----------------------------------
@@ -196,6 +224,38 @@ def : Pat<(relaxed_store<atomic_store_64>
(am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val),
(STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>;
+// FP 32-bit stores
+def : Pat<(relaxed_store<atomic_store_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend32:$extend),
+ (i32 (bitconvert (f32 FPR32Op:$val)))),
+ (STRSroW FPR32Op:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>;
+def : Pat<(relaxed_store<atomic_store_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend32:$extend),
+ (i32 (bitconvert (f32 FPR32Op:$val)))),
+ (STRSroX FPR32Op:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
+def : Pat<(relaxed_store<atomic_store_32>
+ (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), (i32 (bitconvert (f32 FPR32Op:$val)))),
+ (STRSui FPR32Op:$val, GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(relaxed_store<atomic_store_32>
+ (am_unscaled32 GPR64sp:$Rn, simm9:$offset), (i32 (bitconvert (f32 FPR32Op:$val)))),
+ (STURSi FPR32Op:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// FP 64-bit stores
+def : Pat<(relaxed_store<atomic_store_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+ ro_Wextend64:$extend),
+ (i64 (bitconvert (f64 FPR64Op:$val)))),
+ (STRDroW FPR64Op:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+ ro_Xextend64:$extend),
+ (i64 (bitconvert (f64 FPR64Op:$val)))),
+ (STRDroX FPR64Op:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+def : Pat<(relaxed_store<atomic_store_64>
+ (am_indexed64 GPR64sp:$Rn, uimm12s4:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))),
+ (STRDui FPR64Op:$val, GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(relaxed_store<atomic_store_64>
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset), (i64 (bitconvert (f64 FPR64Op:$val)))),
+ (STURDi FPR64Op:$val, GPR64sp:$Rn, simm9:$offset)>;
+
//===----------------------------------
// Low-level exclusive operations
//===----------------------------------
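
The new FP patterns added to AArch64InstrAtomics.td above match relaxed atomic loads and stores whose value is bit-converted between i32/i64 and f32/f64. As a hedged illustration, and assuming the usual lowering of relaxed std::atomic<float>/<double> accesses to integer-width atomic memory operations plus a bitconvert, source like the following produces exactly that shape; with the patterns in place it can select FP load/store forms directly instead of going through a GPR and moving across register files.

// --- illustrative example, not part of the patch ---
#include <atomic>

float load_relaxed(const std::atomic<float> &A) {
  return A.load(std::memory_order_relaxed);
}

void store_relaxed(std::atomic<double> &A, double V) {
  A.store(V, std::memory_order_relaxed);
}
// --- end example ---
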
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index f8d492188744..4c1e41b7efee 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1816,10 +1816,10 @@ def am_brcond : Operand<OtherVT> {
let OperandType = "OPERAND_PCREL";
}
-class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target),
- "b", ".$cond\t$target", "",
- [(AArch64brcond bb:$target, imm:$cond, NZCV)]>,
- Sched<[WriteBr]> {
+class BranchCond<bit bit4, string mnemonic>
+ : I<(outs), (ins ccode:$cond, am_brcond:$target),
+ mnemonic, ".$cond\t$target", "",
+ [(AArch64brcond bb:$target, imm:$cond, NZCV)]>, Sched<[WriteBr]> {
let isBranch = 1;
let isTerminator = 1;
let Uses = [NZCV];
@@ -1828,7 +1828,7 @@ class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target),
bits<19> target;
let Inst{31-24} = 0b01010100;
let Inst{23-5} = target;
- let Inst{4} = 0;
+ let Inst{4} = bit4;
let Inst{3-0} = cond;
}
@@ -7700,10 +7700,10 @@ multiclass SIMDTableLookupTied<bit op, string asm> {
//----------------------------------------------------------------------------
-// AdvSIMD scalar CPY
+// AdvSIMD scalar DUP
//----------------------------------------------------------------------------
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
+class BaseSIMDScalarDUP<RegisterClass regtype, RegisterOperand vectype,
string asm, string kind, Operand idxtype>
: I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), asm,
"{\t$dst, $src" # kind # "$idx" #
@@ -7717,30 +7717,30 @@ class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
let Inst{4-0} = dst;
}
-class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
+class SIMDScalarDUPAlias<string asm, string size, Instruction inst,
RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
: InstAlias<asm # "{\t$dst, $src" # size # "$index"
# "|\t$dst, $src$index}",
(inst regtype:$dst, vectype:$src, idxtype:$index), 0>;
-multiclass SIMDScalarCPY<string asm> {
- def i8 : BaseSIMDScalarCPY<FPR8, V128, asm, ".b", VectorIndexB> {
+multiclass SIMDScalarDUP<string asm> {
+ def i8 : BaseSIMDScalarDUP<FPR8, V128, asm, ".b", VectorIndexB> {
bits<4> idx;
let Inst{20-17} = idx;
let Inst{16} = 1;
}
- def i16 : BaseSIMDScalarCPY<FPR16, V128, asm, ".h", VectorIndexH> {
+ def i16 : BaseSIMDScalarDUP<FPR16, V128, asm, ".h", VectorIndexH> {
bits<3> idx;
let Inst{20-18} = idx;
let Inst{17-16} = 0b10;
}
- def i32 : BaseSIMDScalarCPY<FPR32, V128, asm, ".s", VectorIndexS> {
+ def i32 : BaseSIMDScalarDUP<FPR32, V128, asm, ".s", VectorIndexS> {
bits<2> idx;
let Inst{20-19} = idx;
let Inst{18-16} = 0b100;
}
- def i64 : BaseSIMDScalarCPY<FPR64, V128, asm, ".d", VectorIndexD> {
+ def i64 : BaseSIMDScalarDUP<FPR64, V128, asm, ".d", VectorIndexD> {
bits<1> idx;
let Inst{20} = idx;
let Inst{19-16} = 0b1000;
@@ -7751,16 +7751,16 @@ multiclass SIMDScalarCPY<string asm> {
(!cast<Instruction>(NAME # i64) V128:$src, VectorIndexD:$idx)>;
// 'DUP' mnemonic aliases.
- def : SIMDScalarCPYAlias<"dup", ".b",
+ def : SIMDScalarDUPAlias<"dup", ".b",
!cast<Instruction>(NAME#"i8"),
FPR8, V128, VectorIndexB>;
- def : SIMDScalarCPYAlias<"dup", ".h",
+ def : SIMDScalarDUPAlias<"dup", ".h",
!cast<Instruction>(NAME#"i16"),
FPR16, V128, VectorIndexH>;
- def : SIMDScalarCPYAlias<"dup", ".s",
+ def : SIMDScalarDUPAlias<"dup", ".s",
!cast<Instruction>(NAME#"i32"),
FPR32, V128, VectorIndexS>;
- def : SIMDScalarCPYAlias<"dup", ".d",
+ def : SIMDScalarDUPAlias<"dup", ".d",
!cast<Instruction>(NAME#"i64"),
FPR64, V128, VectorIndexD>;
}
@@ -10556,40 +10556,30 @@ class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
pattern> {
}
multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
- SDPatternOperator Accum> {
+ SDPatternOperator op> {
def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h",
[(set (v4i16 V64:$dst),
- (Accum (v4i16 V64:$Rd),
- (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn),
- (v4i16 V64:$Rm)))))]>;
+ (v4i16 (op (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h",
[(set (v8i16 V128:$dst),
- (Accum (v8i16 V128:$Rd),
- (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn),
- (v8i16 V128:$Rm)))))]>;
+ (v8i16 (op (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s",
[(set (v2i32 V64:$dst),
- (Accum (v2i32 V64:$Rd),
- (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn),
- (v2i32 V64:$Rm)))))]>;
+ (v2i32 (op (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s",
[(set (v4i32 V128:$dst),
- (Accum (v4i32 V128:$Rd),
- (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn),
- (v4i32 V128:$Rm)))))]>;
+ (v4i32 (op (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
}
multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
- SDPatternOperator Accum> {
+ SDPatternOperator op> {
def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
V64, V64, V128_lo, VectorIndexH,
asm, ".4h", ".4h", ".4h", ".h",
[(set (v4i16 V64:$dst),
- (Accum (v4i16 V64:$Rd),
- (v4i16 (int_aarch64_neon_sqrdmulh
- (v4i16 V64:$Rn),
- (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
- VectorIndexH:$idx))))))]> {
+ (v4i16 (op (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+ (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx)))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
@@ -10600,11 +10590,9 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
V128, V128, V128_lo, VectorIndexH,
asm, ".8h", ".8h", ".8h", ".h",
[(set (v8i16 V128:$dst),
- (Accum (v8i16 V128:$Rd),
- (v8i16 (int_aarch64_neon_sqrdmulh
- (v8i16 V128:$Rn),
- (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
- VectorIndexH:$idx))))))]> {
+ (v8i16 (op (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+ (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+ VectorIndexH:$idx)))))]> {
bits<3> idx;
let Inst{11} = idx{2};
let Inst{21} = idx{1};
@@ -10615,75 +10603,26 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
V64, V64, V128, VectorIndexS,
asm, ".2s", ".2s", ".2s", ".s",
[(set (v2i32 V64:$dst),
- (Accum (v2i32 V64:$Rd),
- (v2i32 (int_aarch64_neon_sqrdmulh
- (v2i32 V64:$Rn),
- (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
- VectorIndexS:$idx))))))]> {
+ (v2i32 (op (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+ (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
- // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
- // an intermediate EXTRACT_SUBREG would be untyped.
- // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we
- // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..)))
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
- (i32 (vector_extract
- (v4i32 (insert_subvector
- (undef),
- (v2i32 (int_aarch64_neon_sqrdmulh
- (v2i32 V64:$Rn),
- (v2i32 (AArch64duplane32
- (v4i32 V128:$Rm),
- VectorIndexS:$idx)))),
- (i64 0))),
- (i64 0))))),
- (EXTRACT_SUBREG
- (v2i32 (!cast<Instruction>(NAME # v2i32_indexed)
- (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
- FPR32Op:$Rd,
- ssub)),
- V64:$Rn,
- V128:$Rm,
- VectorIndexS:$idx)),
- ssub)>;
-
def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
V128, V128, V128, VectorIndexS,
asm, ".4s", ".4s", ".4s", ".s",
[(set (v4i32 V128:$dst),
- (Accum (v4i32 V128:$Rd),
- (v4i32 (int_aarch64_neon_sqrdmulh
- (v4i32 V128:$Rn),
- (v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
- VectorIndexS:$idx))))))]> {
+ (v4i32 (op (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+ (v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
}
- // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
- // an intermediate EXTRACT_SUBREG would be untyped.
- def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
- (i32 (vector_extract
- (v4i32 (int_aarch64_neon_sqrdmulh
- (v4i32 V128:$Rn),
- (v4i32 (AArch64duplane32
- (v4i32 V128:$Rm),
- VectorIndexS:$idx)))),
- (i64 0))))),
- (EXTRACT_SUBREG
- (v4i32 (!cast<Instruction>(NAME # v4i32_indexed)
- (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
- FPR32Op:$Rd,
- ssub)),
- V128:$Rn,
- V128:$Rm,
- VectorIndexS:$idx)),
- ssub)>;
-
def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
FPR16Op, FPR16Op, V128_lo,
VectorIndexH, asm, ".h", "", "", ".h",
@@ -10698,11 +10637,9 @@ multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
FPR32Op, FPR32Op, V128, VectorIndexS,
asm, ".s", "", "", ".s",
[(set (i32 FPR32Op:$dst),
- (Accum (i32 FPR32Op:$Rd),
- (i32 (int_aarch64_neon_sqrdmulh
- (i32 FPR32Op:$Rn),
- (i32 (vector_extract (v4i32 V128:$Rm),
- VectorIndexS:$idx))))))]> {
+ (i32 (op (i32 FPR32Op:$Rd), (i32 FPR32Op:$Rn),
+ (i32 (vector_extract (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))))]> {
bits<2> idx;
let Inst{11} = idx{1};
let Inst{21} = idx{0};
@@ -11430,6 +11367,123 @@ class Store64BV<bits<3> opc, string asm_inst, list<dag> pat = []>
let Inst{20-16} = Rs;
}
+class MOPSMemoryCopyMoveBase<bit isMove, bits<2> opcode, bits<2> op1,
+ bits<2> op2, string asm>
+ : I<(outs GPR64common:$Rd_wb, GPR64common:$Rs_wb, GPR64:$Rn_wb),
+ (ins GPR64common:$Rd, GPR64common:$Rs, GPR64:$Rn),
+ asm, "\t[$Rd]!, [$Rs]!, $Rn!",
+ "$Rd = $Rd_wb,$Rs = $Rs_wb,$Rn = $Rn_wb", []>,
+ Sched<[]> {
+ bits<5> Rd;
+ bits<5> Rs;
+ bits<5> Rn;
+ let Inst{31-27} = 0b00011;
+ let Inst{26} = isMove;
+ let Inst{25-24} = 0b01;
+ let Inst{23-22} = opcode;
+ let Inst{21} = 0b0;
+ let Inst{20-16} = Rs;
+ let Inst{15-14} = op2;
+ let Inst{13-12} = op1;
+ let Inst{11-10} = 0b01;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeCPYMemOpInstruction";
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+class MOPSMemoryCopy<bits<2> opcode, bits<2> op1, bits<2> op2, string asm>
+ : MOPSMemoryCopyMoveBase<0, opcode, op1, op2, asm>;
+
+class MOPSMemoryMove<bits<2> opcode, bits<2> op1, bits<2> op2, string asm>
+ : MOPSMemoryCopyMoveBase<1, opcode, op1, op2, asm>;
+
+class MOPSMemorySetBase<bit isTagging, bits<2> opcode, bit op1, bit op2,
+ string asm>
+ : I<(outs GPR64common:$Rd_wb, GPR64:$Rn_wb),
+ (ins GPR64common:$Rd, GPR64:$Rn, GPR64:$Rm),
+ asm, "\t[$Rd]!, $Rn!, $Rm",
+ "$Rd = $Rd_wb,$Rn = $Rn_wb", []>,
+ Sched<[]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-27} = 0b00011;
+ let Inst{26} = isTagging;
+ let Inst{25-21} = 0b01110;
+ let Inst{20-16} = Rm;
+ let Inst{15-14} = opcode;
+ let Inst{13} = op2;
+ let Inst{12} = op1;
+ let Inst{11-10} = 0b01;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+
+ let DecoderMethod = "DecodeSETMemOpInstruction";
+ let mayLoad = 0;
+ let mayStore = 1;
+}
+
+class MOPSMemorySet<bits<2> opcode, bit op1, bit op2, string asm>
+ : MOPSMemorySetBase<0, opcode, op1, op2, asm>;
+
+class MOPSMemorySetTagging<bits<2> opcode, bit op1, bit op2, string asm>
+ : MOPSMemorySetBase<1, opcode, op1, op2, asm>;
+
+multiclass MOPSMemoryCopyInsns<bits<2> opcode, string asm> {
+ def "" : MOPSMemoryCopy<opcode, 0b00, 0b00, asm>;
+ def WN : MOPSMemoryCopy<opcode, 0b00, 0b01, asm # "wn">;
+ def RN : MOPSMemoryCopy<opcode, 0b00, 0b10, asm # "rn">;
+ def N : MOPSMemoryCopy<opcode, 0b00, 0b11, asm # "n">;
+ def WT : MOPSMemoryCopy<opcode, 0b01, 0b00, asm # "wt">;
+ def WTWN : MOPSMemoryCopy<opcode, 0b01, 0b01, asm # "wtwn">;
+ def WTRN : MOPSMemoryCopy<opcode, 0b01, 0b10, asm # "wtrn">;
+ def WTN : MOPSMemoryCopy<opcode, 0b01, 0b11, asm # "wtn">;
+ def RT : MOPSMemoryCopy<opcode, 0b10, 0b00, asm # "rt">;
+ def RTWN : MOPSMemoryCopy<opcode, 0b10, 0b01, asm # "rtwn">;
+ def RTRN : MOPSMemoryCopy<opcode, 0b10, 0b10, asm # "rtrn">;
+ def RTN : MOPSMemoryCopy<opcode, 0b10, 0b11, asm # "rtn">;
+ def T : MOPSMemoryCopy<opcode, 0b11, 0b00, asm # "t">;
+ def TWN : MOPSMemoryCopy<opcode, 0b11, 0b01, asm # "twn">;
+ def TRN : MOPSMemoryCopy<opcode, 0b11, 0b10, asm # "trn">;
+ def TN : MOPSMemoryCopy<opcode, 0b11, 0b11, asm # "tn">;
+}
+
+multiclass MOPSMemoryMoveInsns<bits<2> opcode, string asm> {
+ def "" : MOPSMemoryMove<opcode, 0b00, 0b00, asm>;
+ def WN : MOPSMemoryMove<opcode, 0b00, 0b01, asm # "wn">;
+ def RN : MOPSMemoryMove<opcode, 0b00, 0b10, asm # "rn">;
+ def N : MOPSMemoryMove<opcode, 0b00, 0b11, asm # "n">;
+ def WT : MOPSMemoryMove<opcode, 0b01, 0b00, asm # "wt">;
+ def WTWN : MOPSMemoryMove<opcode, 0b01, 0b01, asm # "wtwn">;
+ def WTRN : MOPSMemoryMove<opcode, 0b01, 0b10, asm # "wtrn">;
+ def WTN : MOPSMemoryMove<opcode, 0b01, 0b11, asm # "wtn">;
+ def RT : MOPSMemoryMove<opcode, 0b10, 0b00, asm # "rt">;
+ def RTWN : MOPSMemoryMove<opcode, 0b10, 0b01, asm # "rtwn">;
+ def RTRN : MOPSMemoryMove<opcode, 0b10, 0b10, asm # "rtrn">;
+ def RTN : MOPSMemoryMove<opcode, 0b10, 0b11, asm # "rtn">;
+ def T : MOPSMemoryMove<opcode, 0b11, 0b00, asm # "t">;
+ def TWN : MOPSMemoryMove<opcode, 0b11, 0b01, asm # "twn">;
+ def TRN : MOPSMemoryMove<opcode, 0b11, 0b10, asm # "trn">;
+ def TN : MOPSMemoryMove<opcode, 0b11, 0b11, asm # "tn">;
+}
+
+multiclass MOPSMemorySetInsns<bits<2> opcode, string asm> {
+ def "" : MOPSMemorySet<opcode, 0, 0, asm>;
+ def T : MOPSMemorySet<opcode, 1, 0, asm # "t">;
+ def N : MOPSMemorySet<opcode, 0, 1, asm # "n">;
+ def TN : MOPSMemorySet<opcode, 1, 1, asm # "tn">;
+}
+
+multiclass MOPSMemorySetTaggingInsns<bits<2> opcode, string asm> {
+ def "" : MOPSMemorySetTagging<opcode, 0, 0, asm>;
+ def T : MOPSMemorySetTagging<opcode, 1, 0, asm # "t">;
+ def N : MOPSMemorySetTagging<opcode, 0, 1, asm # "n">;
+ def TN : MOPSMemorySetTagging<opcode, 1, 1, asm # "tn">;
+}
+
//----------------------------------------------------------------------------
// Allow the size specifier tokens to be upper case, not just lower.
def : TokenAlias<".4B", ".4b">; // Add dot product
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 5fc5e4e5eb35..93c17133c845 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2574,6 +2574,7 @@ AArch64InstrInfo::getAddrModeFromMemoryOp(const MachineInstr &MemI,
AM.BaseReg = Base->getReg();
AM.Displacement = Offset;
AM.ScaledReg = 0;
+ AM.Scale = 0;
return AM;
}
@@ -7350,8 +7351,7 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
.setMIFlags(MachineInstr::FrameSetup);
// If v8.3a features are available we can replace a RET instruction by
- // RETAA or RETAB and omit the AUT instructions. In this case the
- // DW_CFA_AARCH64_negate_ra_state can't be emitted.
+ // RETAA or RETAB and omit the AUT instructions.
if (Subtarget.hasPAuth() && MBBAUT != MBB.end() &&
MBBAUT->getOpcode() == AArch64::RET) {
BuildMI(MBB, MBBAUT, DL,
@@ -7364,11 +7364,6 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
: AArch64::AUTIBSP))
.setMIFlag(MachineInstr::FrameDestroy);
- unsigned CFIIndexAuth =
- MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
- BuildMI(MBB, MBBAUT, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndexAuth)
- .setMIFlags(MachineInstr::FrameDestroy);
}
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index b2f9e82a7e8b..1054bea40e68 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -26,7 +26,6 @@
namespace llvm {
class AArch64Subtarget;
-class AArch64TargetMachine;
static const MachineMemOperand::Flags MOSuppressPair =
MachineMemOperand::MOTargetFlag1;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ebccc07edc7a..c8a697c8b82f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -33,6 +33,8 @@ def HasV9_1a : Predicate<"Subtarget->hasV9_1aOps()">,
AssemblerPredicate<(all_of HasV9_1aOps), "armv9.1a">;
def HasV9_2a : Predicate<"Subtarget->hasV9_2aOps()">,
AssemblerPredicate<(all_of HasV9_2aOps), "armv9.2a">;
+def HasV9_3a : Predicate<"Subtarget->hasV9_3aOps()">,
+ AssemblerPredicate<(all_of HasV9_3aOps), "armv9.3a">;
def HasV8_0r : Predicate<"Subtarget->hasV8_0rOps()">,
AssemblerPredicate<(all_of HasV8_0rOps), "armv8-r">;
@@ -198,6 +200,10 @@ def HasBRBE : Predicate<"Subtarget->hasBRBE()">,
AssemblerPredicate<(all_of FeatureBRBE), "brbe">;
def HasSPE_EEF : Predicate<"Subtarget->hasSPE_EEF()">,
AssemblerPredicate<(all_of FeatureSPE_EEF), "spe-eef">;
+def HasHBC : Predicate<"Subtarget->hasHBC()">,
+ AssemblerPredicate<(all_of FeatureHBC), "hbc">;
+def HasMOPS : Predicate<"Subtarget->hasMOPS()">,
+ AssemblerPredicate<(all_of FeatureMOPS), "mops">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
@@ -2362,7 +2368,12 @@ def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
//===----------------------------------------------------------------------===//
// Conditional branch (immediate) instruction.
//===----------------------------------------------------------------------===//
-def Bcc : BranchCond;
+def Bcc : BranchCond<0, "b">;
+
+// Armv8.8-A variant form which hints to the branch predictor that
+// this branch is very likely to go the same way nearly all the time
+// (even though it is not known at compile time _which_ way that is).
+def BCcc : BranchCond<1, "bc">, Requires<[HasHBC]>;
//===----------------------------------------------------------------------===//
// Compare-and-branch instructions.
@@ -4500,9 +4511,9 @@ defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>;
defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
- int_aarch64_neon_sqadd>;
+ int_aarch64_neon_sqrdmlah>;
defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
- int_aarch64_neon_sqsub>;
+ int_aarch64_neon_sqrdmlsh>;
// Extra saturate patterns, other than the intrinsics matches above
defm : SIMDThreeSameVectorExtraPatterns<"SQADD", saddsat>;
@@ -4769,15 +4780,11 @@ defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
let Predicates = [HasRDM] in {
defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
- def : Pat<(i32 (int_aarch64_neon_sqadd
- (i32 FPR32:$Rd),
- (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
- (i32 FPR32:$Rm))))),
+ def : Pat<(i32 (int_aarch64_neon_sqrdmlah (i32 FPR32:$Rd), (i32 FPR32:$Rn),
+ (i32 FPR32:$Rm))),
(SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
- def : Pat<(i32 (int_aarch64_neon_sqsub
- (i32 FPR32:$Rd),
- (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
- (i32 FPR32:$Rm))))),
+ def : Pat<(i32 (int_aarch64_neon_sqrdmlsh (i32 FPR32:$Rd), (i32 FPR32:$Rn),
+ (i32 FPR32:$Rm))),
(SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
}
@@ -5342,19 +5349,6 @@ def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 V128:$Vn))),
(v2i32 (trunc (v2i64 V128:$Vm))))),
(UZP1v4i32 V128:$Vn, V128:$Vm)>;
-def : Pat<(v16i8 (concat_vectors
- (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))),
- (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vm), (i32 8)))))),
- (UZP2v16i8 V128:$Vn, V128:$Vm)>;
-def : Pat<(v8i16 (concat_vectors
- (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vn), (i32 16)))),
- (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vm), (i32 16)))))),
- (UZP2v8i16 V128:$Vn, V128:$Vm)>;
-def : Pat<(v4i32 (concat_vectors
- (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vn), (i32 32)))),
- (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))),
- (UZP2v4i32 V128:$Vn, V128:$Vm)>;
-
//----------------------------------------------------------------------------
// AdvSIMD TBL/TBX instructions
//----------------------------------------------------------------------------
@@ -5376,10 +5370,10 @@ def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
//----------------------------------------------------------------------------
-// AdvSIMD scalar CPY instruction
+// AdvSIMD scalar DUP instruction
//----------------------------------------------------------------------------
-defm CPY : SIMDScalarCPY<"mov">;
+defm DUP : SIMDScalarDUP<"mov">;
//----------------------------------------------------------------------------
// AdvSIMD scalar pairwise instructions
@@ -5790,7 +5784,7 @@ defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
// Floating point vector extractions are codegen'd as either a sequence of
-// subregister extractions, or a MOV (aka CPY here, alias for DUP) if
+// subregister extractions, or a MOV (aka DUP here) if
// the lane number is anything other than zero.
def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
(f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
@@ -5803,13 +5797,13 @@ def : Pat<(vector_extract (v8bf16 V128:$Rn), 0),
def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
- (f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>;
+ (f64 (DUPi64 V128:$Rn, VectorIndexD:$idx))>;
def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
- (f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>;
+ (f32 (DUPi32 V128:$Rn, VectorIndexS:$idx))>;
def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
- (f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;
+ (f16 (DUPi16 V128:$Rn, VectorIndexH:$idx))>;
def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx),
- (bf16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;
+ (bf16 (DUPi16 V128:$Rn, VectorIndexH:$idx))>;
// All concat_vectors operations are canonicalised to act on i64 vectors for
// AArch64. In the general case we need an instruction, which had just as well be
@@ -6407,9 +6401,9 @@ defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
int_aarch64_neon_sqsub>;
defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
- int_aarch64_neon_sqadd>;
+ int_aarch64_neon_sqrdmlah>;
defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
- int_aarch64_neon_sqsub>;
+ int_aarch64_neon_sqrdmlsh>;
defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
@@ -6425,6 +6419,22 @@ def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
VectorIndexS:$idx)),
(SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
+// Match an add node, and also treat an 'or' node as an 'add' if the or'ed operands
+// have no common bits.
+def add_and_or_is_add : PatFrags<(ops node:$lhs, node:$rhs),
+ [(add node:$lhs, node:$rhs), (or node:$lhs, node:$rhs)],[{
+ if (N->getOpcode() == ISD::ADD)
+ return true;
+ return CurDAG->haveNoCommonBitsSet(N->getOperand(0), N->getOperand(1));
+}]> {
+ let GISelPredicateCode = [{
+ // Only handle G_ADD for now. FIXME: Build the capability to compute whether
+ // operands of G_OR have common bits set or not.
+ return MI.getOpcode() == TargetOpcode::G_ADD;
+ }];
+}
+
+
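The PatFrags above relies on the identity that, for operands with no common set bits, `or` and `add` compute the same value, which is why the SSRA/USRA accumulate patterns further down can also match an `or`. A minimal standalone C++ sketch of that identity (illustrative values only, not part of the patch):

// Sketch: when two values share no set bits, OR and ADD agree, so an
// 'or' feeding a shift-accumulate pattern can be treated as an 'add'.
#include <cassert>
#include <cstdint>

static bool haveNoCommonBits(uint64_t A, uint64_t B) { return (A & B) == 0; }

int main() {
  uint64_t Acc = 0xAB00;     // assumed accumulator contribution
  uint64_t Sh  = 0x00CD;     // assumed shifted-in contribution
  assert(haveNoCommonBits(Acc, Sh));
  assert((Acc | Sh) == (Acc + Sh));  // both give 0xABCD
  return 0;
}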
//----------------------------------------------------------------------------
// AdvSIMD scalar shift instructions
//----------------------------------------------------------------------------
@@ -6530,7 +6540,7 @@ defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra",
(AArch64srshri node:$MHS, node:$RHS))>>;
defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>;
defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra",
- TriOpFrag<(add node:$LHS,
+ TriOpFrag<(add_and_or_is_add node:$LHS,
(AArch64vashr node:$MHS, node:$RHS))>>;
defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn",
int_aarch64_neon_uqrshrn>;
@@ -6543,7 +6553,7 @@ defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra",
(AArch64urshri node:$MHS, node:$RHS))>>;
defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>;
defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra",
- TriOpFrag<(add node:$LHS,
+ TriOpFrag<(add_and_or_is_add node:$LHS,
(AArch64vlshr node:$MHS, node:$RHS))>>;
//----------------------------------------------------------------------------
@@ -6585,7 +6595,7 @@ defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll",
defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
- TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
+ TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf",
int_aarch64_neon_vcvtfxu2fp>;
defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
@@ -6601,7 +6611,7 @@ defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll",
BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
- TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
+ TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
// RADDHN patterns for when RSHRN shifts by half the size of the vector element
def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))),
@@ -8106,7 +8116,7 @@ class NTStore128Pat<ValueType VT> :
Pat<(nontemporalstore (VT FPR128:$Rt),
(am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
(STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub),
- (CPYi64 FPR128:$Rt, (i64 1)),
+ (DUPi64 FPR128:$Rt, (i64 1)),
GPR64sp:$Rn, simm7s8:$offset)>;
def : NTStore128Pat<v2i64>;
@@ -8118,7 +8128,7 @@ class NTStore64Pat<ValueType VT> :
Pat<(nontemporalstore (VT FPR64:$Rt),
(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
(STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub),
- (CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
+ (DUPi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
GPR64sp:$Rn, simm7s4:$offset)>;
// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64?
@@ -8319,6 +8329,26 @@ let Predicates = [HasLS64] in {
def : ST64BPattern<int_aarch64_st64bv0, ST64BV0>;
}
+let Predicates = [HasMOPS] in {
+ defm CPYFP : MOPSMemoryCopyInsns<0b00, "cpyfp">;
+ defm CPYFM : MOPSMemoryCopyInsns<0b01, "cpyfm">;
+ defm CPYFE : MOPSMemoryCopyInsns<0b10, "cpyfe">;
+
+ defm CPYP : MOPSMemoryMoveInsns<0b00, "cpyp">;
+ defm CPYM : MOPSMemoryMoveInsns<0b01, "cpym">;
+ defm CPYE : MOPSMemoryMoveInsns<0b10, "cpye">;
+
+ defm SETP : MOPSMemorySetInsns<0b00, "setp">;
+ defm SETM : MOPSMemorySetInsns<0b01, "setm">;
+ defm SETE : MOPSMemorySetInsns<0b10, "sete">;
+}
+let Predicates = [HasMOPS, HasMTE] in {
+ defm SETGP : MOPSMemorySetTaggingInsns<0b00, "setgp">;
+ defm SETGM : MOPSMemorySetTaggingInsns<0b01, "setgm">;
+ // Can't use SETGE because it's a reserved name in TargetSelectionDAG.td
+ defm MOPSSETGE : MOPSMemorySetTaggingInsns<0b10, "setge">;
+}
+
let Defs = [X16, X17], mayStore = 1, isCodeGenOnly = 1 in
def StoreSwiftAsyncContext
: Pseudo<(outs), (ins GPR64:$ctx, GPR64sp:$base, simm9:$offset),
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 3a836ac33064..6aefc1fdb599 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1139,7 +1139,7 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
? getLdStOffsetOp(*StoreI).getImm()
: getLdStOffsetOp(*StoreI).getImm() * StoreSize;
int Width = LoadSize * 8;
- unsigned DestReg =
+ Register DestReg =
IsStoreXReg ? Register(TRI->getMatchingSuperReg(
LdRt, AArch64::sub_32, &AArch64::GPR64RegClass))
: LdRt;
diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.h b/llvm/lib/Target/AArch64/AArch64MCInstLower.h
index 8f3148a98410..b008e49d52dd 100644
--- a/llvm/lib/Target/AArch64/AArch64MCInstLower.h
+++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.h
@@ -14,15 +14,12 @@
namespace llvm {
class AsmPrinter;
-class MCAsmInfo;
class MCContext;
class MCInst;
class MCOperand;
class MCSymbol;
class MachineInstr;
-class MachineModuleInfoMachO;
class MachineOperand;
-class Mangler;
/// AArch64MCInstLower - This class is used to lower an MachineInstr
/// into an MCInst.
diff --git a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
index 42db18332f1c..1fc5617b49f6 100644
--- a/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MIPeepholeOpt.cpp
@@ -11,12 +11,19 @@
// 1. MOVi32imm + ANDWrr ==> ANDWri + ANDWri
// MOVi64imm + ANDXrr ==> ANDXri + ANDXri
//
+// 2. MOVi32imm + ADDWrr ==> ADDWri + ADDWri
+// MOVi64imm + ADDXrr ==> ADDXri + ADDXri
+//
+// 3. MOVi32imm + SUBWrr ==> SUBWri + SUBWri
+// MOVi64imm + SUBXrr ==> SUBXri + SUBXri
+//
// The mov pseudo instruction could be expanded to multiple mov instructions
// later. In this case, we could try to split the constant operand of mov
-// instruction into two bitmask immediates. It makes two AND instructions
-// intead of multiple `mov` + `and` instructions.
+// instruction into two immediates which can be directly encoded into
+// *Wri/*Xri instructions. It makes two AND/ADD/SUB instructions instead of
+// multiple `mov` + `and/add/sub` instructions.
//
-// 2. Remove redundant ORRWrs which is generated by zero-extend.
+// 4. Remove redundant ORRWrs which is generated by zero-extend.
//
// %3:gpr32 = ORRWrs $wzr, %2, 0
// %4:gpr64 = SUBREG_TO_REG 0, %3, %subreg.sub_32
@@ -30,6 +37,7 @@
#include "AArch64ExpandImm.h"
#include "AArch64InstrInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
@@ -48,11 +56,44 @@ struct AArch64MIPeepholeOpt : public MachineFunctionPass {
}
const AArch64InstrInfo *TII;
+ const AArch64RegisterInfo *TRI;
MachineLoopInfo *MLI;
MachineRegisterInfo *MRI;
template <typename T>
- bool visitAND(MachineInstr &MI,
+ using SplitAndOpcFunc =
+ std::function<Optional<unsigned>(T, unsigned, T &, T &)>;
+ using BuildMIFunc =
+ std::function<void(MachineInstr &, unsigned, unsigned, unsigned, Register,
+ Register, Register)>;
+
+ /// For instructions where an immediate operand could be split into two
+ /// separate immediate instructions, use splitTwoPartImm to handle the
+ /// optimization.
+ ///
+ /// To implement, the following function types must be passed to
+ /// splitTwoPartImm. A SplitAndOpcFunc must be implemented that determines if
+ /// splitting the immediate is valid and returns the associated new opcode. A
+ /// BuildMIFunc must be implemented to build the two immediate instructions.
+ ///
+ /// Example Pattern (where IMM would require 2+ MOV instructions):
+ /// %dst = <Instr>rr %src IMM [...]
+ /// becomes:
+ /// %tmp = <Instr>ri %src (encode half IMM) [...]
+ /// %dst = <Instr>ri %tmp (encode half IMM) [...]
+ template <typename T>
+ bool splitTwoPartImm(MachineInstr &MI,
+ SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
+ SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr);
+
+ bool checkMovImmInstr(MachineInstr &MI, MachineInstr *&MovMI,
+ MachineInstr *&SubregToRegMI);
+
+ template <typename T>
+ bool visitADDSUB(unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
+ SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
+ template <typename T>
+ bool visitAND(unsigned Opc, MachineInstr &MI,
SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
bool visitORR(MachineInstr &MI,
SmallSetVector<MachineInstr *, 8> &ToBeRemoved);
@@ -116,7 +157,8 @@ static bool splitBitmaskImm(T Imm, unsigned RegSize, T &Imm1Enc, T &Imm2Enc) {
template <typename T>
bool AArch64MIPeepholeOpt::visitAND(
- MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+ unsigned Opc, MachineInstr &MI,
+ SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
// Try below transformation.
//
// MOVi32imm + ANDWrr ==> ANDWri + ANDWri
@@ -127,23 +169,151 @@ bool AArch64MIPeepholeOpt::visitAND(
// bitmask immediates. It makes only two AND instructions intead of multiple
// mov + and instructions.
- unsigned RegSize = sizeof(T) * 8;
- assert((RegSize == 32 || RegSize == 64) &&
- "Invalid RegSize for AND bitmask peephole optimization");
+ return splitTwoPartImm<T>(
+ MI, ToBeRemoved,
+ [Opc](T Imm, unsigned RegSize, T &Imm0, T &Imm1) -> Optional<unsigned> {
+ if (splitBitmaskImm(Imm, RegSize, Imm0, Imm1))
+ return Opc;
+ return None;
+ },
+ [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
+ unsigned Imm1, Register SrcReg, Register NewTmpReg,
+ Register NewDstReg) {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+ .addReg(SrcReg)
+ .addImm(Imm0);
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
+ .addReg(NewTmpReg)
+ .addImm(Imm1);
+ });
+}
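For reference, the identity the AND split relies on can be checked in isolation. splitBitmaskImm additionally guarantees that both halves are encodable AArch64 logical immediates, which this standalone C++ sketch (not part of the patch; constants chosen only for illustration) does not attempt to model:

// Sketch: an AND with C is equivalent to two chained ANDs with C1 and C2
// whenever C == (C1 & C2); the peephole chooses C1/C2 so that each half is
// a legal logical immediate, turning MOV+AND into ANDri+ANDri.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t C1 = 0x0F0F0F0F0F0F0F0FULL;
  const uint64_t C2 = 0x0000FFFF0000FFFFULL;
  const uint64_t C  = C1 & C2;               // 0x00000F0F00000F0F
  const uint64_t X  = 0x123456789ABCDEF0ULL; // arbitrary input value
  assert((X & C) == ((X & C1) & C2));        // two ANDs reproduce one AND
  return 0;
}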
+
+bool AArch64MIPeepholeOpt::visitORR(
+ MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+ // Check this ORR comes from below zero-extend pattern.
+ //
+ // def : Pat<(i64 (zext GPR32:$src)),
+ // (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
+ if (MI.getOperand(3).getImm() != 0)
+ return false;
+
+ if (MI.getOperand(1).getReg() != AArch64::WZR)
+ return false;
+
+ MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
+ if (!SrcMI)
+ return false;
+
+ // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
+ //
+ // When you use the 32-bit form of an instruction, the upper 32 bits of the
+ // source registers are ignored and the upper 32 bits of the destination
+ // register are set to zero.
+ //
+ // If AArch64's 32-bit form of instruction defines the source operand of
+ // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
+ // real AArch64 instruction and if it is not, do not process the opcode
+ // conservatively.
+ if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
+ return false;
+
+ Register DefReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
+ MRI->replaceRegWith(DefReg, SrcReg);
+ MRI->clearKillFlags(SrcReg);
+ // replaceRegWith changes MI's definition register. Keep it for SSA form until
+ // deleting MI.
+ MI.getOperand(0).setReg(DefReg);
+ ToBeRemoved.insert(&MI);
+
+ LLVM_DEBUG(dbgs() << "Removed: " << MI << "\n");
+
+ return true;
+}
+
+template <typename T>
+static bool splitAddSubImm(T Imm, unsigned RegSize, T &Imm0, T &Imm1) {
+ // The immediate must be in the form of ((imm0 << 12) + imm1), in which both
+ // imm0 and imm1 are non-zero 12-bit unsigned integers.
+ if ((Imm & 0xfff000) == 0 || (Imm & 0xfff) == 0 ||
+ (Imm & ~static_cast<T>(0xffffff)) != 0)
+ return false;
+
+ // The split is only beneficial if the immediate cannot be materialized by a single instruction.
+ SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
+ AArch64_IMM::expandMOVImm(Imm, RegSize, Insn);
+ if (Insn.size() == 1)
+ return false;
+
+ // Split Imm into (Imm0 << 12) + Imm1;
+ Imm0 = (Imm >> 12) & 0xfff;
+ Imm1 = Imm & 0xfff;
+ return true;
+}
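A worked example of this split, as a standalone C++ sketch (not part of the patch; 0x123456 is just an illustrative immediate that takes MOVZ+MOVK to materialize and does not fit a single 12-bit ADD immediate):

// Sketch: an immediate of the form (imm0 << 12) + imm1, with both halves
// non-zero 12-bit values, can be applied with two ADD-immediate
// instructions, the first using the shifted-by-12 encoding.
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Imm = 0x123456;
  assert((Imm & 0xfff000) != 0 && (Imm & 0xfff) != 0 &&
         (Imm & ~uint64_t(0xffffff)) == 0);
  const uint64_t Imm0 = (Imm >> 12) & 0xfff; // 0x123, used with LSL #12
  const uint64_t Imm1 = Imm & 0xfff;         // 0x456
  const uint64_t Base = 1000;                // stands in for a register value
  assert((Base + (Imm0 << 12)) + Imm1 == Base + Imm);
  return 0;
}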
+
+template <typename T>
+bool AArch64MIPeepholeOpt::visitADDSUB(
+ unsigned PosOpc, unsigned NegOpc, MachineInstr &MI,
+ SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
+ // Try below transformation.
+ //
+ // MOVi32imm + ADDWrr ==> ADDWri + ADDWri
+ // MOVi64imm + ADDXrr ==> ADDXri + ADDXri
+ //
+ // MOVi32imm + SUBWrr ==> SUBWri + SUBWri
+ // MOVi64imm + SUBXrr ==> SUBXri + SUBXri
+ //
+ // The mov pseudo instruction could be expanded to multiple mov instructions
+ // later. Let's try to split the constant operand of the mov instruction into
+ // two legal add/sub immediates. It makes only two ADD/SUB instructions
+ // instead of multiple `mov` + `add/sub` instructions.
+
+ return splitTwoPartImm<T>(
+ MI, ToBeRemoved,
+ [PosOpc, NegOpc](T Imm, unsigned RegSize, T &Imm0,
+ T &Imm1) -> Optional<unsigned> {
+ if (splitAddSubImm(Imm, RegSize, Imm0, Imm1))
+ return PosOpc;
+ if (splitAddSubImm(-Imm, RegSize, Imm0, Imm1))
+ return NegOpc;
+ return None;
+ },
+ [&TII = TII](MachineInstr &MI, unsigned Opcode, unsigned Imm0,
+ unsigned Imm1, Register SrcReg, Register NewTmpReg,
+ Register NewDstReg) {
+ DebugLoc DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
+ .addReg(SrcReg)
+ .addImm(Imm0)
+ .addImm(12);
+ BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
+ .addReg(NewTmpReg)
+ .addImm(Imm1)
+ .addImm(0);
+ });
+}
- // Check whether AND's MBB is in loop and the AND is loop invariant.
+// Checks if the corresponding MOV immediate instruction is applicable for
+// this peephole optimization.
+bool AArch64MIPeepholeOpt::checkMovImmInstr(MachineInstr &MI,
+ MachineInstr *&MovMI,
+ MachineInstr *&SubregToRegMI) {
+ // Check whether the current MBB is inside a loop and the instruction is loop invariant.
MachineBasicBlock *MBB = MI.getParent();
MachineLoop *L = MLI->getLoopFor(MBB);
if (L && !L->isLoopInvariant(MI))
return false;
- // Check whether AND's operand is MOV with immediate.
- MachineInstr *MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
+ // Check whether the current MI's operand is a MOV with an immediate.
+ MovMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
if (!MovMI)
return false;
- MachineInstr *SubregToRegMI = nullptr;
// If it is SUBREG_TO_REG, check its operand.
+ SubregToRegMI = nullptr;
if (MovMI->getOpcode() == TargetOpcode::SUBREG_TO_REG) {
SubregToRegMI = MovMI;
MovMI = MRI->getUniqueVRegDef(MovMI->getOperand(2).getReg());
@@ -159,47 +329,63 @@ bool AArch64MIPeepholeOpt::visitAND(
// more instructions.
if (!MRI->hasOneUse(MovMI->getOperand(0).getReg()))
return false;
-
if (SubregToRegMI && !MRI->hasOneUse(SubregToRegMI->getOperand(0).getReg()))
return false;
- // Split the bitmask immediate into two.
- T UImm = static_cast<T>(MovMI->getOperand(1).getImm());
+ // It is OK to perform this peephole optimization.
+ return true;
+}
+
+template <typename T>
+bool AArch64MIPeepholeOpt::splitTwoPartImm(
+ MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved,
+ SplitAndOpcFunc<T> SplitAndOpc, BuildMIFunc BuildInstr) {
+ unsigned RegSize = sizeof(T) * 8;
+ assert((RegSize == 32 || RegSize == 64) &&
+ "Invalid RegSize for legal immediate peephole optimization");
+
+ // Perform several essential checks against current MI.
+ MachineInstr *MovMI, *SubregToRegMI;
+ if (!checkMovImmInstr(MI, MovMI, SubregToRegMI))
+ return false;
+
+ // Split the immediate into Imm0 and Imm1, and compute the Opcode.
+ T Imm = static_cast<T>(MovMI->getOperand(1).getImm()), Imm0, Imm1;
// For the 32 bit form of instruction, the upper 32 bits of the destination
// register are set to zero. If there is SUBREG_TO_REG, set the upper 32 bits
- // of UImm to zero.
+ // of Imm to zero. This is essential if the immediate value was negative,
+ // since it was sign-extended when assigned to the 64-bit Imm.
if (SubregToRegMI)
- UImm &= 0xFFFFFFFF;
- T Imm1Enc;
- T Imm2Enc;
- if (!splitBitmaskImm(UImm, RegSize, Imm1Enc, Imm2Enc))
+ Imm &= 0xFFFFFFFF;
+ unsigned Opcode;
+ if (auto R = SplitAndOpc(Imm, RegSize, Imm0, Imm1))
+ Opcode = R.getValue();
+ else
return false;
- // Create new AND MIs.
- DebugLoc DL = MI.getDebugLoc();
- const TargetRegisterClass *ANDImmRC =
- (RegSize == 32) ? &AArch64::GPR32spRegClass : &AArch64::GPR64spRegClass;
+ // Create the two new MIs.
+ MachineFunction *MF = MI.getMF();
+ const TargetRegisterClass *RC =
+ TII->getRegClass(TII->get(Opcode), 0, TRI, *MF);
+ const TargetRegisterClass *ORC =
+ TII->getRegClass(TII->get(Opcode), 1, TRI, *MF);
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
- Register NewTmpReg = MRI->createVirtualRegister(ANDImmRC);
- Register NewDstReg = MRI->createVirtualRegister(ANDImmRC);
- unsigned Opcode = (RegSize == 32) ? AArch64::ANDWri : AArch64::ANDXri;
-
- MRI->constrainRegClass(NewTmpReg, MRI->getRegClass(SrcReg));
- BuildMI(*MBB, MI, DL, TII->get(Opcode), NewTmpReg)
- .addReg(SrcReg)
- .addImm(Imm1Enc);
+ Register NewTmpReg = MRI->createVirtualRegister(RC);
+ Register NewDstReg = MRI->createVirtualRegister(RC);
+ MRI->constrainRegClass(SrcReg, RC);
+ MRI->constrainRegClass(NewTmpReg, ORC);
MRI->constrainRegClass(NewDstReg, MRI->getRegClass(DstReg));
- BuildMI(*MBB, MI, DL, TII->get(Opcode), NewDstReg)
- .addReg(NewTmpReg)
- .addImm(Imm2Enc);
+
+ BuildInstr(MI, Opcode, Imm0, Imm1, SrcReg, NewTmpReg, NewDstReg);
MRI->replaceRegWith(DstReg, NewDstReg);
// replaceRegWith changes MI's definition register. Keep it for SSA form until
// deleting MI.
MI.getOperand(0).setReg(DstReg);
+ // Record the MIs that need to be removed.
ToBeRemoved.insert(&MI);
if (SubregToRegMI)
ToBeRemoved.insert(SubregToRegMI);
@@ -208,59 +394,17 @@ bool AArch64MIPeepholeOpt::visitAND(
return true;
}
-bool AArch64MIPeepholeOpt::visitORR(
- MachineInstr &MI, SmallSetVector<MachineInstr *, 8> &ToBeRemoved) {
- // Check this ORR comes from below zero-extend pattern.
- //
- // def : Pat<(i64 (zext GPR32:$src)),
- // (SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;
- if (MI.getOperand(3).getImm() != 0)
- return false;
-
- if (MI.getOperand(1).getReg() != AArch64::WZR)
- return false;
-
- MachineInstr *SrcMI = MRI->getUniqueVRegDef(MI.getOperand(2).getReg());
- if (!SrcMI)
- return false;
-
- // From https://developer.arm.com/documentation/dui0801/b/BABBGCAC
- //
- // When you use the 32-bit form of an instruction, the upper 32 bits of the
- // source registers are ignored and the upper 32 bits of the destination
- // register are set to zero.
- //
- // If AArch64's 32-bit form of instruction defines the source operand of
- // zero-extend, we do not need the zero-extend. Let's check the MI's opcode is
- // real AArch64 instruction and if it is not, do not process the opcode
- // conservatively.
- if (SrcMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
- return false;
-
- Register DefReg = MI.getOperand(0).getReg();
- Register SrcReg = MI.getOperand(2).getReg();
- MRI->replaceRegWith(DefReg, SrcReg);
- MRI->clearKillFlags(SrcReg);
- // replaceRegWith changes MI's definition register. Keep it for SSA form until
- // deleting MI.
- MI.getOperand(0).setReg(DefReg);
- ToBeRemoved.insert(&MI);
-
- LLVM_DEBUG({ dbgs() << "Removed: " << MI << "\n"; });
-
- return true;
-}
-
bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+ TRI = static_cast<const AArch64RegisterInfo *>(
+ MF.getSubtarget().getRegisterInfo());
MLI = &getAnalysis<MachineLoopInfo>();
MRI = &MF.getRegInfo();
- if (!MRI->isSSA())
- return false;
+ assert(MRI->isSSA() && "Expected to be run on SSA form!");
bool Changed = false;
SmallSetVector<MachineInstr *, 8> ToBeRemoved;
@@ -271,13 +415,30 @@ bool AArch64MIPeepholeOpt::runOnMachineFunction(MachineFunction &MF) {
default:
break;
case AArch64::ANDWrr:
- Changed = visitAND<uint32_t>(MI, ToBeRemoved);
+ Changed = visitAND<uint32_t>(AArch64::ANDWri, MI, ToBeRemoved);
break;
case AArch64::ANDXrr:
- Changed = visitAND<uint64_t>(MI, ToBeRemoved);
+ Changed = visitAND<uint64_t>(AArch64::ANDXri, MI, ToBeRemoved);
break;
case AArch64::ORRWrs:
Changed = visitORR(MI, ToBeRemoved);
+ break;
+ case AArch64::ADDWrr:
+ Changed = visitADDSUB<uint32_t>(AArch64::ADDWri, AArch64::SUBWri, MI,
+ ToBeRemoved);
+ break;
+ case AArch64::SUBWrr:
+ Changed = visitADDSUB<uint32_t>(AArch64::SUBWri, AArch64::ADDWri, MI,
+ ToBeRemoved);
+ break;
+ case AArch64::ADDXrr:
+ Changed = visitADDSUB<uint64_t>(AArch64::ADDXri, AArch64::SUBXri, MI,
+ ToBeRemoved);
+ break;
+ case AArch64::SUBXrr:
+ Changed = visitADDSUB<uint64_t>(AArch64::SUBXri, AArch64::ADDXri, MI,
+ ToBeRemoved);
+ break;
}
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index 80d98d17e1d6..2ef7bc83003a 100644
--- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -633,7 +633,7 @@ bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
/// Return true when the instruction is processed successfully.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
- assert (DefiningMI != NULL);
+ assert(DefiningMI != nullptr);
if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index eb55a472a69a..73a680465f6f 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -180,20 +180,22 @@ def AArch64asr_p : SDNode<"AArch64ISD::SRA_PRED", SDT_AArch64Arith>;
def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>;
def AArch64fdiv_p : SDNode<"AArch64ISD::FDIV_PRED", SDT_AArch64Arith>;
def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>;
-def AArch64fmaxnm_p : SDNode<"AArch64ISD::FMAXNM_PRED", SDT_AArch64Arith>;
-def AArch64fminnm_p : SDNode<"AArch64ISD::FMINNM_PRED", SDT_AArch64Arith>;
def AArch64fmax_p : SDNode<"AArch64ISD::FMAX_PRED", SDT_AArch64Arith>;
+def AArch64fmaxnm_p : SDNode<"AArch64ISD::FMAXNM_PRED", SDT_AArch64Arith>;
def AArch64fmin_p : SDNode<"AArch64ISD::FMIN_PRED", SDT_AArch64Arith>;
+def AArch64fminnm_p : SDNode<"AArch64ISD::FMINNM_PRED", SDT_AArch64Arith>;
def AArch64fmul_p : SDNode<"AArch64ISD::FMUL_PRED", SDT_AArch64Arith>;
def AArch64fsub_p : SDNode<"AArch64ISD::FSUB_PRED", SDT_AArch64Arith>;
def AArch64lsl_p : SDNode<"AArch64ISD::SHL_PRED", SDT_AArch64Arith>;
def AArch64lsr_p : SDNode<"AArch64ISD::SRL_PRED", SDT_AArch64Arith>;
def AArch64mul_p : SDNode<"AArch64ISD::MUL_PRED", SDT_AArch64Arith>;
+def AArch64sabd_p : SDNode<"AArch64ISD::ABDS_PRED", SDT_AArch64Arith>;
def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
def AArch64smax_p : SDNode<"AArch64ISD::SMAX_PRED", SDT_AArch64Arith>;
def AArch64smin_p : SDNode<"AArch64ISD::SMIN_PRED", SDT_AArch64Arith>;
def AArch64smulh_p : SDNode<"AArch64ISD::MULHS_PRED", SDT_AArch64Arith>;
def AArch64sub_p : SDNode<"AArch64ISD::SUB_PRED", SDT_AArch64Arith>;
+def AArch64uabd_p : SDNode<"AArch64ISD::ABDU_PRED", SDT_AArch64Arith>;
def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
def AArch64umax_p : SDNode<"AArch64ISD::UMAX_PRED", SDT_AArch64Arith>;
def AArch64umin_p : SDNode<"AArch64ISD::UMIN_PRED", SDT_AArch64Arith>;
@@ -277,8 +279,11 @@ def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2),
return N->hasOneUse();
}]>;
+def AArch64fabd_p : PatFrag<(ops node:$pg, node:$op1, node:$op2),
+ (AArch64fabs_mt node:$pg, (AArch64fsub_p node:$pg, node:$op1, node:$op2), undef)>;
+
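The AArch64fabd_p fragment above matches a predicated fabs of an fsub, i.e. a per-lane floating-point absolute difference, which is what lets FABD_ZPZZ further down be selected for that combination. A tiny standalone C++ sketch of the scalar semantics (not part of the patch):

// Sketch: FABD computes |a - b| per lane, which is exactly the
// fabs(fsub(a, b)) shape the PatFrag matches.
#include <cassert>
#include <cmath>

int main() {
  double A = 1.5, B = 4.0;           // illustrative lane values
  assert(std::fabs(A - B) == 2.5);   // exact in binary floating point
  return 0;
}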
def AArch64fneg_mt_nsz : PatFrag<(ops node:$pred, node:$op, node:$pt),
- (AArch64fneg_mt node:$pred, node:$op, node:$pt), [{
+ (AArch64fneg_mt node:$pred, node:$op, node:$pt), [{
return N->getFlags().hasNoSignedZeros();
}]>;
@@ -415,6 +420,8 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm UMAX_ZPZZ : sve_int_bin_pred_bhsd<AArch64umax_p>;
defm SMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64smin_p>;
defm UMIN_ZPZZ : sve_int_bin_pred_bhsd<AArch64umin_p>;
+ defm SABD_ZPZZ : sve_int_bin_pred_bhsd<AArch64sabd_p>;
+ defm UABD_ZPZZ : sve_int_bin_pred_bhsd<AArch64uabd_p>;
defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", AArch64frecpe>;
defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", AArch64frsqrte>;
@@ -469,6 +476,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm FMINNM_ZPZZ : sve_fp_bin_pred_hfd<AArch64fminnm_p>;
defm FMAX_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmax_p>;
defm FMIN_ZPZZ : sve_fp_bin_pred_hfd<AArch64fmin_p>;
+ defm FABD_ZPZZ : sve_fp_bin_pred_hfd<AArch64fabd_p>;
defm FDIV_ZPZZ : sve_fp_bin_pred_hfd<AArch64fdiv_p>;
} // End HasSVEorStreamingSVE
@@ -642,11 +650,11 @@ let Predicates = [HasSVEorStreamingSVE] in {
(DUP_ZI_D $a, $b)>;
// Duplicate immediate FP into all vector elements.
- def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))),
+ def : Pat<(nxv2f32 (AArch64dup (f32 fpimm:$val))),
(DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
- def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))),
+ def : Pat<(nxv4f32 (AArch64dup (f32 fpimm:$val))),
(DUP_ZR_S (MOVi32imm (bitcast_fpimm_to_i32 f32:$val)))>;
- def : Pat<(nxv2f64 (AArch64dup (f64 fpimm:$val))),
+ def : Pat<(nxv2f64 (AArch64dup (f64 fpimm:$val))),
(DUP_ZR_D (MOVi64imm (bitcast_fpimm_to_i64 f64:$val)))>;
// Duplicate FP immediate into all vector elements
@@ -722,11 +730,11 @@ let Predicates = [HasSVEorStreamingSVE] in {
defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>;
def PTEST_PP : sve_int_ptest<0b010000, "ptest">;
- def PFALSE : sve_int_pfalse<0b000000, "pfalse">;
+ defm PFALSE : sve_int_pfalse<0b000000, "pfalse">;
defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>;
defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>;
- defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z, and>;
+ defm AND_PPzPP : sve_int_pred_log_and<0b0000, "and", int_aarch64_sve_and_z>;
defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>;
defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>;
defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>;
@@ -1419,6 +1427,16 @@ let Predicates = [HasSVEorStreamingSVE] in {
(INSR_ZV_D ZPR:$Z2, (INSERT_SUBREG (IMPLICIT_DEF),
(LASTB_VPZ_D (PTRUE_D 31), ZPR:$Z1), dsub))>;
+ // Splice with lane bigger or equal to 0
+ def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 (sve_ext_imm_0_255 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+ def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 (sve_ext_imm_0_127 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+ def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 (sve_ext_imm_0_63 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+ def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 (sve_ext_imm_0_31 i32:$index)))),
+ (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
+
defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;
@@ -2496,6 +2514,7 @@ let Predicates = [HasSVEorStreamingSVE] in {
// 16-element contiguous store
defm : st1<ST1B, ST1B_IMM, nxv16i8, AArch64st1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
+ // Insert scalar into undef[0]
def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)),
(INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)),
@@ -2691,17 +2710,6 @@ let Predicates = [HasSVEorStreamingSVE] in {
def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
(f64 (EXTRACT_SUBREG ZPR:$Zs, dsub))>;
}
-
- // Splice with lane bigger or equal to 0
- def : Pat<(nxv16i8 (vector_splice (nxv16i8 ZPR:$Z1), (nxv16i8 ZPR:$Z2), (i64 (sve_ext_imm_0_255 i32:$index)))),
- (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
- def : Pat<(nxv8i16 (vector_splice (nxv8i16 ZPR:$Z1), (nxv8i16 ZPR:$Z2), (i64 (sve_ext_imm_0_127 i32:$index)))),
- (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
- def : Pat<(nxv4i32 (vector_splice (nxv4i32 ZPR:$Z1), (nxv4i32 ZPR:$Z2), (i64 (sve_ext_imm_0_63 i32:$index)))),
- (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
- def : Pat<(nxv2i64 (vector_splice (nxv2i64 ZPR:$Z1), (nxv2i64 ZPR:$Z2), (i64 (sve_ext_imm_0_31 i32:$index)))),
- (EXT_ZZI ZPR:$Z1, ZPR:$Z2, imm0_255:$index)>;
-
} // End HasSVEorStreamingSVE
let Predicates = [HasSVE, HasMatMulInt8] in {
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA55.td b/llvm/lib/Target/AArch64/AArch64SchedA55.td
index 877c4d2ced41..009219ce3c54 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA55.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA55.td
@@ -235,10 +235,14 @@ def : ReadAdvance<ReadID, 1, [WriteImm,WriteI,
//---
// Miscellaneous
//---
-def : InstRW<[CortexA55WriteVLD1SI,CortexA55WriteLDP1], (instregex "LDPS?W")>;
-def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP1], (instregex "LDPS[^W]")>;
-def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP2], (instregex "LDP(X|D)")>;
-def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP4], (instregex "LDPQ")>;
+def : InstRW<[CortexA55WriteVLD1SI,CortexA55WriteLDP1], (instregex "LDPS?Wi")>;
+def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP1], (instregex "LDPSi")>;
+def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP2], (instregex "LDP(X|D)i")>;
+def : InstRW<[CortexA55WriteVLD1,CortexA55WriteLDP4], (instregex "LDPQi")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD1SI,CortexA55WriteLDP1], (instregex "LDPS?W(pre|post)")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD1,CortexA55WriteLDP1], (instregex "LDPS(pre|post)")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD1,CortexA55WriteLDP2], (instregex "LDP(X|D)(pre|post)")>;
+def : InstRW<[WriteAdr, CortexA55WriteVLD1,CortexA55WriteLDP4], (instregex "LDPQ(pre|post)")>;
def : InstRW<[WriteI], (instrs COPY)>;
//---
// Vector Loads - 64-bit per cycle
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td
index 168a762241ca..a860aa907fd1 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -526,7 +526,7 @@ def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL|BSP)v16i8")>;
// ASIMD duplicate, gen reg, D-form and Q-form
-def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>;
+def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUP(i8|i16|i32|i64)$")>;
def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^DUPv.+gpr")>;
// ASIMD move, saturating
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
index 1d25a6c00f95..fa10d056b7f7 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA64FX.td
@@ -1891,7 +1891,7 @@ def : InstRW<[A64FXWrite_4Cyc_GI0],
// ASIMD duplicate, gen reg
// ASIMD duplicate, element
def : InstRW<[A64FXWrite_DUPGENERAL], (instregex "^DUPv")>;
-def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^CPY")>;
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^DUP(i8|i16|i32|i64)$")>;
def : InstRW<[A64FXWrite_6Cyc_GI0], (instregex "^DUPv.+gpr")>;
// ASIMD extract
@@ -2512,16 +2512,16 @@ def : InstRW<[A64FXWrite_1Cyc_GI24], (instrs CNTW_XPiI)>;
def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs COMPACT_ZPZ_D, COMPACT_ZPZ_S)>;
// [72] "cpy $Zd, $Pg/m, $Rn";
-//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmR_B, CPY_ZPmR_D, CPY_ZPmR_H, CPY_ZPmR_S)>;
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CPY_ZPmR_B, CPY_ZPmR_D, CPY_ZPmR_H, CPY_ZPmR_S)>;
// [73] "cpy $Zd, $Pg/m, $Vn";
-//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmV_B, CPY_ZPmV_D, CPY_ZPmV_H, CPY_ZPmV_S)>;
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CPY_ZPmV_B, CPY_ZPmV_D, CPY_ZPmV_H, CPY_ZPmV_S)>;
// [74] "cpy $Zd, $Pg/m, $imm";
-//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPmI_B, CPY_ZPmI_D, CPY_ZPmI_H, CPY_ZPmI_S)>;
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CPY_ZPmI_B, CPY_ZPmI_D, CPY_ZPmI_H, CPY_ZPmI_S)>;
// [75] "cpy $Zd, $Pg/z, $imm";
-//@@@ def : InstRW<[XXXXXX], (instrs CPY_ZPzI_B, CPY_ZPzI_D, CPY_ZPzI_H, CPY_ZPzI_S)>;
+def : InstRW<[A64FXWrite_6Cyc_GI0], (instrs CPY_ZPzI_B, CPY_ZPzI_D, CPY_ZPzI_H, CPY_ZPzI_S)>;
// [76] "ctermeq $Rn, $Rm";
def : InstRW<[A64FXWrite_2Cyc_GI24], (instrs CTERMEQ_WW, CTERMEQ_XX)>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
index 14df8236504b..d66efb82fccc 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -669,7 +669,7 @@ def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>;
def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>;
def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>;
def : InstRW<[M3WriteNSHF1], (instregex "^[SU]?Q?XTU?Nv")>;
-def : InstRW<[M3WriteNSHF1], (instregex "^CPY")>;
+def : InstRW<[M3WriteNSHF1], (instregex "^DUP(i8|i16|i32|i64)$")>;
def : InstRW<[M3WriteNSHF1], (instregex "^INSv.+lane")>;
def : InstRW<[M3WriteMOVI], (instregex "^MOVI")>;
def : InstRW<[M3WriteNALU1], (instregex "^FMOVv")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
index 8f740a9a0d35..94e70793e855 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
@@ -810,7 +810,7 @@ def : InstRW<[M4WriteNALU1], (instregex "^RBITv")>;
def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>;
def : InstRW<[M4WriteNALU1], (instregex "^CL[STZ]v")>;
def : InstRW<[M4WriteNEONB], (instregex "^DUPv.+gpr")>;
-def : InstRW<[M4WriteNSHF1], (instregex "^CPY")>;
+def : InstRW<[M4WriteNSHF1], (instregex "^DUP(i8|i16|i32|i64)$")>;
def : InstRW<[M4WriteNSHF1], (instregex "^DUPv.+lane")>;
def : InstRW<[M4WriteNSHF1], (instregex "^EXTv")>;
def : InstRW<[M4WriteNSHT4A], (instregex "^XTNv")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
index 93e1b66bea03..1db5f5322a64 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
@@ -848,7 +848,7 @@ def : InstRW<[M5WriteNALU2], (instregex "^RBITv")>;
def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL|BSP)v")>;
def : InstRW<[M5WriteNALU2], (instregex "^CL[STZ]v")>;
def : InstRW<[M5WriteNEONB], (instregex "^DUPv.+gpr")>;
-def : InstRW<[M5WriteNSHF2], (instregex "^CPY")>;
+def : InstRW<[M5WriteNSHF2], (instregex "^DUP(i8|i16|i32|i64)$")>;
def : InstRW<[M5WriteNSHF2], (instregex "^DUPv.+lane")>;
def : InstRW<[M5WriteNSHF2], (instregex "^EXTv")>;
def : InstRW<[M5WriteNSHT4A], (instregex "^XTNv")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index f2cd83caffa2..a3a038f869fb 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -908,7 +908,7 @@ def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^SUB(S)?(W|X)r(s|x|x64)$")>;
// -----------------------------------------------------------------------------
def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>;
def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(i8|i16|i32|i64)$")>;
def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>;
def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v8i8$")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index e4cae97b5524..ffa0a5e7d91a 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -1499,7 +1499,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01],
// ASIMD duplicate, gen reg
// ASIMD duplicate, element
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv")>;
-def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CPY")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUP(i8|i16|i32|i64)$")>;
def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv.+gpr")>;
// ASIMD extract
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
index 08be2b3a55b3..46a1c217f984 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
@@ -1608,7 +1608,7 @@ def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123],
// ASIMD duplicate, gen reg
// ASIMD duplicate, element
def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv")>;
-def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^CPY")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUP(i8|i16|i32|i64)$")>;
def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv.+gpr")>;
// ASIMD extract
diff --git a/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp b/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp
index 7307961ddb5f..87be7bb6d113 100644
--- a/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SpeculationHardening.cpp
@@ -304,7 +304,7 @@ bool AArch64SpeculationHardening::instrumentControlFlow(
// sure if that would actually result in a big performance difference
// though. Maybe RegisterScavenger::findSurvivorBackwards has some logic
// already to do this - but it's unclear if that could easily be used here.
- unsigned TmpReg = RS.FindUnusedReg(&AArch64::GPR64commonRegClass);
+ Register TmpReg = RS.FindUnusedReg(&AArch64::GPR64commonRegClass);
LLVM_DEBUG(dbgs() << "RS finds "
<< ((TmpReg == 0) ? "no register " : "register ");
if (TmpReg != 0) dbgs() << printReg(TmpReg, TRI) << " ";
diff --git a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
index d2488f61eb4b..cae6d65bed2d 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
@@ -195,7 +195,7 @@ void AArch64StackTaggingPreRA::uncheckUsesOf(unsigned TaggedReg, int FI) {
void AArch64StackTaggingPreRA::uncheckLoadsAndStores() {
for (auto *I : ReTags) {
- unsigned TaggedReg = I->getOperand(0).getReg();
+ Register TaggedReg = I->getOperand(0).getReg();
int FI = I->getOperand(1).getIndex();
uncheckUsesOf(TaggedReg, FI);
}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index f7d3dd0bc222..a4f4b8582182 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/AArch64TargetParser.h"
#include "llvm/Support/TargetParser.h"
using namespace llvm;
@@ -157,13 +158,19 @@ void AArch64Subtarget::initializeProperties() {
break;
case NeoverseN1:
PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 5;
+ MaxBytesForLoopAlignment = 16;
break;
case NeoverseN2:
PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 5;
+ MaxBytesForLoopAlignment = 16;
VScaleForTuning = 1;
break;
case NeoverseV1:
PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 5;
+ MaxBytesForLoopAlignment = 16;
VScaleForTuning = 2;
break;
case Neoverse512TVB:
@@ -228,8 +235,7 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
IsLittle(LittleEndian),
MinSVEVectorSizeInBits(MinSVEVectorSizeInBitsOverride),
MaxSVEVectorSizeInBits(MaxSVEVectorSizeInBitsOverride), TargetTriple(TT),
- FrameLowering(),
- InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)), TSInfo(),
+ InstrInfo(initializeSubtargetDependencies(FS, CPU, TuneCPU)),
TLInfo(TM, *this) {
if (AArch64::isX18ReservedByDefault(TT))
ReserveXRegister.set(18);
@@ -367,9 +373,4 @@ void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
MFI.computeMaxCallFrameSize(MF);
}
-bool AArch64Subtarget::useSVEForFixedLengthVectors() const {
- // Prefer NEON unless larger SVE registers are available.
- return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
-}
-
bool AArch64Subtarget::useAA() const { return UseAA; }
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index b3cd5ebd5f65..3e3c0f6aba15 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -94,9 +94,11 @@ protected:
bool HasV8_5aOps = false;
bool HasV8_6aOps = false;
bool HasV8_7aOps = false;
+ bool HasV8_8aOps = false;
bool HasV9_0aOps = false;
bool HasV9_1aOps = false;
bool HasV9_2aOps = false;
+ bool HasV9_3aOps = false;
bool HasV8_0rOps = false;
bool HasCONTEXTIDREL2 = false;
@@ -188,6 +190,10 @@ protected:
bool HasHCX = false;
bool HasLS64 = false;
+ // Armv8.8-A Extensions
+ bool HasHBC = false;
+ bool HasMOPS = false;
+
// Arm SVE2 extensions
bool HasSVE2 = false;
bool HasSVE2AES = false;
@@ -274,6 +280,7 @@ protected:
unsigned MaxPrefetchIterationsAhead = UINT_MAX;
unsigned PrefFunctionLogAlignment = 0;
unsigned PrefLoopLogAlignment = 0;
+ unsigned MaxBytesForLoopAlignment = 0;
unsigned MaxJumpTableSize = 0;
unsigned WideningBaseCost = 0;
@@ -365,6 +372,7 @@ public:
bool hasV9_0aOps() const { return HasV9_0aOps; }
bool hasV9_1aOps() const { return HasV9_1aOps; }
bool hasV9_2aOps() const { return HasV9_2aOps; }
+ bool hasV9_3aOps() const { return HasV9_3aOps; }
bool hasV8_0rOps() const { return HasV8_0rOps; }
bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
@@ -464,6 +472,10 @@ public:
}
unsigned getPrefLoopLogAlignment() const { return PrefLoopLogAlignment; }
+ unsigned getMaxBytesForLoopAlignment() const {
+ return MaxBytesForLoopAlignment;
+ }
+
unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; }
unsigned getWideningBaseCost() const { return WideningBaseCost; }
@@ -572,6 +584,8 @@ public:
bool hasRCPC_IMMO() const { return HasRCPC_IMMO; }
bool hasEL2VMSA() const { return HasEL2VMSA; }
bool hasEL3() const { return HasEL3; }
+ bool hasHBC() const { return HasHBC; }
+ bool hasMOPS() const { return HasMOPS; }
bool fixCortexA53_835769() const { return FixCortexA53_835769; }
@@ -666,7 +680,10 @@ public:
return MinSVEVectorSizeInBits;
}
- bool useSVEForFixedLengthVectors() const;
+ bool useSVEForFixedLengthVectors() const {
+ // Prefer NEON unless larger SVE registers are available.
+ return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
+ }
unsigned getVScaleForTuning() const { return VScaleForTuning; }
};
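Note on the two knobs added above: PrefLoopLogAlignment requests a power-of-two loop alignment (2^5 = 32 bytes for these Neoverse cores), while MaxBytesForLoopAlignment caps how much padding may actually be emitted to reach it (16 bytes here). A minimal sketch of how a consumer might wire both into block layout, assuming the MachineBasicBlock alignment API of this LLVM generation; the helper name is hypothetical and the code is illustrative only, not part of the patch:

#include "AArch64Subtarget.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Support/Alignment.h"

// Hypothetical helper: request 2^PrefLoopLogAlignment-byte alignment for a
// loop header, but never pad by more than MaxBytesForLoopAlignment bytes.
static void applyLoopAlignmentTuning(llvm::MachineBasicBlock &Header,
                                     const llvm::AArch64Subtarget &ST) {
  Header.setAlignment(llvm::Align(1ULL << ST.getPrefLoopLogAlignment()));
  Header.setMaxBytesForAlignment(ST.getMaxBytesForLoopAlignment());
}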
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index f9fe804865a5..cce5813fe6e9 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -1333,7 +1333,7 @@ def : RWSysReg<"PRBAR_EL2", 0b11, 0b100, 0b0110, 0b1000, 0b000>;
def : RWSysReg<"PRLAR_EL1", 0b11, 0b000, 0b0110, 0b1000, 0b001>;
def : RWSysReg<"PRLAR_EL2", 0b11, 0b100, 0b0110, 0b1000, 0b001>;
-foreach n = 0-15 in {
+foreach n = 1-15 in {
foreach x = 1-2 in {
// Direct access to Protection Region Base Address Register for the n-th MPU region
def : RWSysReg<!strconcat("PRBAR"#n, "_EL"#x),
@@ -1348,7 +1348,7 @@ foreach x = 1-2 in {
let Encoding{13} = !add(x,-1);
}
} //foreach x = 1-2 in
-} //foreach n = 0-15 in
+} //foreach n = 1-15 in
} //let Requires = [{ {AArch64::HasV8_0rOps} }] in
// v8.1a "Privileged Access Never" extension-specific system registers
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index 25e626134317..7d314bce99b1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -20,8 +20,6 @@
namespace llvm {
-class AArch64RegisterBankInfo;
-
class AArch64TargetMachine : public LLVMTargetMachine {
protected:
std::unique_ptr<TargetLoweringObjectFile> TLOF;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index dfc66f0cb4c1..7ed934cfabc0 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -25,8 +25,7 @@ void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
SupportDebugThreadLocalLocation = false;
}
-AArch64_MachoTargetObjectFile::AArch64_MachoTargetObjectFile()
- : TargetLoweringObjectFileMachO() {
+AArch64_MachoTargetObjectFile::AArch64_MachoTargetObjectFile() {
SupportGOTPCRelWithOffset = false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
index 28324c2ae608..9f098230bbd7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -13,7 +13,6 @@
#include "llvm/Target/TargetLoweringObjectFile.h"
namespace llvm {
-class AArch64TargetMachine;
/// This implementation is used for AArch64 ELF targets (Linux in particular).
class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d21854e38f5a..a4d666a0a3c2 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -331,6 +331,45 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
}
break;
}
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::umul_with_overflow: {
+ static const CostTblEntry WithOverflowCostTbl[] = {
+ {Intrinsic::sadd_with_overflow, MVT::i8, 3},
+ {Intrinsic::uadd_with_overflow, MVT::i8, 3},
+ {Intrinsic::sadd_with_overflow, MVT::i16, 3},
+ {Intrinsic::uadd_with_overflow, MVT::i16, 3},
+ {Intrinsic::sadd_with_overflow, MVT::i32, 1},
+ {Intrinsic::uadd_with_overflow, MVT::i32, 1},
+ {Intrinsic::sadd_with_overflow, MVT::i64, 1},
+ {Intrinsic::uadd_with_overflow, MVT::i64, 1},
+ {Intrinsic::ssub_with_overflow, MVT::i8, 3},
+ {Intrinsic::usub_with_overflow, MVT::i8, 3},
+ {Intrinsic::ssub_with_overflow, MVT::i16, 3},
+ {Intrinsic::usub_with_overflow, MVT::i16, 3},
+ {Intrinsic::ssub_with_overflow, MVT::i32, 1},
+ {Intrinsic::usub_with_overflow, MVT::i32, 1},
+ {Intrinsic::ssub_with_overflow, MVT::i64, 1},
+ {Intrinsic::usub_with_overflow, MVT::i64, 1},
+ {Intrinsic::smul_with_overflow, MVT::i8, 5},
+ {Intrinsic::umul_with_overflow, MVT::i8, 4},
+ {Intrinsic::smul_with_overflow, MVT::i16, 5},
+ {Intrinsic::umul_with_overflow, MVT::i16, 4},
+ {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
+ {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
+ {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
+ {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
+ };
+ EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
+ if (MTy.isSimple())
+ if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
+ MTy.getSimpleVT()))
+ return Entry->Cost;
+ break;
+ }
default:
break;
}
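As a rough illustration of how the new table is consulted, the sketch below asks the generic cost-model interface about a 64-bit umul.with.overflow; with the entries above the AArch64 implementation should answer 3 (mul; umulh; cmp). The free function and its setup are hypothetical, not part of the patch:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"

// Hypothetical query for @llvm.umul.with.overflow.i64, whose return type
// {i64, i1} is what RetTy->getContainedType(0) inspects above.
llvm::InstructionCost
costOfUMulOverflowI64(const llvm::TargetTransformInfo &TTI,
                      llvm::LLVMContext &Ctx) {
  llvm::Type *I64 = llvm::Type::getInt64Ty(Ctx);
  llvm::StructType *RetTy =
      llvm::StructType::get(I64, llvm::Type::getInt1Ty(Ctx));
  llvm::IntrinsicCostAttributes Attrs(llvm::Intrinsic::umul_with_overflow,
                                      RetTy, {I64, I64});
  return TTI.getIntrinsicInstrCost(
      Attrs, llvm::TargetTransformInfo::TCK_RecipThroughput);
}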
@@ -377,12 +416,76 @@ static Optional<Instruction *> processPhiNode(InstCombiner &IC,
return IC.replaceInstUsesWith(II, NPN);
}
+// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
+// => (binop (pred) (from_svbool _) (from_svbool _))
+//
+// The above transformation eliminates a `to_svbool` in the predicate
+// operand of the bitwise operation `binop` by narrowing the vector width of
+// the operation. For example, it would convert a `<vscale x 16 x i1>
+// and` into a `<vscale x 4 x i1> and`. This is profitable because
+// to_svbool must zero the new lanes during widening, whereas
+// from_svbool is free.
+static Optional<Instruction *> tryCombineFromSVBoolBinOp(InstCombiner &IC,
+ IntrinsicInst &II) {
+ auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
+ if (!BinOp)
+ return None;
+
+ auto IntrinsicID = BinOp->getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::aarch64_sve_and_z:
+ case Intrinsic::aarch64_sve_bic_z:
+ case Intrinsic::aarch64_sve_eor_z:
+ case Intrinsic::aarch64_sve_nand_z:
+ case Intrinsic::aarch64_sve_nor_z:
+ case Intrinsic::aarch64_sve_orn_z:
+ case Intrinsic::aarch64_sve_orr_z:
+ break;
+ default:
+ return None;
+ }
+
+ auto BinOpPred = BinOp->getOperand(0);
+ auto BinOpOp1 = BinOp->getOperand(1);
+ auto BinOpOp2 = BinOp->getOperand(2);
+
+ auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
+ if (!PredIntr ||
+ PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
+ return None;
+
+ auto PredOp = PredIntr->getOperand(0);
+ auto PredOpTy = cast<VectorType>(PredOp->getType());
+ if (PredOpTy != II.getType())
+ return None;
+
+ IRBuilder<> Builder(II.getContext());
+ Builder.SetInsertPoint(&II);
+
+ SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
+ auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
+ Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
+ NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
+ if (BinOpOp1 == BinOpOp2)
+ NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
+ else
+ NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
+ Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
+
+ auto NarrowedBinOp =
+ Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
+ return IC.replaceInstUsesWith(II, NarrowedBinOp);
+}
+
static Optional<Instruction *> instCombineConvertFromSVBool(InstCombiner &IC,
IntrinsicInst &II) {
// If the reinterpret instruction operand is a PHI Node
if (isa<PHINode>(II.getArgOperand(0)))
return processPhiNode(IC, II);
+ if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
+ return BinOpCombine;
+
SmallVector<Instruction *, 32> CandidatesForRemoval;
Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
@@ -1129,6 +1232,32 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
return None;
}
+Optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
+ APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ SimplifyAndSetOp) const {
+ switch (II.getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::aarch64_neon_fcvtxn:
+ case Intrinsic::aarch64_neon_rshrn:
+ case Intrinsic::aarch64_neon_sqrshrn:
+ case Intrinsic::aarch64_neon_sqrshrun:
+ case Intrinsic::aarch64_neon_sqshrn:
+ case Intrinsic::aarch64_neon_sqshrun:
+ case Intrinsic::aarch64_neon_sqxtn:
+ case Intrinsic::aarch64_neon_sqxtun:
+ case Intrinsic::aarch64_neon_uqrshrn:
+ case Intrinsic::aarch64_neon_uqshrn:
+ case Intrinsic::aarch64_neon_uqxtn:
+ SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
+ break;
+ }
+
+ return None;
+}
+
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
ArrayRef<const Value *> Args) {
@@ -1461,6 +1590,15 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
{ ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
+ // Bitcasts from float to integer
+ { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
+ { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
+ { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
+
+ // Bitcasts from integer to float
+ { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
+ { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
+ { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
};
if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
@@ -1555,9 +1693,12 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
if (!LT.second.isVector())
return 0;
- // The type may be split. Normalize the index to the new type.
- unsigned Width = LT.second.getVectorNumElements();
- Index = Index % Width;
+ // The type may be split. For fixed-width vectors we can normalize the
+ // index to the new type.
+ if (LT.second.isFixedLengthVector()) {
+ unsigned Width = LT.second.getVectorNumElements();
+ Index = Index % Width;
+ }
// The element at index zero is already inside the vector.
if (Index == 0)
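To make the fixed-versus-scalable distinction above concrete: an extract from a v8i32, which legalizes on AArch64 to two v4i32 registers, turns index 5 into the in-register index 5 % 4 == 1, whereas for a scalable type the legalized element count is not known at compile time, so the index has to be left untouched. A tiny stand-alone sketch of that rule, with a hypothetical helper name; illustrative only, not part of the patch:

#include "llvm/CodeGen/ValueTypes.h"

// Hypothetical: only normalize an element index when the legalized type has
// a compile-time-known element count.
unsigned normalizeSplitIndex(unsigned Index, llvm::EVT LegalVT) {
  if (LegalVT.isFixedLengthVector())
    return Index % LegalVT.getVectorNumElements(); // e.g. 5 % 4 == 1
  return Index; // scalable vector: keep the original index
}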
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index c3e1735cd4cd..a6029b9f2445 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -106,6 +106,12 @@ public:
Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const;
+ Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
+ InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
+ APInt &UndefElts2, APInt &UndefElts3,
+ std::function<void(Instruction *, unsigned, APInt, APInt &)>
+ SimplifyAndSetOp) const;
+
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
switch (K) {
case TargetTransformInfo::RGK_Scalar:
@@ -307,6 +313,10 @@ public:
return 2;
}
+ bool emitGetActiveLaneMask() const {
+ return ST->hasSVE();
+ }
+
bool supportsScalableVectors() const { return ST->hasSVE(); }
bool enableScalableVectorization() const { return ST->hasSVE(); }
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 62038b10fccd..33ed7ae9780e 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -48,6 +48,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/AArch64TargetParser.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
@@ -3284,6 +3285,8 @@ static const struct Extension {
{"sme", {AArch64::FeatureSME}},
{"sme-f64", {AArch64::FeatureSMEF64}},
{"sme-i64", {AArch64::FeatureSMEI64}},
+ {"hbc", {AArch64::FeatureHBC}},
+ {"mops", {AArch64::FeatureMOPS}},
// FIXME: Unsupported extensions
{"lor", {}},
{"rdma", {}},
@@ -3307,12 +3310,16 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
Str += "ARMv8.6a";
else if (FBS[AArch64::HasV8_7aOps])
Str += "ARMv8.7a";
+ else if (FBS[AArch64::HasV8_8aOps])
+ Str += "ARMv8.8a";
else if (FBS[AArch64::HasV9_0aOps])
Str += "ARMv9-a";
else if (FBS[AArch64::HasV9_1aOps])
Str += "ARMv9.1a";
else if (FBS[AArch64::HasV9_2aOps])
Str += "ARMv9.2a";
+ else if (FBS[AArch64::HasV9_3aOps])
+ Str += "ARMv9.3a";
else if (FBS[AArch64::HasV8_0rOps])
Str += "ARMv8r";
else {
@@ -4531,7 +4538,7 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
Mnemonic = Head;
// Handle condition codes for a branch mnemonic
- if (Head == "b" && Next != StringRef::npos) {
+ if ((Head == "b" || Head == "bc") && Next != StringRef::npos) {
Start = Next;
Next = Name.find('.', Start + 1);
Head = Name.slice(Start + 1, Next);
@@ -4862,6 +4869,177 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc,
}
}
+ // Check v8.8-A memops instructions.
+ switch (Inst.getOpcode()) {
+ case AArch64::CPYFP:
+ case AArch64::CPYFPWN:
+ case AArch64::CPYFPRN:
+ case AArch64::CPYFPN:
+ case AArch64::CPYFPWT:
+ case AArch64::CPYFPWTWN:
+ case AArch64::CPYFPWTRN:
+ case AArch64::CPYFPWTN:
+ case AArch64::CPYFPRT:
+ case AArch64::CPYFPRTWN:
+ case AArch64::CPYFPRTRN:
+ case AArch64::CPYFPRTN:
+ case AArch64::CPYFPT:
+ case AArch64::CPYFPTWN:
+ case AArch64::CPYFPTRN:
+ case AArch64::CPYFPTN:
+ case AArch64::CPYFM:
+ case AArch64::CPYFMWN:
+ case AArch64::CPYFMRN:
+ case AArch64::CPYFMN:
+ case AArch64::CPYFMWT:
+ case AArch64::CPYFMWTWN:
+ case AArch64::CPYFMWTRN:
+ case AArch64::CPYFMWTN:
+ case AArch64::CPYFMRT:
+ case AArch64::CPYFMRTWN:
+ case AArch64::CPYFMRTRN:
+ case AArch64::CPYFMRTN:
+ case AArch64::CPYFMT:
+ case AArch64::CPYFMTWN:
+ case AArch64::CPYFMTRN:
+ case AArch64::CPYFMTN:
+ case AArch64::CPYFE:
+ case AArch64::CPYFEWN:
+ case AArch64::CPYFERN:
+ case AArch64::CPYFEN:
+ case AArch64::CPYFEWT:
+ case AArch64::CPYFEWTWN:
+ case AArch64::CPYFEWTRN:
+ case AArch64::CPYFEWTN:
+ case AArch64::CPYFERT:
+ case AArch64::CPYFERTWN:
+ case AArch64::CPYFERTRN:
+ case AArch64::CPYFERTN:
+ case AArch64::CPYFET:
+ case AArch64::CPYFETWN:
+ case AArch64::CPYFETRN:
+ case AArch64::CPYFETN:
+ case AArch64::CPYP:
+ case AArch64::CPYPWN:
+ case AArch64::CPYPRN:
+ case AArch64::CPYPN:
+ case AArch64::CPYPWT:
+ case AArch64::CPYPWTWN:
+ case AArch64::CPYPWTRN:
+ case AArch64::CPYPWTN:
+ case AArch64::CPYPRT:
+ case AArch64::CPYPRTWN:
+ case AArch64::CPYPRTRN:
+ case AArch64::CPYPRTN:
+ case AArch64::CPYPT:
+ case AArch64::CPYPTWN:
+ case AArch64::CPYPTRN:
+ case AArch64::CPYPTN:
+ case AArch64::CPYM:
+ case AArch64::CPYMWN:
+ case AArch64::CPYMRN:
+ case AArch64::CPYMN:
+ case AArch64::CPYMWT:
+ case AArch64::CPYMWTWN:
+ case AArch64::CPYMWTRN:
+ case AArch64::CPYMWTN:
+ case AArch64::CPYMRT:
+ case AArch64::CPYMRTWN:
+ case AArch64::CPYMRTRN:
+ case AArch64::CPYMRTN:
+ case AArch64::CPYMT:
+ case AArch64::CPYMTWN:
+ case AArch64::CPYMTRN:
+ case AArch64::CPYMTN:
+ case AArch64::CPYE:
+ case AArch64::CPYEWN:
+ case AArch64::CPYERN:
+ case AArch64::CPYEN:
+ case AArch64::CPYEWT:
+ case AArch64::CPYEWTWN:
+ case AArch64::CPYEWTRN:
+ case AArch64::CPYEWTN:
+ case AArch64::CPYERT:
+ case AArch64::CPYERTWN:
+ case AArch64::CPYERTRN:
+ case AArch64::CPYERTN:
+ case AArch64::CPYET:
+ case AArch64::CPYETWN:
+ case AArch64::CPYETRN:
+ case AArch64::CPYETN: {
+ unsigned Xd_wb = Inst.getOperand(0).getReg();
+ unsigned Xs_wb = Inst.getOperand(1).getReg();
+ unsigned Xn_wb = Inst.getOperand(2).getReg();
+ unsigned Xd = Inst.getOperand(3).getReg();
+ unsigned Xs = Inst.getOperand(4).getReg();
+ unsigned Xn = Inst.getOperand(5).getReg();
+ if (Xd_wb != Xd)
+ return Error(Loc[0],
+ "invalid CPY instruction, Xd_wb and Xd do not match");
+ if (Xs_wb != Xs)
+ return Error(Loc[0],
+ "invalid CPY instruction, Xs_wb and Xs do not match");
+ if (Xn_wb != Xn)
+ return Error(Loc[0],
+ "invalid CPY instruction, Xn_wb and Xn do not match");
+ if (Xd == Xs)
+ return Error(Loc[0], "invalid CPY instruction, destination and source"
+ " registers are the same");
+ if (Xd == Xn)
+ return Error(Loc[0], "invalid CPY instruction, destination and size"
+ " registers are the same");
+ if (Xs == Xn)
+ return Error(Loc[0], "invalid CPY instruction, source and size"
+ " registers are the same");
+ break;
+ }
+ case AArch64::SETP:
+ case AArch64::SETPT:
+ case AArch64::SETPN:
+ case AArch64::SETPTN:
+ case AArch64::SETM:
+ case AArch64::SETMT:
+ case AArch64::SETMN:
+ case AArch64::SETMTN:
+ case AArch64::SETE:
+ case AArch64::SETET:
+ case AArch64::SETEN:
+ case AArch64::SETETN:
+ case AArch64::SETGP:
+ case AArch64::SETGPT:
+ case AArch64::SETGPN:
+ case AArch64::SETGPTN:
+ case AArch64::SETGM:
+ case AArch64::SETGMT:
+ case AArch64::SETGMN:
+ case AArch64::SETGMTN:
+ case AArch64::MOPSSETGE:
+ case AArch64::MOPSSETGET:
+ case AArch64::MOPSSETGEN:
+ case AArch64::MOPSSETGETN: {
+ unsigned Xd_wb = Inst.getOperand(0).getReg();
+ unsigned Xn_wb = Inst.getOperand(1).getReg();
+ unsigned Xd = Inst.getOperand(2).getReg();
+ unsigned Xn = Inst.getOperand(3).getReg();
+ unsigned Xm = Inst.getOperand(4).getReg();
+ if (Xd_wb != Xd)
+ return Error(Loc[0],
+ "invalid SET instruction, Xd_wb and Xd do not match");
+ if (Xn_wb != Xn)
+ return Error(Loc[0],
+ "invalid SET instruction, Xn_wb and Xn do not match");
+ if (Xd == Xn)
+ return Error(Loc[0], "invalid SET instruction, destination and size"
+ " registers are the same");
+ if (Xd == Xm)
+ return Error(Loc[0], "invalid SET instruction, destination and source"
+ " registers are the same");
+ if (Xn == Xm)
+ return Error(Loc[0], "invalid SET instruction, source and size"
+ " registers are the same");
+ break;
+ }
+ }
// Now check immediate ranges. Separate from the above as there is overlap
// in the instructions being checked and this keeps the nested conditionals
@@ -5931,9 +6109,11 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
case AArch64::ArchKind::ARMV8_5A:
case AArch64::ArchKind::ARMV8_6A:
case AArch64::ArchKind::ARMV8_7A:
+ case AArch64::ArchKind::ARMV8_8A:
case AArch64::ArchKind::ARMV9A:
case AArch64::ArchKind::ARMV9_1A:
case AArch64::ArchKind::ARMV9_2A:
+ case AArch64::ArchKind::ARMV9_3A:
case AArch64::ArchKind::ARMV8R:
RequestedExtensions.push_back("sm4");
RequestedExtensions.push_back("sha3");
@@ -5956,6 +6136,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
case AArch64::ArchKind::ARMV8_5A:
case AArch64::ArchKind::ARMV8_6A:
case AArch64::ArchKind::ARMV8_7A:
+ case AArch64::ArchKind::ARMV8_8A:
case AArch64::ArchKind::ARMV9A:
case AArch64::ArchKind::ARMV9_1A:
case AArch64::ArchKind::ARMV9_2A:
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 96d410e42be2..9ce00f76d9c7 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -238,6 +238,12 @@ static DecodeStatus DecodeSVEIncDecImm(MCInst &Inst, unsigned Imm,
uint64_t Addr, const void *Decoder);
static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder);
+static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder);
static bool Check(DecodeStatus &Out, DecodeStatus In) {
switch (In) {
@@ -1842,3 +1848,52 @@ static DecodeStatus DecodeSVCROp(MCInst &Inst, unsigned Imm, uint64_t Address,
}
return Fail;
}
+
+static DecodeStatus DecodeCPYMemOpInstruction(MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned Rs = fieldFromInstruction(insn, 16, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+
+ // None of the registers may alias: if they do, then the instruction is not
+ // merely unpredictable but actually entirely unallocated.
+ if (Rd == Rs || Rs == Rn || Rd == Rn)
+ return MCDisassembler::Fail;
+
+ // All three register operands are written back, so they all appear
+ // twice in the operand list, once as outputs and once as inputs.
+ if (!DecodeGPR64commonRegisterClass(Inst, Rd, Addr, Decoder) ||
+ !DecodeGPR64commonRegisterClass(Inst, Rs, Addr, Decoder) ||
+ !DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder) ||
+ !DecodeGPR64commonRegisterClass(Inst, Rd, Addr, Decoder) ||
+ !DecodeGPR64commonRegisterClass(Inst, Rs, Addr, Decoder) ||
+ !DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder))
+ return MCDisassembler::Fail;
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSETMemOpInstruction(MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rd = fieldFromInstruction(insn, 0, 5);
+ unsigned Rm = fieldFromInstruction(insn, 16, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+
+ // None of the registers may alias: if they do, then the instruction is not
+ // merely unpredictable but actually entirely unallocated.
+ if (Rd == Rm || Rm == Rn || Rd == Rn)
+ return MCDisassembler::Fail;
+
+ // Rd and Rn (not Rm) register operands are written back, so they appear
+ // twice in the operand list, once as outputs and once as inputs.
+ if (!DecodeGPR64commonRegisterClass(Inst, Rd, Addr, Decoder) ||
+ !DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder) ||
+ !DecodeGPR64commonRegisterClass(Inst, Rd, Addr, Decoder) ||
+ !DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder) ||
+ !DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder))
+ return MCDisassembler::Fail;
+
+ return MCDisassembler::Success;
+}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index ac08ee8ae8dd..097b93e4fcca 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -1112,6 +1112,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
}
+ Info.IsTailCall = CanTailCallOpt;
if (CanTailCallOpt)
return lowerTailCall(MIRBuilder, Info, OutArgs);
@@ -1179,7 +1180,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (!determineAndHandleAssignments(
UsingReturnedArg ? ReturnedArgHandler : Handler, Assigner, InArgs,
MIRBuilder, Info.CallConv, Info.IsVarArg,
- UsingReturnedArg ? OutArgs[0].Regs[0] : Register()))
+ UsingReturnedArg ? makeArrayRef(OutArgs[0].Regs) : None))
return false;
}
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
index add0342c90fd..aafb1d19640a 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
@@ -24,9 +24,7 @@ namespace llvm {
class AArch64TargetLowering;
class CCValAssign;
-class DataLayout;
class MachineIRBuilder;
-class MachineRegisterInfo;
class Type;
class AArch64CallLowering: public CallLowering {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index 3d9a626d3ac3..1f546ad50d57 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -18,7 +18,6 @@
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
-#include "AArch64GlobalISelUtils.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/Optional.h"
@@ -472,8 +471,8 @@ private:
AArch64InstructionSelector::AArch64InstructionSelector(
const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
const AArch64RegisterBankInfo &RBI)
- : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI),
+ : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
+ RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
@@ -3937,19 +3936,19 @@ static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
// vector's elements.
switch (EltSize) {
case 8:
- CopyOpc = AArch64::CPYi8;
+ CopyOpc = AArch64::DUPi8;
ExtractSubReg = AArch64::bsub;
break;
case 16:
- CopyOpc = AArch64::CPYi16;
+ CopyOpc = AArch64::DUPi16;
ExtractSubReg = AArch64::hsub;
break;
case 32:
- CopyOpc = AArch64::CPYi32;
+ CopyOpc = AArch64::DUPi32;
ExtractSubReg = AArch64::ssub;
break;
case 64:
- CopyOpc = AArch64::CPYi64;
+ CopyOpc = AArch64::DUPi64;
ExtractSubReg = AArch64::dsub;
break;
default:
@@ -5469,8 +5468,8 @@ bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
// Insert the copy from LR/X30 into the entry block, before it can be
// clobbered by anything.
MFI.setReturnAddressIsTaken(true);
- MFReturnAddr = getFunctionLiveInPhysReg(MF, TII, AArch64::LR,
- AArch64::GPR64RegClass);
+ MFReturnAddr = getFunctionLiveInPhysReg(
+ MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
}
if (STI.hasPAuth()) {
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 35456d95dc2b..e2c46f4b4c1f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -21,7 +21,6 @@
namespace llvm {
-class LLVMContext;
class AArch64Subtarget;
/// This class provides the information for the target register banks.
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 7274ae79f74a..225e0c8e55fc 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -19,7 +19,6 @@
namespace llvm {
class MCStreamer;
-class Target;
class Triple;
struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 941226b83e44..66cb7a37a958 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -30,11 +30,7 @@ class MCStreamer;
class MCSubtargetInfo;
class MCTargetOptions;
class MCTargetStreamer;
-class StringRef;
class Target;
-class Triple;
-class raw_ostream;
-class raw_pwrite_stream;
MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index bb488cd7da32..574b22124957 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -334,6 +334,8 @@ multiclass sve_int_ptrue<bits<3> opc, string asm, SDPatternOperator op> {
def SDT_AArch64PTrue : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
def AArch64ptrue : SDNode<"AArch64ISD::PTRUE", SDT_AArch64PTrue>;
+def SDT_AArch64PFalse : SDTypeProfile<1, 0, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>]>;
+def AArch64pfalse : SDNode<"AArch64ISD::PFALSE", SDT_AArch64PFalse>;
let Predicates = [HasSVEorStreamingSVE] in {
defm PTRUE : sve_int_ptrue<0b000, "ptrue", AArch64ptrue>;
@@ -609,6 +611,15 @@ class sve_int_pfalse<bits<6> opc, string asm>
let isReMaterializable = 1;
}
+multiclass sve_int_pfalse<bits<6> opc, string asm> {
+ def NAME : sve_int_pfalse<opc, asm>;
+
+ def : Pat<(nxv16i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>;
+ def : Pat<(nxv8i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>;
+ def : Pat<(nxv4i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>;
+ def : Pat<(nxv2i1 (AArch64pfalse)), (!cast<Instruction>(NAME))>;
+}
+
class sve_int_ptest<bits<6> opc, string asm>
: I<(outs), (ins PPRAny:$Pg, PPR8:$Pn),
asm, "\t$Pg, $Pn",
@@ -1622,6 +1633,18 @@ multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op,
!cast<Instruction>(NAME), PTRUE_D>;
}
+multiclass sve_int_pred_log_and<bits<4> opc, string asm, SDPatternOperator op> :
+ sve_int_pred_log<opc, asm, op> {
+ def : Pat<(nxv16i1 (and nxv16i1:$Op1, nxv16i1:$Op2)),
+ (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>;
+ def : Pat<(nxv8i1 (and nxv8i1:$Op1, nxv8i1:$Op2)),
+ (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>;
+ def : Pat<(nxv4i1 (and nxv4i1:$Op1, nxv4i1:$Op2)),
+ (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>;
+ def : Pat<(nxv2i1 (and nxv2i1:$Op1, nxv2i1:$Op2)),
+ (!cast<Instruction>(NAME) $Op1, $Op1, $Op2)>;
+}
+
//===----------------------------------------------------------------------===//
// SVE Logical Mask Immediate Group
//===----------------------------------------------------------------------===//
@@ -1708,6 +1731,9 @@ multiclass sve_int_dup_mask_imm<string asm> {
(!cast<Instruction>(NAME) ZPR32:$Zd, sve_preferred_logical_imm32:$imm), 6>;
def : InstAlias<"mov $Zd, $imm",
(!cast<Instruction>(NAME) ZPR64:$Zd, sve_preferred_logical_imm64:$imm), 5>;
+
+ def : Pat<(nxv2i64 (AArch64dup (i64 logical_imm64:$imm))),
+ (!cast<Instruction>(NAME) logical_imm64:$imm)>;
}
//===----------------------------------------------------------------------===//
@@ -4641,6 +4667,10 @@ multiclass SVE_SETCC_Pat<CondCode cc, CondCode invcc, ValueType predvt,
(cmp $Op1, $Op2, $Op3)>;
def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, invcc)),
(cmp $Op1, $Op3, $Op2)>;
+ def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z (predvt (AArch64ptrue 31)), intvt:$Op2, intvt:$Op3, cc))),
+ (cmp $Pg, $Op2, $Op3)>;
+ def : Pat<(predvt (and predvt:$Pg, (AArch64setcc_z (predvt (AArch64ptrue 31)), intvt:$Op2, intvt:$Op3, invcc))),
+ (cmp $Pg, $Op3, $Op2)>;
}
multiclass SVE_SETCC_Pat_With_Zero<CondCode cc, CondCode invcc, ValueType predvt,
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
index 642080a0d40d..4a24162540a5 100644
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -40,10 +40,6 @@ using namespace llvm::PatternMatch;
#define DEBUG_TYPE "aarch64-sve-intrinsic-opts"
-namespace llvm {
-void initializeSVEIntrinsicOptsPass(PassRegistry &);
-}
-
namespace {
struct SVEIntrinsicOpts : public ModulePass {
static char ID; // Pass identification, replacement for typeid
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index caee2acd2606..5906a5d6b50b 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -483,18 +483,20 @@ inline unsigned getNumElementsFromSVEPredPattern(unsigned Pattern) {
}
/// Return specific VL predicate pattern based on the number of elements.
-inline unsigned getSVEPredPatternFromNumElements(unsigned MinNumElts) {
+inline Optional<unsigned>
+getSVEPredPatternFromNumElements(unsigned MinNumElts) {
switch (MinNumElts) {
default:
- llvm_unreachable("unexpected element count for SVE predicate");
+ return None;
case 1:
- return AArch64SVEPredPattern::vl1;
case 2:
- return AArch64SVEPredPattern::vl2;
+ case 3:
case 4:
- return AArch64SVEPredPattern::vl4;
+ case 5:
+ case 6:
+ case 7:
case 8:
- return AArch64SVEPredPattern::vl8;
+ return MinNumElts;
case 16:
return AArch64SVEPredPattern::vl16;
case 32:
@@ -757,7 +759,6 @@ namespace AArch64 {
// <n x (M*P) x t> vector (such as index 1) are undefined.
static constexpr unsigned SVEBitsPerBlock = 128;
static constexpr unsigned SVEMaxBitsPerVector = 2048;
-const unsigned NeonBitsPerVector = 128;
} // end namespace AArch64
} // end namespace llvm
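With the change above, getSVEPredPatternFromNumElements returns Optional<unsigned> and answers None for element counts that have no VL pattern (24, for instance) instead of asserting, so call sites now need an explicit fallback. A hedged sketch of the expected call-site shape, using a hypothetical fallback to the all-lanes pattern; not part of the patch:

#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/Optional.h"
using namespace llvm;

// Hypothetical caller: pick a VL predicate pattern, defaulting to "all" when
// the element count has no dedicated pattern.
unsigned choosePredPattern(unsigned MinNumElts) {
  if (Optional<unsigned> Pat = getSVEPredPatternFromNumElements(MinNumElts))
    return *Pat;
  return AArch64SVEPredPattern::all;
}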
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index e606f0e8fc3c..806c0b18637a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -610,12 +610,6 @@ def FeatureDsSrc2Insts : SubtargetFeature<"ds-src2-insts",
"Has ds_*_src2 instructions"
>;
-def FeatureRegisterBanking : SubtargetFeature<"register-banking",
- "HasRegisterBanking",
- "true",
- "Has register banking"
->;
-
def FeatureVOP3Literal : SubtargetFeature<"vop3-literal",
"HasVOP3Literal",
"true",
@@ -826,7 +820,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts,
- FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking,
+ FeatureNoSdstCMPX, FeatureVscnt,
FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureGFX10A16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index 22be014813b0..5ba9b2cd187e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -26,7 +26,7 @@ class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> {
const DataLayout &DL;
public:
- explicit AMDGPUAAResult(const DataLayout &DL) : AAResultBase(), DL(DL) {}
+ explicit AMDGPUAAResult(const DataLayout &DL) : DL(DL) {}
AMDGPUAAResult(AMDGPUAAResult &&Arg)
: AAResultBase(std::move(Arg)), DL(Arg.DL) {}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 2f1e7823f65c..cd084fd5440a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -192,8 +192,20 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- if (!SPReg)
- SPReg = MIRBuilder.buildCopy(PtrTy, MFI->getStackPtrOffsetReg()).getReg(0);
+ if (!SPReg) {
+ const GCNSubtarget &ST = MIRBuilder.getMF().getSubtarget<GCNSubtarget>();
+ if (ST.enableFlatScratch()) {
+ // The stack is accessed unswizzled, so we can use a regular copy.
+ SPReg = MIRBuilder.buildCopy(PtrTy,
+ MFI->getStackPtrOffsetReg()).getReg(0);
+ } else {
+ // The address we produce here, without knowing the use context, is going
+ // to be interpreted as a vector address, so we need to convert to a
+ // swizzled address.
+ SPReg = MIRBuilder.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {PtrTy},
+ {MFI->getStackPtrOffsetReg()}).getReg(0);
+ }
+ }
auto OffsetReg = MIRBuilder.buildConstant(S32, Offset);
@@ -615,6 +627,13 @@ bool AMDGPUCallLowering::lowerFormalArguments(
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
+ // FIXME: This probably isn't defined for mesa
+ if (Info->hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
+ Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+ MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
+ CCInfo.AllocateReg(FlatScratchInitReg);
+ }
+
SmallVector<ArgInfo, 32> SplitArgs;
unsigned Idx = 0;
unsigned PSInputNum = 0;
@@ -879,13 +898,17 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
Register InputReg;
if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
NeedWorkItemIDX) {
- InputReg = MRI.createGenericVirtualRegister(S32);
- LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
- std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
+ if (ST.getMaxWorkitemID(MF.getFunction(), 0) != 0) {
+ InputReg = MRI.createGenericVirtualRegister(S32);
+ LI->loadInputValue(InputReg, MIRBuilder, IncomingArgX,
+ std::get<1>(WorkitemIDX), std::get<2>(WorkitemIDX));
+ } else {
+ InputReg = MIRBuilder.buildConstant(S32, 0).getReg(0);
+ }
}
if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
- NeedWorkItemIDY) {
+ NeedWorkItemIDY && ST.getMaxWorkitemID(MF.getFunction(), 1) != 0) {
Register Y = MRI.createGenericVirtualRegister(S32);
LI->loadInputValue(Y, MIRBuilder, IncomingArgY, std::get<1>(WorkitemIDY),
std::get<2>(WorkitemIDY));
@@ -895,7 +918,7 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
}
if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
- NeedWorkItemIDZ) {
+ NeedWorkItemIDZ && ST.getMaxWorkitemID(MF.getFunction(), 2) != 0) {
Register Z = MRI.createGenericVirtualRegister(S32);
LI->loadInputValue(Z, MIRBuilder, IncomingArgZ, std::get<1>(WorkitemIDZ),
std::get<2>(WorkitemIDZ));
@@ -904,16 +927,24 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
InputReg = InputReg ? MIRBuilder.buildOr(S32, InputReg, Z).getReg(0) : Z;
}
- if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
+ if (!InputReg &&
+ (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
InputReg = MRI.createGenericVirtualRegister(S32);
-
- // Workitem ids are already packed, any of present incoming arguments will
- // carry all required fields.
- ArgDescriptor IncomingArg = ArgDescriptor::createArg(
- IncomingArgX ? *IncomingArgX :
+ if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
+ // We're in a situation where the outgoing function requires the workitem
+ // ID, but the calling function does not have it (e.g. a graphics function
+ // calling a C calling convention function). This is illegal, but we need
+ // to produce something.
+ MIRBuilder.buildUndef(InputReg);
+ } else {
+ // Workitem ids are already packed, any of the present incoming arguments will
+ // carry all required fields.
+ ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+ IncomingArgX ? *IncomingArgX :
IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);
- LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
- &AMDGPU::VGPR_32RegClass, S32);
+ LI->loadInputValue(InputReg, MIRBuilder, &IncomingArg,
+ &AMDGPU::VGPR_32RegClass, S32);
+ }
}
if (OutgoingArg->isRegister()) {
@@ -1314,6 +1345,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
}
+ Info.IsTailCall = CanTailCallOpt;
if (CanTailCallOpt)
return lowerTailCall(MIRBuilder, Info, OutArgs);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index a55729586b8d..1920684d8f1f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -150,13 +150,13 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// \returns The minimum number of bits needed to store the value of \Op as an
/// unsigned integer. Truncating to this size and then zero-extending to
- /// ScalarSize will not change the value.
- unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
+ /// the original size will not change the value.
+ unsigned numBitsUnsigned(Value *Op) const;
/// \returns The minimum number of bits needed to store the value of \Op as a
/// signed integer. Truncating to this size and then sign-extending to
- /// ScalarSize will not change the value.
- unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
+ /// the original size will not change the value.
+ unsigned numBitsSigned(Value *Op) const;
/// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
/// SelectionDAG has an issue where an and asserting the bits are known
@@ -445,17 +445,12 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
return true;
}
-unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
- unsigned ScalarSize) const {
- KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
- return ScalarSize - Known.countMinLeadingZeros();
+unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op) const {
+ return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
}
-unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
- unsigned ScalarSize) const {
- // In order for this to be a signed 24-bit value, bit 23, must
- // be a sign bit.
- return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC) + 1;
+unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op) const {
+ return ComputeMaxSignificantBits(Op, *DL, 0, AC);
}
static void extractValues(IRBuilder<> &Builder,
@@ -532,12 +527,12 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
unsigned LHSBits = 0, RHSBits = 0;
bool IsSigned = false;
- if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS, Size)) <= 24 &&
- (RHSBits = numBitsUnsigned(RHS, Size)) <= 24) {
+ if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
+ (RHSBits = numBitsUnsigned(RHS)) <= 24) {
IsSigned = false;
- } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS, Size)) <= 24 &&
- (RHSBits = numBitsSigned(RHS, Size)) <= 24) {
+ } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
+ (RHSBits = numBitsSigned(RHS)) <= 24) {
IsSigned = true;
} else
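The switch to countMaxActiveBits/ComputeMaxSignificantBits above is meant as a pure simplification: the new helpers compute the same quantities as the arithmetic they replace. A small illustrative check of the unsigned case, assuming llvm::KnownBits behaves as documented (hypothetical helper, not part of the patch):

#include "llvm/Support/KnownBits.h"

// countMaxActiveBits() folds the old "ScalarSize - countMinLeadingZeros()"
// expression into one call; this hypothetical check spells that out.
bool unsignedBitCountsAgree(const llvm::KnownBits &Known) {
  unsigned OldStyle = Known.getBitWidth() - Known.countMinLeadingZeros();
  return Known.countMaxActiveBits() == OldStyle;
}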
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 699c6c479455..3ac7c45b3275 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -331,8 +331,7 @@ void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
if (auto PtrTy = dyn_cast<PointerType>(Arg.getType())) {
if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
// FIXME: Should report this for all address spaces
- PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(),
- PtrTy->getElementType());
+ PointeeAlign = Arg.getParamAlign().valueOrOne();
}
}
@@ -731,10 +730,8 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
// FIXME: Need to distinguish in memory alignment from pointer alignment.
if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
- if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(),
- PtrTy->getElementType());
- }
+ if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
+ PointeeAlign = Arg.getParamAlign().valueOrOne();
}
// There's no distinction between byval aggregates and raw aggregates.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 54177564afbc..b9d0655feef7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -51,7 +51,7 @@ unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
// In order for this to be a signed 24-bit value, bit 23 must
// be a sign bit.
- return DAG.ComputeMinSignedBits(Op);
+ return DAG.ComputeMaxSignificantBits(Op);
}
AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
@@ -360,6 +360,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
@@ -1408,6 +1410,11 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
Start != 1)
return Op;
+ if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
+ (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
+ (Start == 0 || Start == 4))
+ return Op;
+
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());
@@ -4626,11 +4633,12 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
RHSKnown = RHSKnown.trunc(24);
if (Opc == AMDGPUISD::MUL_I24) {
- unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
- unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
- unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
- if (MaxValBits >= 32)
+ unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
+ unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
+ unsigned MaxValBits = LHSValBits + RHSValBits;
+ if (MaxValBits > 32)
break;
+ unsigned SignBits = 32 - MaxValBits + 1;
bool LHSNegative = LHSKnown.isNegative();
bool LHSNonNegative = LHSKnown.isNonNegative();
bool LHSPositive = LHSKnown.isStrictlyPositive();
@@ -4639,16 +4647,16 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
bool RHSPositive = RHSKnown.isStrictlyPositive();
if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
- Known.Zero.setHighBits(32 - MaxValBits);
+ Known.Zero.setHighBits(SignBits);
else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
- Known.One.setHighBits(32 - MaxValBits);
+ Known.One.setHighBits(SignBits);
} else {
- unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
- unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
- unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
+ unsigned LHSValBits = LHSKnown.countMaxActiveBits();
+ unsigned RHSValBits = RHSKnown.countMaxActiveBits();
+ unsigned MaxValBits = LHSValBits + RHSValBits;
if (MaxValBits >= 32)
break;
- Known.Zero.setHighBits(32 - MaxValBits);
+ Known.Zero.setBitsFrom(MaxValBits);
}
break;
}
@@ -4904,7 +4912,8 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
}
}
-bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtactLegal(
+bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
unsigned Opc, LLT Ty1, LLT Ty2) const {
- return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
+ return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
+ Ty2 == LLT::scalar(32);
}
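The MUL_I24 known-bits change above reasons in significant bits: if the left operand needs at most L significant bits and the right at most R, the signed product needs at most L + R, and whenever L + R <= 32 the top 32 - (L + R) + 1 bits are all copies of the sign bit. A tiny stand-alone restatement of that arithmetic (hypothetical helper, not from the patch):

#include <cassert>

// Known sign-extension bits of a 32-bit signed product whose operands need
// LHSValBits and RHSValBits significant bits, mirroring SignBits above.
unsigned knownSignBitsOfI24Mul(unsigned LHSValBits, unsigned RHSValBits) {
  unsigned MaxValBits = LHSValBits + RHSValBits;
  assert(MaxValBits <= 32 && "product may need more than 32 bits");
  // e.g. 10 + 8 significant bits -> 32 - 18 + 1 == 15 known sign bits.
  return 32 - MaxValBits + 1;
}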
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index daaca8737c5d..b41506157b68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -335,8 +335,8 @@ public:
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
- bool isConstantUnsignedBitfieldExtactLegal(unsigned Opc, LLT Ty1,
- LLT Ty2) const override;
+ bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1,
+ LLT Ty2) const override;
};
namespace AMDGPUISD {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index db84b8766924..4f1d700bcd84 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -58,24 +58,37 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
// Check if a value can be converted to a 16-bit value without losing
// precision.
-static bool canSafelyConvertTo16Bit(Value &V) {
+// The value is expected to be either a float (IsFloat = true) or an unsigned
+// integer (IsFloat = false).
+static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
Type *VTy = V.getType();
if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
// The value is already 16-bit, so we don't want to convert to 16-bit again!
return false;
}
- if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
- // We need to check that if we cast the index down to a half, we do not lose
- // precision.
- APFloat FloatValue(ConstFloat->getValueAPF());
- bool LosesInfo = true;
- FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
- return !LosesInfo;
+ if (IsFloat) {
+ if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
+ // We need to check that if we cast the index down to a half, we do not
+ // lose precision.
+ APFloat FloatValue(ConstFloat->getValueAPF());
+ bool LosesInfo = true;
+ FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
+ &LosesInfo);
+ return !LosesInfo;
+ }
+ } else {
+ if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
+ // We need to check that if we cast the index down to an i16, we do not
+ // lose precision.
+ APInt IntValue(ConstInt->getValue());
+ return IntValue.getActiveBits() <= 16;
+ }
}
+
Value *CastSrc;
- if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
- match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
- match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
+ bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
+ : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
+ if (IsExt) {
Type *CastSrcTy = CastSrc->getType();
if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
return true;
@@ -97,13 +110,116 @@ static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
llvm_unreachable("Should never be called!");
}
+/// Applies Function(II.Args, II.ArgTys) and replaces the intrinsic call with
+/// the modified arguments.
+static Optional<Instruction *> modifyIntrinsicCall(
+ IntrinsicInst &II, unsigned NewIntr, InstCombiner &IC,
+ std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
+ Func) {
+ SmallVector<Type *, 4> ArgTys;
+ if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
+ return None;
+
+ SmallVector<Value *, 8> Args(II.args());
+
+ // Modify arguments and types
+ Func(Args, ArgTys);
+
+ Function *I = Intrinsic::getDeclaration(II.getModule(), NewIntr, ArgTys);
+
+ CallInst *NewCall = IC.Builder.CreateCall(I, Args);
+ NewCall->takeName(&II);
+ NewCall->copyMetadata(II);
+ if (isa<FPMathOperator>(NewCall))
+ NewCall->copyFastMathFlags(&II);
+
+ // Erase and replace uses
+ if (!II.getType()->isVoidTy())
+ IC.replaceInstUsesWith(II, NewCall);
+ return IC.eraseInstFromFunction(II);
+}
+
static Optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
IntrinsicInst &II, InstCombiner &IC) {
+ // Optimize _L to _LZ when _L is zero
+ if (const auto *LZMappingInfo =
+ AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
+ if (auto *ConstantLod =
+ dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
+ if (ConstantLod->isZero() || ConstantLod->isNegative()) {
+ const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
+ AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
+ ImageDimIntr->Dim);
+ return modifyIntrinsicCall(
+ II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
+ Args.erase(Args.begin() + ImageDimIntr->LodIndex);
+ });
+ }
+ }
+ }
+
+ // Optimize _mip away, when 'lod' is zero
+ if (const auto *MIPMappingInfo =
+ AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
+ if (auto *ConstantMip =
+ dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
+ if (ConstantMip->isZero()) {
+ const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
+ AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
+ ImageDimIntr->Dim);
+ return modifyIntrinsicCall(
+ II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
+ Args.erase(Args.begin() + ImageDimIntr->MipIndex);
+ });
+ }
+ }
+ }
+
+ // Optimize _bias away when 'bias' is zero
+ if (const auto *BiasMappingInfo =
+ AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
+ if (auto *ConstantBias =
+ dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
+ if (ConstantBias->isZero()) {
+ const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
+ AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
+ ImageDimIntr->Dim);
+ return modifyIntrinsicCall(
+ II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
+ Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
+ ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
+ });
+ }
+ }
+ }
+
+ // Optimize _offset away when 'offset' is zero
+ if (const auto *OffsetMappingInfo =
+ AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
+ if (auto *ConstantOffset =
+ dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
+ if (ConstantOffset->isZero()) {
+ const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
+ AMDGPU::getImageDimIntrinsicByBaseOpcode(
+ OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
+ return modifyIntrinsicCall(
+ II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
+ Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
+ });
+ }
+ }
+ }
+
+ // Try to use A16 or G16
if (!ST->hasA16() && !ST->hasG16())
return None;
+ // Address is interpreted as float if the instruction has a sampler or as
+ // unsigned int if there is no sampler.
+ bool HasSampler =
+ AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
bool FloatCoord = false;
// true means derivatives can be converted to 16 bit, coordinates not
bool OnlyDerivatives = false;
@@ -112,7 +228,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
Value *Coord = II.getOperand(OperandIndex);
// If the values are not derived from 16-bit values, we cannot optimize.
- if (!canSafelyConvertTo16Bit(*Coord)) {
+ if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
if (OperandIndex < ImageDimIntr->CoordStart ||
ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
return None;
@@ -127,43 +243,50 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
FloatCoord = Coord->getType()->isFloatingPointTy();
}
- if (OnlyDerivatives) {
- if (!ST->hasG16())
- return None;
- } else {
- if (!ST->hasA16())
- OnlyDerivatives = true; // Only supports G16
+ if (!OnlyDerivatives && !ST->hasA16())
+ OnlyDerivatives = true; // Only supports G16
+
+ // Check if there is a bias parameter and if it can be converted to f16
+ if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
+ Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
+ assert(HasSampler &&
+ "Only image instructions with a sampler can have a bias");
+ if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
+ OnlyDerivatives = true;
}
+ if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
+ ImageDimIntr->CoordStart))
+ return None;
+
Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
: Type::getInt16Ty(II.getContext());
- SmallVector<Type *, 4> ArgTys;
- if (!Intrinsic::getIntrinsicSignature(II.getCalledFunction(), ArgTys))
- return None;
-
- ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
- if (!OnlyDerivatives)
- ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
- Function *I =
- Intrinsic::getDeclaration(II.getModule(), II.getIntrinsicID(), ArgTys);
+ return modifyIntrinsicCall(
+ II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
+ ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
+ if (!OnlyDerivatives) {
+ ArgTys[ImageDimIntr->CoordTyArg] = CoordType;
- SmallVector<Value *, 8> Args(II.args());
+ // Change the bias type
+ if (ImageDimIntr->NumBiasArgs != 0)
+ ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
+ }
- unsigned EndIndex =
- OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
- for (unsigned OperandIndex = ImageDimIntr->GradientStart;
- OperandIndex < EndIndex; OperandIndex++) {
- Args[OperandIndex] =
- convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
- }
+ unsigned EndIndex =
+ OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
+ for (unsigned OperandIndex = ImageDimIntr->GradientStart;
+ OperandIndex < EndIndex; OperandIndex++) {
+ Args[OperandIndex] =
+ convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
+ }
- CallInst *NewCall = IC.Builder.CreateCall(I, Args);
- NewCall->takeName(&II);
- NewCall->copyMetadata(II);
- if (isa<FPMathOperator>(NewCall))
- NewCall->copyFastMathFlags(&II);
- return IC.replaceInstUsesWith(II, NewCall);
+ // Convert the bias
+ if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
+ Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
+ Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
+ }
+ });
}
bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index b1263618c5db..e7ee36447682 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -20,9 +20,6 @@
namespace llvm {
class GCNSubtarget;
-class MachineFunction;
-class MachineInstr;
-class MachineInstrBuilder;
class MachineMemOperand;
class AMDGPUInstrInfo {
@@ -52,6 +49,9 @@ struct ImageDimIntrinsicInfo {
unsigned BaseOpcode;
MIMGDim Dim;
+ uint8_t NumOffsetArgs;
+ uint8_t NumBiasArgs;
+ uint8_t NumZCompareArgs;
uint8_t NumGradients;
uint8_t NumDmask;
uint8_t NumData;
@@ -60,6 +60,9 @@ struct ImageDimIntrinsicInfo {
uint8_t DMaskIndex;
uint8_t VAddrStart;
+ uint8_t OffsetIndex;
+ uint8_t BiasIndex;
+ uint8_t ZCompareIndex;
uint8_t GradientStart;
uint8_t CoordStart;
uint8_t LodIndex;
@@ -71,6 +74,7 @@ struct ImageDimIntrinsicInfo {
uint8_t TexFailCtrlIndex;
uint8_t CachePolicyIndex;
+ uint8_t BiasTyArg;
uint8_t GradientTyArg;
uint8_t CoordTyArg;
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index e16bead81b65..b7d0f0580cda 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -46,8 +46,7 @@ static cl::opt<bool> AllowRiskySelect(
AMDGPUInstructionSelector::AMDGPUInstructionSelector(
const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
const AMDGPUTargetMachine &TM)
- : InstructionSelector(), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
+ : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
STI(STI),
EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
@@ -1103,7 +1102,18 @@ bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
Register SrcReg = I.getOperand(2).getReg();
unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
+
auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
+ if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(Pred))) {
+ MachineInstr *ICmp =
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
+
+ if (!RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
+ *TRI.getBoolRC(), *MRI))
+ return false;
+ I.eraseFromParent();
+ return true;
+ }
int Opcode = getV_CMPOpcode(Pred, Size);
if (Opcode == -1)
@@ -1234,7 +1244,7 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
// Get the return address reg and mark it as an implicit live-in
Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
- AMDGPU::SReg_64RegClass);
+ AMDGPU::SReg_64RegClass, DL);
BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
.addReg(LiveIn);
I.eraseFromParent();
@@ -1494,9 +1504,9 @@ static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
if (TexFailCtrl)
IsTexFail = true;
- TFE = (TexFailCtrl & 0x1) ? 1 : 0;
+ TFE = (TexFailCtrl & 0x1) ? true : false;
TexFailCtrl &= ~(uint64_t)0x1;
- LWE = (TexFailCtrl & 0x2) ? 1 : 0;
+ LWE = (TexFailCtrl & 0x2) ? true : false;
TexFailCtrl &= ~(uint64_t)0x2;
return TexFailCtrl == 0;
@@ -1511,10 +1521,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
- const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
- AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
- const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
- AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
@@ -1523,7 +1529,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
Register VDataIn, VDataOut;
LLT VDataTy;
int NumVDataDwords = -1;
- bool IsD16 = false;
+ bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
+ MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
bool Unorm;
if (!BaseOpcode->Sampler)
@@ -1572,16 +1579,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
- // One memoperand is mandatory, except for getresinfo.
- // FIXME: Check this in verifier.
- if (!MI.memoperands_empty()) {
- const MachineMemOperand *MMO = *MI.memoperands_begin();
-
- // Infer d16 from the memory size, as the register type will be mangled by
- // unpacked subtargets, or by TFE.
- IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
- }
-
if (BaseOpcode->Store) {
VDataIn = MI.getOperand(1).getReg();
VDataTy = MRI->getType(VDataIn);
@@ -1596,26 +1593,6 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
}
}
- // Optimize _L to _LZ when _L is zero
- if (LZMappingInfo) {
- // The legalizer replaced the register with an immediate 0 if we need to
- // change the opcode.
- const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
- if (Lod.isImm()) {
- assert(Lod.getImm() == 0);
- IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
- }
- }
-
- // Optimize _mip away, when 'lod' is zero
- if (MIPMappingInfo) {
- const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
- if (Lod.isImm()) {
- assert(Lod.getImm() == 0);
- IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
- }
- }
-
// Set G16 opcode
if (IsG16 && !IsA16) {
const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
@@ -2562,6 +2539,8 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
Register MaskReg = I.getOperand(2).getReg();
LLT Ty = MRI->getType(DstReg);
LLT MaskTy = MRI->getType(MaskReg);
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
@@ -2570,6 +2549,24 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
if (DstRB != SrcRB) // Should only happen for hand written MIR.
return false;
+ // Try to avoid emitting a bit operation when we only need to touch half of
+ // the 64-bit pointer.
+ APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
+ const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
+ const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
+
+ const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
+ const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
+
+ if (!IsVGPR && Ty.getSizeInBits() == 64 &&
+ !CanCopyLow32 && !CanCopyHi32) {
+ auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
+ .addReg(SrcReg)
+ .addReg(MaskReg);
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ }
+
unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
const TargetRegisterClass &RegRC
= IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
@@ -2586,8 +2583,6 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
!RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
return false;
- MachineBasicBlock *BB = I.getParent();
- const DebugLoc &DL = I.getDebugLoc();
if (Ty.getSizeInBits() == 32) {
assert(MaskTy.getSizeInBits() == 32 &&
"ptrmask should have been narrowed during legalize");
@@ -2610,13 +2605,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
Register MaskedLo, MaskedHi;
- // Try to avoid emitting a bit operation when we only need to touch half of
- // the 64-bit pointer.
- APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
-
- const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
- const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
- if ((MaskOnes & MaskLo32) == MaskLo32) {
+ if (CanCopyLow32) {
// If all the bits in the low half are 1, we only need a copy for it.
MaskedLo = LoReg;
} else {
@@ -2631,7 +2620,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
.addReg(MaskLo);
}
- if ((MaskOnes & MaskHi32) == MaskHi32) {
+ if (CanCopyHi32) {
// If all the bits in the high half are 1, we only need a copy for it.
MaskedHi = HiReg;
} else {
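For context, a minimal standalone sketch of the mask test this hunk hoists to the top of selectG_PTRMASK, with a plain uint64_t standing in for the APInt known-ones value (names are illustrative, not part of the patch): when every bit of one 32-bit half of the mask is known to be one, that half of the pointer is a plain copy and only the other half needs an AND.

#include <cassert>
#include <cstdint>

// KnownOnes stands in for KnownBits->getKnownOnes(MaskReg) from the patch.
struct HalfMaskInfo {
  bool CanCopyLow32; // low 32 bits of the mask are known to be all ones
  bool CanCopyHi32;  // high 32 bits of the mask are known to be all ones
};

HalfMaskInfo classifyPtrMask(uint64_t KnownOnes) {
  const uint64_t MaskLo32 = 0x00000000ffffffffULL;
  const uint64_t MaskHi32 = 0xffffffff00000000ULL;
  return {(KnownOnes & MaskLo32) == MaskLo32,
          (KnownOnes & MaskHi32) == MaskHi32};
}

int main() {
  // Aligning a pointer down to 64 KiB leaves the whole high half intact, so
  // only the low 32 bits need a 32-bit AND; the high half is a plain copy.
  assert(classifyPtrMask(0xffffffffffff0000ULL).CanCopyHi32);
  assert(!classifyPtrMask(0xffffffffffff0000ULL).CanCopyLow32);
  return 0;
}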
@@ -3123,6 +3112,33 @@ bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
return true;
}
+bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (IsVALU) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
+ .addImm(Subtarget->getWavefrontSizeLog2())
+ .addReg(SrcReg);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
+ .addReg(SrcReg)
+ .addImm(Subtarget->getWavefrontSizeLog2());
+ }
+
+ const TargetRegisterClass &RC =
+ IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
+ if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
+ return false;
+
+ MI.eraseFromParent();
+ return true;
+}
+
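A minimal sketch of the arithmetic the new selectWaveAddress emits, assuming only what the shift amounts above imply (the source scratch offset is shifted right by log2 of the wavefront size):

#include <cstdint>

// WavefrontSizeLog2 is 5 for wave32 and 6 for wave64.
uint32_t waveAddress(uint32_t ScratchByteOffset, unsigned WavefrontSizeLog2) {
  // Emitted as V_LSHRREV_B32_e64 (VALU) or S_LSHR_B32 (SALU) depending on
  // the destination register bank.
  return ScratchByteOffset >> WavefrontSizeLog2;
}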
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);
@@ -3236,7 +3252,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_SHUFFLE_VECTOR:
return selectG_SHUFFLE_VECTOR(I);
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
- case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
const AMDGPU::ImageDimIntrinsicInfo *Intr
= AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
assert(Intr && "not an image intrinsic with image pseudo");
@@ -3252,6 +3270,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case AMDGPU::G_SI_CALL:
I.setDesc(TII.get(AMDGPU::SI_CALL));
return true;
+ case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
+ return selectWaveAddress(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -3896,20 +3916,59 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
}
+// Return the wave level SGPR base address if this is a wave address.
+static Register getWaveAddress(const MachineInstr *Def) {
+ return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
+ ? Def->getOperand(1).getReg()
+ : Register();
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
MachineOperand &Root) const {
- MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
+ Register Reg = Root.getReg();
+ const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+
+ const MachineInstr *Def = MRI->getVRegDef(Reg);
+ if (Register WaveBase = getWaveAddress(Def)) {
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(Info->getScratchRSrcReg());
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ MIB.addReg(WaveBase);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
+ }};
+ }
int64_t Offset = 0;
+
+ // FIXME: Copy check is a hack
+ Register BasePtr;
+ if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) {
+ if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
+ return {};
+ const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr);
+ Register WaveBase = getWaveAddress(BasePtrDef);
+ if (!WaveBase)
+ return {};
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(Info->getScratchRSrcReg());
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ MIB.addReg(WaveBase);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
+ }};
+ }
+
if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
return {};
- const MachineFunction *MF = MBB->getParent();
- const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
-
return {{
[=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
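A sketch of the two new patterns selectMUBUFScratchOffset recognizes, written over a toy expression form; the 12-bit unsigned immediate range is an assumption for illustration, the real check is SIInstrInfo::isLegalMUBUFImmOffset, and none of the names below are LLVM API.

#include <cstdint>
#include <optional>

struct ScratchOperands {
  uint32_t SOffsetReg; // wave-level SGPR base (the wave address operand)
  int64_t ImmOffset;   // constant folded into the MUBUF offset field
};

// Toy stand-ins for the matched MIR: either a bare wave address, or a
// G_PTR_ADD of a wave address and a constant.
struct Expr {
  bool IsWaveAddress = false;
  uint32_t WaveBaseReg = 0;
  const Expr *AddBase = nullptr; // non-null for the ptr_add form
  int64_t AddConst = 0;
};

static bool isLegalImm(int64_t Off) { return Off >= 0 && Off < (1 << 12); }

std::optional<ScratchOperands> selectScratchOffset(const Expr &Root) {
  if (Root.IsWaveAddress)
    return ScratchOperands{Root.WaveBaseReg, 0};
  if (Root.AddBase && Root.AddBase->IsWaveAddress && isLegalImm(Root.AddConst))
    return ScratchOperands{Root.AddBase->WaveBaseReg, Root.AddConst};
  return std::nullopt; // fall back to the existing constant-only path
}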
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 26996e42af53..42095332d11a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -30,7 +30,6 @@ namespace AMDGPU {
struct ImageDimIntrinsicInfo;
}
-class AMDGPUInstrInfo;
class AMDGPURegisterBankInfo;
class AMDGPUTargetMachine;
class BlockFrequencyInfo;
@@ -42,7 +41,6 @@ class MachineOperand;
class MachineRegisterInfo;
class RegisterBank;
class SIInstrInfo;
-class SIMachineFunctionInfo;
class SIRegisterInfo;
class TargetRegisterClass;
@@ -147,6 +145,7 @@ private:
bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
MachineOperand &DataOp) const;
bool selectBVHIntrinsic(MachineInstr &I) const;
+ bool selectWaveAddress(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
bool AllowAbs = true) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 0528b552f475..7d3dbfd7e851 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -18,6 +18,7 @@ class AddressSpacesImpl {
int Local = 3;
int Constant = 4;
int Private = 5;
+ int Constant32Bit = 6;
}
def AddrSpaces : AddressSpacesImpl;
@@ -405,18 +406,23 @@ class Aligned<int Bytes> {
int MinAlignment = Bytes;
}
-class StoreHi16<SDPatternOperator op> : PatFrag <
+class StoreHi16<SDPatternOperator op, ValueType vt> : PatFrag <
(ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr)> {
let IsStore = 1;
+ let MemoryVT = vt;
}
-def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant ]>;
-def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, AddrSpaces.Constant ]>;
+def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant,
+ AddrSpaces.Constant32Bit ]>;
+def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global,
+ AddrSpaces.Constant,
+ AddrSpaces.Constant32Bit ]>;
def StoreAddress_global : AddressSpaceList<[ AddrSpaces.Global ]>;
-def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat,
- AddrSpaces.Global,
- AddrSpaces.Constant ]>;
+def LoadAddress_flat : AddressSpaceList<[ AddrSpaces.Flat,
+ AddrSpaces.Global,
+ AddrSpaces.Constant,
+ AddrSpaces.Constant32Bit ]>;
def StoreAddress_flat : AddressSpaceList<[ AddrSpaces.Flat, AddrSpaces.Global ]>;
def LoadAddress_private : AddressSpaceList<[ AddrSpaces.Private ]>;
@@ -522,9 +528,9 @@ def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr),
let MemoryVT = i16;
}
-def store_hi16_#as : StoreHi16 <truncstorei16>;
-def truncstorei8_hi16_#as : StoreHi16<truncstorei8>;
-def truncstorei16_hi16_#as : StoreHi16<truncstorei16>;
+def store_hi16_#as : StoreHi16 <truncstorei16, i16>;
+def truncstorei8_hi16_#as : StoreHi16<truncstorei8, i8>;
+def truncstorei16_hi16_#as : StoreHi16<truncstorei16, i16>;
defm atomic_store_#as : binary_atomic_op<atomic_store>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 5046daaed977..04c6f67ed339 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -272,8 +272,8 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
unsigned RegSize = Ty.getSizeInBits();
- unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
- unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
+ uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
+ uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
unsigned AS = Query.Types[1].getAddressSpace();
// All of these need to be custom lowered to cast the pointer operand.
@@ -380,7 +380,7 @@ static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
/// access up to the alignment. Note this case when the memory access itself
/// changes, not the size of the result register.
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
- unsigned AlignInBits, unsigned AddrSpace,
+ uint64_t AlignInBits, unsigned AddrSpace,
unsigned Opcode) {
unsigned SizeInBits = MemoryTy.getSizeInBits();
// We don't want to widen cases that are naturally legal.
@@ -929,10 +929,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_CTPOP)
.legalFor({{S32, S32}, {S32, S64}})
.clampScalar(0, S32, S32)
+ .widenScalarToNextPow2(1, 32)
.clampScalar(1, S32, S64)
.scalarize(0)
- .widenScalarToNextPow2(0, 32)
- .widenScalarToNextPow2(1, 32);
+ .widenScalarToNextPow2(0, 32);
+
// The hardware instructions return a different result on 0 than the generic
// instructions expect. The hardware produces -1, but these produce the
@@ -1172,7 +1173,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (MemSize > MaxSize)
return std::make_pair(0, LLT::scalar(MaxSize));
- unsigned Align = Query.MMODescrs[0].AlignInBits;
+ uint64_t Align = Query.MMODescrs[0].AlignInBits;
return std::make_pair(0, LLT::scalar(Align));
})
.fewerElementsIf(
@@ -1295,6 +1296,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasAtomicFaddInsts())
Atomic.legalFor({{S32, GlobalPtr}});
+ if (ST.hasGFX90AInsts()) {
+ // These are legal with some caveats, and should have undergone expansion in
+ // the IR in most situations
+ // TODO: Move atomic expansion into legalizer
+ // TODO: Also supports <2 x f16>
+ Atomic.legalFor({
+ {S32, GlobalPtr},
+ {S64, GlobalPtr},
+ {S64, FlatPtr}
+ });
+ }
+
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
@@ -1345,8 +1358,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}, changeTo(1, S16));
Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
Shifts.clampScalar(1, S32, S32);
- Shifts.clampScalar(0, S16, S64);
Shifts.widenScalarToNextPow2(0, 16);
+ Shifts.clampScalar(0, S16, S64);
getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
.minScalar(0, S16)
@@ -1357,8 +1370,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// expansion for the shifted type will produce much worse code if it hasn't
// been truncated already.
Shifts.clampScalar(1, S32, S32);
- Shifts.clampScalar(0, S32, S64);
Shifts.widenScalarToNextPow2(0, 32);
+ Shifts.clampScalar(0, S32, S64);
getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
.minScalar(0, S32)
@@ -1812,6 +1825,27 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
+/// Return true if the value is a known valid address, such that a null check is
+/// not necessary.
+static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI,
+ const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
+ MachineInstr *Def = MRI.getVRegDef(Val);
+ switch (Def->getOpcode()) {
+ case AMDGPU::G_FRAME_INDEX:
+ case AMDGPU::G_GLOBAL_VALUE:
+ case AMDGPU::G_BLOCK_ADDR:
+ return true;
+ case AMDGPU::G_CONSTANT: {
+ const ConstantInt *CI = Def->getOperand(1).getCImm();
+ return CI->getSExtValue() != TM.getNullPointerValue(AddrSpace);
+ }
+ default:
+ return false;
+ }
+
+ return false;
+}
+
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -1862,6 +1896,14 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS);
+
+ if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+ // Extract low 32-bits of the pointer.
+ B.buildExtract(Dst, Src, 0);
+ MI.eraseFromParent();
+ return true;
+ }
+
unsigned NullVal = TM.getNullPointerValue(DestAS);
auto SegmentNull = B.buildConstant(DstTy, NullVal);
@@ -1884,24 +1926,29 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (!ST.hasFlatAddressSpace())
return false;
- auto SegmentNull =
- B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
- auto FlatNull =
- B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
-
Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
if (!ApertureReg.isValid())
return false;
- auto CmpRes =
- B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
-
// Coerce the type of the low half of the result so we can use merge_values.
Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
+
+ if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+ B.buildCopy(Dst, BuildPtr);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
+ auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
+
+ auto CmpRes =
+ B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
+
B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
MI.eraseFromParent();
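The effect of the isKnownNonNull fast path can be seen in a small scalar model of the private/local-to-flat cast; the widths and the aperture-in-the-high-half layout mirror the buildMerge above, while the names are illustrative only.

#include <cstdint>

// SrcOffset  : 32-bit private/local pointer value
// ApertureHi : high 32 bits of the flat aperture for that segment
// SegmentNull / FlatNull : the segment and flat null values from the target
uint64_t castToFlat(uint32_t SrcOffset, uint32_t ApertureHi,
                    uint32_t SegmentNull, uint64_t FlatNull,
                    bool KnownNonNull) {
  const uint64_t FlatPtr = (uint64_t(ApertureHi) << 32) | SrcOffset;
  if (KnownNonNull)
    return FlatPtr; // patch: frame indexes, globals, etc. skip the select
  return SrcOffset != SegmentNull ? FlatPtr : FlatNull;
}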
@@ -1959,6 +2006,7 @@ bool AMDGPULegalizerInfo::legalizeFceil(
// TODO: Should this propagate fast-math-flags?
B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
+ MI.eraseFromParent();
return true;
}
@@ -2213,10 +2261,12 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Dst));
- if (IdxVal < VecTy.getNumElements())
- B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
- else
+ if (IdxVal < VecTy.getNumElements()) {
+ auto Unmerge = B.buildUnmerge(EltTy, Vec);
+ B.buildCopy(Dst, Unmerge.getReg(IdxVal));
+ } else {
B.buildUndef(Dst);
+ }
MI.eraseFromParent();
return true;
@@ -2245,11 +2295,20 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
LLT VecTy = MRI.getType(Vec);
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Ins));
+ (void)Ins;
- if (IdxVal < VecTy.getNumElements())
- B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
- else
+ unsigned NumElts = VecTy.getNumElements();
+ if (IdxVal < NumElts) {
+ SmallVector<Register, 8> SrcRegs;
+ for (unsigned i = 0; i < NumElts; ++i)
+ SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
+ B.buildUnmerge(SrcRegs, Vec);
+
+ SrcRegs[IdxVal] = MI.getOperand(2).getReg();
+ B.buildMerge(Dst, SrcRegs);
+ } else {
B.buildUndef(Dst);
+ }
MI.eraseFromParent();
return true;
@@ -2502,7 +2561,7 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
const LLT MemTy = MMO->getMemoryType();
const Align MemAlign = MMO->getAlign();
const unsigned MemSize = MemTy.getSizeInBits();
- const unsigned AlignInBits = 8 * MemAlign.value();
+ const uint64_t AlignInBits = 8 * MemAlign.value();
// Widen non-power-of-2 loads to the alignment if needed
if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
@@ -2832,8 +2891,8 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
assert(DstReg.isVirtual() && "Virtual register expected");
- Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
- ArgTy);
+ Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
+ *ArgRC, B.getDebugLoc(), ArgTy);
if (Arg->isMasked()) {
// TODO: Should we try to emit this once in the entry block?
const LLT S32 = LLT::scalar(32);
@@ -2842,6 +2901,8 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
Register AndMaskSrc = LiveIn;
+ // TODO: Avoid clearing the high bits if we know workitem id y/z are always
+ // 0.
if (Shift != 0) {
auto ShiftAmt = B.buildConstant(S32, Shift);
AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
@@ -4106,7 +4167,6 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
- case Intrinsic::amdgcn_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
@@ -4213,15 +4273,18 @@ static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
if ((I < Intr->GradientStart) ||
(I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
(I >= Intr->CoordStart && !IsA16)) {
- // Handle any gradient or coordinate operands that should not be packed
if ((I < Intr->GradientStart) && IsA16 &&
(B.getMRI()->getType(AddrReg) == S16)) {
+ assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
// Special handling of bias when A16 is on. Bias is of type half but
// occupies full 32-bit.
PackedAddrs.push_back(
B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
.getReg(0));
} else {
+ assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
+ "Bias needs to be converted to 16 bit in A16 mode");
+ // Handle any gradient or coordinate operands that should not be packed
AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
PackedAddrs.push_back(AddrReg);
}
@@ -4320,6 +4383,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
const LLT V2S16 = LLT::fixed_vector(2, 16);
unsigned DMask = 0;
+ Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
+ LLT Ty = MRI->getType(VData);
// Check for 16 bit addresses and pack if true.
LLT GradTy =
@@ -4328,6 +4393,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
const bool IsG16 = GradTy == S16;
const bool IsA16 = AddrTy == S16;
+ const bool IsD16 = Ty.getScalarType() == S16;
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
@@ -4347,8 +4413,11 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
Observer.changingInstr(MI);
auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
- unsigned NewOpcode = NumDefs == 0 ?
- AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
+ const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
+ : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
+ const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
+ : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
+ unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;
// Track that we legalized this
MI.setDesc(B.getTII().get(NewOpcode));
@@ -4381,44 +4450,6 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
- // Optimize _L to _LZ when _L is zero
- if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
- AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
- const ConstantFP *ConstantLod;
-
- if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
- m_GFCst(ConstantLod))) {
- if (ConstantLod->isZero() || ConstantLod->isNegative()) {
- // Set new opcode to _lz variant of _l, and change the intrinsic ID.
- const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
- AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
- Intr->Dim);
-
- // The starting indexes should remain in the same place.
- --CorrectedNumVAddrs;
-
- MI.getOperand(MI.getNumExplicitDefs())
- .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
- MI.RemoveOperand(ArgOffset + Intr->LodIndex);
- Intr = NewImageDimIntr;
- }
- }
- }
-
- // Optimize _mip away, when 'lod' is zero
- if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
- int64_t ConstantLod;
- if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
- m_ICst(ConstantLod))) {
- if (ConstantLod == 0) {
- // TODO: Change intrinsic opcode and remove operand instead or replacing
- // it with 0, as the _L to _LZ handling is done above.
- MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
- --CorrectedNumVAddrs;
- }
- }
- }
-
// Rewrite the addressing register layout before doing anything else.
if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
// 16 bit gradients are supported, but are tied to the A16 control
@@ -4494,9 +4525,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (BaseOpcode->Store) { // No TFE for stores?
// TODO: Handle dmask trim
- Register VData = MI.getOperand(1).getReg();
- LLT Ty = MRI->getType(VData);
- if (!Ty.isVector() || Ty.getElementType() != S16)
+ if (!Ty.isVector() || !IsD16)
return true;
Register RepackedReg = handleD16VData(B, *MRI, VData, true);
@@ -4508,9 +4537,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
}
Register DstReg = MI.getOperand(0).getReg();
- LLT Ty = MRI->getType(DstReg);
const LLT EltTy = Ty.getScalarType();
- const bool IsD16 = Ty.getScalarType() == S16;
const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
// Confirm that the return type is large enough for the dmask specified
@@ -4918,6 +4945,12 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
return true;
}
+static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
+ B.buildConstant(MI.getOperand(0).getReg(), C);
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
@@ -5021,12 +5054,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_implicitarg_ptr:
return legalizeImplicitArgPtr(MI, MRI, B);
case Intrinsic::amdgcn_workitem_id_x:
+ if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
+ return replaceWithConstant(B, MI, 0);
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_X);
case Intrinsic::amdgcn_workitem_id_y:
+ if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
+ return replaceWithConstant(B, MI, 0);
+
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
case Intrinsic::amdgcn_workitem_id_z:
+ if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
+ return replaceWithConstant(B, MI, 0);
+
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
case Intrinsic::amdgcn_workgroup_id_x:
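A tiny sketch of the new fold for the workitem-id intrinsics; the helper name is hypothetical and simply mirrors the ST.getMaxWorkitemID checks added above.

#include <cstdint>
#include <optional>

// Returns the constant the intrinsic folds to, or nullopt if it still needs
// to be lowered to the preloaded argument. MaxWorkitemId mirrors
// ST.getMaxWorkitemID(F, Dim), e.g. 0 when the required work-group size is 1
// in that dimension.
std::optional<int64_t> foldWorkitemId(unsigned MaxWorkitemId) {
  if (MaxWorkitemId == 0)
    return 0; // replaceWithConstant(B, MI, 0) in the patch
  return std::nullopt;
}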
@@ -5105,16 +5146,29 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
- case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
- case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
- case Intrinsic::amdgcn_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
return legalizeBufferAtomic(MI, B, IntrID);
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
+ Function &F = B.getMF().getFunction();
+ DiagnosticInfoUnsupported NoFpRet(
+ F, "return versions of fp atomics not supported", B.getDebugLoc(),
+ DS_Error);
+ F.getContext().diagnose(NoFpRet);
+ B.buildUndef(DstReg);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return legalizeBufferAtomic(MI, B, IntrID);
+ }
case Intrinsic::amdgcn_atomic_inc:
return legalizeAtomicIncDec(MI, B, true);
case Intrinsic::amdgcn_atomic_dec:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 7faf0436f995..964a41d3d740 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -21,7 +21,6 @@
namespace llvm {
class GCNTargetMachine;
-class LLVMContext;
class GCNSubtarget;
class MachineIRBuilder;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 49cf6db5197f..c28427758ac7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -58,9 +58,6 @@ private:
// "FuncName" exists. It may create a new function prototype in pre-link mode.
FunctionCallee getFunction(Module *M, const FuncInfo &fInfo);
- // Replace a normal function with its native version.
- bool replaceWithNative(CallInst *CI, const FuncInfo &FInfo);
-
bool parseFunctionName(const StringRef &FMangledName, FuncInfo &FInfo);
bool TDOFold(CallInst *CI, const FuncInfo &FInfo);
@@ -90,24 +87,6 @@ private:
double& Res1, Constant *copr0, Constant *copr1, Constant *copr2);
bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);
- // exp
- bool fold_exp(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
- // exp2
- bool fold_exp2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
- // exp10
- bool fold_exp10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
- // log
- bool fold_log(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
- // log2
- bool fold_log2(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
- // log10
- bool fold_log10(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
// sqrt
bool fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
@@ -623,7 +602,8 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
Function *Callee = CI->getCalledFunction();
// Ignore indirect calls.
- if (Callee == 0) return false;
+ if (Callee == nullptr)
+ return false;
BasicBlock *BB = CI->getParent();
LLVMContext &Context = CI->getParent()->getContext();
@@ -778,27 +758,6 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
return false;
}
-bool AMDGPULibCalls::replaceWithNative(CallInst *CI, const FuncInfo &FInfo) {
- Module *M = CI->getModule();
- if (getArgType(FInfo) != AMDGPULibFunc::F32 ||
- FInfo.getPrefix() != AMDGPULibFunc::NOPFX ||
- !HasNative(FInfo.getId()))
- return false;
-
- AMDGPULibFunc nf = FInfo;
- nf.setPrefix(AMDGPULibFunc::NATIVE);
- if (FunctionCallee FPExpr = getFunction(M, nf)) {
- LLVM_DEBUG(dbgs() << "AMDIC: " << *CI << " ---> ");
-
- CI->setCalledFunction(FPExpr);
-
- LLVM_DEBUG(dbgs() << *CI << '\n');
-
- return true;
- }
- return false;
-}
-
// [native_]half_recip(c) ==> 1.0/c
bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B,
const FuncInfo &FInfo) {
@@ -1402,8 +1361,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
Function *UCallee = UI->getCalledFunction();
Type *RetType = UCallee->getReturnType();
B.SetInsertPoint(&*ItNew);
- AllocaInst *Alloc = B.CreateAlloca(RetType, 0,
- std::string(prefix) + UI->getName());
+ AllocaInst *Alloc =
+ B.CreateAlloca(RetType, nullptr, std::string(prefix) + UI->getName());
Alloc->setAlignment(
Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
return Alloc;
@@ -1724,7 +1683,8 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
// Ignore indirect calls.
Function *Callee = CI->getCalledFunction();
- if (Callee == 0) continue;
+ if (Callee == nullptr)
+ continue;
LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
dbgs().flush());
@@ -1757,7 +1717,7 @@ PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
// Ignore indirect calls.
Function *Callee = CI->getCalledFunction();
- if (Callee == 0)
+ if (Callee == nullptr)
continue;
LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
@@ -1783,9 +1743,10 @@ bool AMDGPUUseNativeCalls::runOnFunction(Function &F) {
// Ignore indirect calls.
Function *Callee = CI->getCalledFunction();
- if (Callee == 0) continue;
+ if (Callee == nullptr)
+ continue;
- if(Simplifier.useNative(CI))
+ if (Simplifier.useNative(CI))
Changed = true;
}
}
@@ -1811,7 +1772,7 @@ PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
// Ignore indirect calls.
Function *Callee = CI->getCalledFunction();
- if (Callee == 0)
+ if (Callee == nullptr)
continue;
if (Simplifier.useNative(CI))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
index c97223b047e8..dc0ac72016f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -10,6 +10,7 @@
#define _AMDGPU_LIBFUNC_H_
#include "llvm/ADT/StringRef.h"
+#include <memory>
namespace llvm {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 0c743a77092c..593388a4d819 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -15,9 +15,8 @@
using namespace llvm;
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
- : MachineFunctionInfo(), Mode(MF.getFunction()),
- IsEntryFunction(
- AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
+ : Mode(MF.getFunction()), IsEntryFunction(AMDGPU::isEntryFunctionCC(
+ MF.getFunction().getCallingConv())),
IsModuleEntryFunction(
AMDGPU::isModuleEntryFunctionCC(MF.getFunction().getCallingConv())),
NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 10ff50040c6a..48cf46b5f871 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -15,8 +15,6 @@
namespace llvm {
-class GCNSubtarget;
-
class AMDGPUMachineFunction : public MachineFunctionInfo {
/// A map to keep track of local memory objects and their offsets within the
/// local memory space.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h b/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
index 8af7979dba8b..5cefc83e25e0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -29,4 +29,4 @@ const char NoteNameV3[] = "AMDGPU";
} // End namespace ElfNote
} // End namespace AMDGPU
} // End namespace llvm
-#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUNOTETYPE_H
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPTNOTE_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 7c4eb71882c7..f91f31508ad2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -463,7 +463,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
WhatToStore.push_back(Arg);
}
} else if (isa<FixedVectorType>(ArgType)) {
- Type *IType = NULL;
+ Type *IType = nullptr;
uint32_t EleCount = cast<FixedVectorType>(ArgType)->getNumElements();
uint32_t EleSize = ArgType->getScalarSizeInBits();
uint32_t TotalSize = EleCount * EleSize;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index f9a9fe403ff6..2d8126a49327 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -789,6 +789,17 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
Align Alignment =
DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
+
+ // HIP uses an extern unsized array in local address space for dynamically
+ // allocated shared memory. In that case, we have to disable the promotion.
+ if (GV->hasExternalLinkage() && AllocSize == 0) {
+ LocalMemLimit = 0;
+ LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
+ "local memory. Promoting to local memory "
+ "disabled.\n");
+ return false;
+ }
+
AllocatedSizes.emplace_back(AllocSize, Alignment);
}
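For reference, the source-level pattern this new early-out looks for is HIP's dynamically allocated shared memory, which lowers to an external, zero-sized LDS global. A HIP-flavoured snippet (requires a HIP compiler; shown only to illustrate the construct, not part of the patch):

// Dynamic shared memory in HIP: extern, unsized, in the local address space.
// Its alloc size is 0 at compile time, which is what trips the check above.
extern __shared__ float dynamicSharedMem[];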
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 3ce67a733c10..0df6f4d45b06 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -36,6 +36,7 @@ protected:
MachineIRBuilder &B;
MachineFunction &MF;
MachineRegisterInfo &MRI;
+ const GCNSubtarget &Subtarget;
const RegisterBankInfo &RBI;
const TargetRegisterInfo &TRI;
const SIInstrInfo &TII;
@@ -44,9 +45,9 @@ protected:
public:
AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
: B(B), MF(B.getMF()), MRI(*B.getMRI()),
- RBI(*MF.getSubtarget().getRegBankInfo()),
- TRI(*MF.getSubtarget().getRegisterInfo()),
- TII(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()), Helper(Helper){};
+ Subtarget(MF.getSubtarget<GCNSubtarget>()),
+ RBI(*Subtarget.getRegBankInfo()), TRI(*Subtarget.getRegisterInfo()),
+ TII(*Subtarget.getInstrInfo()), Helper(Helper){};
bool isVgprRegBank(Register Reg);
Register getAsVgpr(Register Reg);
@@ -193,7 +194,10 @@ bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3(
MachineInstr &MI, Med3MatchInfo &MatchInfo) {
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
- if (Ty != LLT::scalar(16) && Ty != LLT::scalar(32))
+
+ // med3 for f16 is only available on gfx9+, and not available for v2f16.
+ if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) &&
+ Ty != LLT::scalar(32))
return false;
auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index c60012bcfe2e..de2dccef804a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -718,8 +718,11 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
const unsigned WaveAndOpc = Subtarget.isWave32() ?
AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
- const unsigned MovTermOpc = Subtarget.isWave32() ?
- AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
+ const unsigned MovExecOpc =
+ Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ const unsigned MovExecTermOpc =
+ Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
+
const unsigned XorTermOpc = Subtarget.isWave32() ?
AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
@@ -996,12 +999,12 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
// Save the EXEC mask before the loop.
- BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
+ BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
.addReg(ExecReg);
// Restore the EXEC mask after the loop.
B.setMBB(*RestoreExecBB);
- B.buildInstr(MovTermOpc)
+ B.buildInstr(MovExecTermOpc)
.addDef(ExecReg)
.addReg(SaveExecReg);
@@ -2953,7 +2956,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
break;
}
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
- case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
const AMDGPU::RsrcIntrinsic *RSrcIntrin
= AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
assert(RSrcIntrin && RSrcIntrin->IsImage);
@@ -3691,6 +3696,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
break;
}
+ case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
+ // This case is weird because we expect a physical register in the source,
+ // but need to set a bank anyway.
+ //
+ // We could select the result to SGPR or VGPR, but for the one current use
+ // it's more practical to always use VGPR.
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ break;
+ }
case AMDGPU::G_INSERT: {
unsigned BankID = getMappingType(MRI, MI);
unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
@@ -4078,7 +4093,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mqsad_pk_u16_u8:
case Intrinsic::amdgcn_mqsad_u32_u8:
case Intrinsic::amdgcn_cvt_pk_u8_f32:
- case Intrinsic::amdgcn_alignbit:
case Intrinsic::amdgcn_alignbyte:
case Intrinsic::amdgcn_perm:
case Intrinsic::amdgcn_fdot2:
@@ -4276,7 +4290,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
- case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
auto IntrID = MI.getIntrinsicID();
const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 45f7c2f369bd..1c6c63dd5b25 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -353,7 +353,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
// off any return attributes, e.g. zeroext doesn't make sense with a struct.
NewFunc->stealArgumentListFrom(F);
- AttrBuilder RetAttrs;
+ AttributeMask RetAttrs;
RetAttrs.addAttribute(Attribute::SExt);
RetAttrs.addAttribute(Attribute::ZExt);
RetAttrs.addAttribute(Attribute::NoAlias);
@@ -433,7 +433,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
PointerType *ArgType = cast<PointerType>(Arg.getType());
- auto *EltTy = ArgType->getElementType();
+ auto *EltTy = ArgType->getPointerElementType();
const auto Align =
DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index cd05797fdbdb..e82f9232b114 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -269,7 +269,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasGetWaveIdInst(false),
HasSMemTimeInst(false),
HasShaderCyclesRegister(false),
- HasRegisterBanking(false),
HasVOP3Literal(false),
HasNoDataDepHazard(false),
FlatAddressSpace(false),
@@ -772,11 +771,11 @@ unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
}
unsigned
-GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
+GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
- if (HasFlatScratchInit || HasArchitectedFlatScratch) {
+ if (HasFlatScratch || HasArchitectedFlatScratch) {
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
@@ -794,20 +793,11 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
}
unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
- // The logic to detect if the function has
- // flat scratch init is slightly different than how
- // SIMachineFunctionInfo constructor derives.
- // We don't use amdgpu-calls, amdgpu-stack-objects
- // attributes and isAmdHsaOrMesa here as it doesn't really matter.
- // TODO: Outline this derivation logic and have just
- // one common function in the backend to avoid duplication.
- bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
- bool FunctionHasFlatScratchInit = false;
- if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
- enableFlatScratch()) {
- FunctionHasFlatScratchInit = true;
- }
- return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
+ // In principle we do not need to reserve SGPR pair used for flat_scratch if
+ // we know flat instructions do not access the stack anywhere in the
+ // program. For now assume it's needed if we have flat instructions.
+ const bool KernelUsesFlatScratch = hasFlatAddressSpace();
+ return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}
unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 88ed4b2b7a24..7f1b94be4ffe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -212,7 +212,19 @@ public:
/// Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const Function &F) const {
- return isAmdHsaOrMesa(F) ? 0 : 36;
+ switch (TargetTriple.getOS()) {
+ case Triple::AMDHSA:
+ case Triple::AMDPAL:
+ case Triple::Mesa3D:
+ return 0;
+ case Triple::UnknownOS:
+ default:
+ // For legacy reasons unknown/other is treated as a different version of
+ // mesa.
+ return 36;
+ }
+
+ llvm_unreachable("invalid triple OS");
}
/// \returns Maximum number of work groups per compute unit supported by the
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 226646a96953..dd3676f3b707 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -21,8 +21,6 @@
namespace llvm {
-class ScheduleDAGMILive;
-
//===----------------------------------------------------------------------===//
// AMDGPU Target Machine (R600+)
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 09c5eb192e1f..a8df7789c8a1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -844,15 +844,8 @@ bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
TLI->ComputeConstraintToUse(TC, SDValue());
- Register AssignedReg;
- const TargetRegisterClass *RC;
- std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
- TRI, TC.ConstraintCode, TC.ConstraintVT);
- if (AssignedReg) {
- // FIXME: This is a workaround for getRegForInlineAsmConstraint
- // returning VS_32
- RC = TRI->getPhysRegClass(AssignedReg);
- }
+ const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
+ TRI, TC.ConstraintCode, TC.ConstraintVT).second;
// For AGPR constraints null is returned on subtargets without AGPRs, so
// assume divergent for null.
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 2bb59086f391..c1c88d9a7462 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -62,7 +62,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
public:
AMDGPUOperand(KindTy Kind_, const AMDGPUAsmParser *AsmParser_)
- : MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {}
+ : Kind(Kind_), AsmParser(AsmParser_) {}
using Ptr = std::unique_ptr<AMDGPUOperand>;
@@ -1548,6 +1548,7 @@ private:
bool validateVccOperand(unsigned Reg) const;
bool validateVOPLiteral(const MCInst &Inst, const OperandVector &Operands);
bool validateMAIAccWrite(const MCInst &Inst, const OperandVector &Operands);
+ bool validateMFMA(const MCInst &Inst, const OperandVector &Operands);
bool validateAGPRLdSt(const MCInst &Inst) const;
bool validateVGPRAlign(const MCInst &Inst) const;
bool validateGWS(const MCInst &Inst, const OperandVector &Operands);
@@ -3613,6 +3614,40 @@ bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst,
return true;
}
+bool AMDGPUAsmParser::validateMFMA(const MCInst &Inst,
+ const OperandVector &Operands) {
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::IsMAI) == 0)
+ return true;
+
+ const int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (Src2Idx == -1)
+ return true;
+
+ const MCOperand &Src2 = Inst.getOperand(Src2Idx);
+ if (!Src2.isReg())
+ return true;
+
+ MCRegister Src2Reg = Src2.getReg();
+ MCRegister DstReg = Inst.getOperand(0).getReg();
+ if (Src2Reg == DstReg)
+ return true;
+
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ if (TRI->getRegClass(Desc.OpInfo[0].RegClass).getSizeInBits() <= 128)
+ return true;
+
+ if (isRegIntersect(Src2Reg, DstReg, TRI)) {
+ Error(getRegLoc(mc2PseudoReg(Src2Reg), Operands),
+ "source 2 operand must not partially overlap with dst");
+ return false;
+ }
+
+ return true;
+}
+
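The overlap rule can be illustrated with registers modelled as index ranges (a sketch, not the MC API): for wide MAI accumulators, the source-2 register must be either exactly the destination or fully disjoint from it, and a partial overlap is rejected.

#include <cassert>

struct RegRange {
  unsigned First; // first register index
  unsigned Num;   // number of consecutive registers
};

// True when the ranges share registers without being identical, i.e. the
// "partially overlap" case validateMFMA rejects.
bool partiallyOverlaps(RegRange A, RegRange B) {
  const bool Disjoint =
      A.First + A.Num <= B.First || B.First + B.Num <= A.First;
  const bool Identical = A.First == B.First && A.Num == B.Num;
  return !Disjoint && !Identical;
}

int main() {
  assert(!partiallyOverlaps({0, 16}, {0, 16}));  // src2 == dst: allowed
  assert(!partiallyOverlaps({0, 16}, {16, 16})); // disjoint: allowed
  assert(partiallyOverlaps({0, 16}, {8, 16}));   // partial overlap: rejected
  return 0;
}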
bool AMDGPUAsmParser::validateDivScale(const MCInst &Inst) {
switch (Inst.getOpcode()) {
default:
@@ -4297,6 +4332,9 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateMAIAccWrite(Inst, Operands)) {
return false;
}
+ if (!validateMFMA(Inst, Operands)) {
+ return false;
+ }
if (!validateCoherencyBits(Inst, Operands, IDLoc)) {
return false;
}
@@ -4568,7 +4606,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
uint64_t AccumOffset = 0;
SMRange SGPRRange;
uint64_t NextFreeSGPR = 0;
- unsigned UserSGPRCount = 0;
+
+ // Count the number of user SGPRs implied from the enabled feature bits.
+ unsigned ImpliedUserSGPRCount = 0;
+
+ // Track if the asm explicitly contains the directive for the user SGPR
+ // count.
+ Optional<unsigned> ExplicitUserSGPRCount;
bool ReserveVCC = true;
bool ReserveFlatScr = true;
Optional<bool> EnableWavefrontSize32;
@@ -4617,6 +4661,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (!isUInt<sizeof(KD.kernarg_size) * CHAR_BIT>(Val))
return OutOfRangeError(ValRange);
KD.kernarg_size = Val;
+ } else if (ID == ".amdhsa_user_sgpr_count") {
+ ExplicitUserSGPRCount = Val;
} else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") {
if (hasArchitectedFlatScratch())
return Error(IDRange.Start,
@@ -4626,31 +4672,31 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
Val, ValRange);
if (Val)
- UserSGPRCount += 4;
+ ImpliedUserSGPRCount += 4;
} else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val,
ValRange);
if (Val)
- UserSGPRCount += 2;
+ ImpliedUserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_queue_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val,
ValRange);
if (Val)
- UserSGPRCount += 2;
+ ImpliedUserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
Val, ValRange);
if (Val)
- UserSGPRCount += 2;
+ ImpliedUserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_dispatch_id") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val,
ValRange);
if (Val)
- UserSGPRCount += 2;
+ ImpliedUserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
if (hasArchitectedFlatScratch())
return Error(IDRange.Start,
@@ -4660,13 +4706,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
ValRange);
if (Val)
- UserSGPRCount += 2;
+ ImpliedUserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_private_segment_size") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
Val, ValRange);
if (Val)
- UserSGPRCount += 1;
+ ImpliedUserSGPRCount += 1;
} else if (ID == ".amdhsa_wavefront_size32") {
if (IVersion.Major < 10)
return Error(IDRange.Start, "directive requires gfx10+", IDRange);
@@ -4850,6 +4896,13 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT,
SGPRBlocks);
+ if (ExplicitUserSGPRCount && ImpliedUserSGPRCount > *ExplicitUserSGPRCount)
+    return TokError("amdgpu_user_sgpr_count smaller than implied by "
+                    "enabled user SGPRs");
+
+ unsigned UserSGPRCount =
+ ExplicitUserSGPRCount ? *ExplicitUserSGPRCount : ImpliedUserSGPRCount;
+
if (!isUInt<COMPUTE_PGM_RSRC2_USER_SGPR_COUNT_WIDTH>(UserSGPRCount))
return TokError("too many user SGPRs enabled");
AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT,
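A short model of the resolution logic added here (illustrative only): the implied count accumulates 4/2/2/2/2/2/1 SGPRs for the respective enabled user-SGPR features handled earlier in this directive, and an explicit .amdhsa_user_sgpr_count may only meet or exceed that figure.

#include <optional>

// Returns the count written to COMPUTE_PGM_RSRC2_USER_SGPR_COUNT, or nullopt
// on the error path ("smaller than implied by enabled user SGPRs").
std::optional<unsigned> resolveUserSGPRCount(unsigned Implied,
                                             std::optional<unsigned> Explicit) {
  if (Explicit && Implied > *Explicit)
    return std::nullopt;
  return Explicit ? *Explicit : Implied;
}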
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 104b5160b985..c4043177b618 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -89,7 +89,6 @@ class DS_Real <DS_Pseudo ps> :
!if(!or(ps.has_data0, ps.has_gws_data0), data0{9}, 0));
}
-
// DS Pseudo instructions
class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index c7ec5308e6d0..c530d3cb49f0 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -915,7 +915,7 @@ class FlatSignedAtomicPatNoRtn <FLAT_Pseudo inst, SDPatternOperator node, ValueT
class FlatSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt,
ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
- (inst $vaddr, $data, $offset)
+ (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 0f8dd0b3bf58..c0592f6f3c7a 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -95,7 +95,9 @@ static bool isDGEMM(unsigned Opcode) {
return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
- Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
+ Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64 ||
+ Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64 ||
+ Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64;
}
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
@@ -1438,7 +1440,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
if (!Use.isReg())
continue;
- unsigned Reg = Use.getReg();
+ Register Reg = Use.getReg();
bool FullReg;
const MachineInstr *MI1;
@@ -1477,6 +1479,8 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
switch (Opc1) {
case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
if (!isXDL(ST, *MI))
NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
break;
@@ -1509,6 +1513,8 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
switch (Opc1) {
case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
+ case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
break;
case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 162121c2c525..716bc027a894 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -25,7 +25,6 @@ class MachineFunction;
class MachineInstr;
class MachineOperand;
class MachineRegisterInfo;
-class ScheduleDAG;
class SIInstrInfo;
class SIRegisterInfo;
class GCNSubtarget;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 82c09378acac..fb106d98c162 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -27,7 +27,7 @@ void llvm::printLivesAt(SlotIndex SI,
<< *LIS.getInstructionFromIndex(SI);
unsigned Num = 0;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- const unsigned Reg = Register::index2VirtReg(I);
+ const Register Reg = Register::index2VirtReg(I);
if (!LIS.hasInterval(Reg))
continue;
const auto &LI = LIS.getInterval(Reg);
@@ -487,7 +487,7 @@ void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
const MachineRegisterInfo &MRI) {
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- unsigned Reg = Register::index2VirtReg(I);
+ Register Reg = Register::index2VirtReg(I);
auto It = LiveRegs.find(Reg);
if (It != LiveRegs.end() && It->second.any())
OS << ' ' << printVRegOrUnit(Reg, TRI) << ':'
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 53d6ff0aa731..a6e42ad3dfca 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -140,4 +140,4 @@ public:
} // End namespace llvm
-#endif // GCNSCHEDSTRATEGY_H
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index d8bc0b2df2bd..0cd2cfa2f0e7 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -153,7 +153,6 @@ protected:
bool HasGetWaveIdInst;
bool HasSMemTimeInst;
bool HasShaderCyclesRegister;
- bool HasRegisterBanking;
bool HasVOP3Literal;
bool HasNoDataDepHazard;
bool FlatAddressSpace;
@@ -723,10 +722,6 @@ public:
return HasShaderCyclesRegister;
}
- bool hasRegisterBanking() const {
- return HasRegisterBanking;
- }
-
bool hasVOP3Literal() const {
return HasVOP3Literal;
}
@@ -1029,7 +1024,7 @@ public:
/// \returns Reserved number of SGPRs. This is common
/// utility function called by MachineFunction and
/// Function variants of getReservedNumSGPRs.
- unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const;
+ unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
/// \returns Reserved number of SGPRs for given machine function \p MF.
unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index b68b4b12e750..76663b563150 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1397,21 +1397,26 @@ void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
unsigned Vmcnt, Expcnt, Lgkmcnt;
decodeWaitcnt(ISA, SImm16, Vmcnt, Expcnt, Lgkmcnt);
+ bool IsDefaultVmcnt = Vmcnt == getVmcntBitMask(ISA);
+ bool IsDefaultExpcnt = Expcnt == getExpcntBitMask(ISA);
+ bool IsDefaultLgkmcnt = Lgkmcnt == getLgkmcntBitMask(ISA);
+ bool PrintAll = IsDefaultVmcnt && IsDefaultExpcnt && IsDefaultLgkmcnt;
+
bool NeedSpace = false;
- if (Vmcnt != getVmcntBitMask(ISA)) {
+ if (!IsDefaultVmcnt || PrintAll) {
O << "vmcnt(" << Vmcnt << ')';
NeedSpace = true;
}
- if (Expcnt != getExpcntBitMask(ISA)) {
+ if (!IsDefaultExpcnt || PrintAll) {
if (NeedSpace)
O << ' ';
O << "expcnt(" << Expcnt << ')';
NeedSpace = true;
}
- if (Lgkmcnt != getLgkmcntBitMask(ISA)) {
+ if (!IsDefaultLgkmcnt || PrintAll) {
if (NeedSpace)
O << ' ';
O << "lgkmcnt(" << Lgkmcnt << ')';
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 7708579a4491..ded3fb7ab8d9 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -15,8 +15,7 @@
using namespace llvm;
AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT,
- const MCTargetOptions &Options)
- : MCAsmInfoELF() {
+ const MCTargetOptions &Options) {
CodePointerSize = (TT.getArch() == Triple::amdgcn) ? 8 : 4;
StackGrowsUp = true;
HasSingleParameterDotFile = false;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 9a9a2c973f44..9578bdb0bad0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -319,6 +319,10 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
<< KD.private_segment_fixed_size << '\n';
OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n';
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_count", KD,
+ compute_pgm_rsrc2,
+ amdhsa::COMPUTE_PGM_RSRC2_USER_SGPR_COUNT);
+
if (!hasArchitectedFlatScratch(STI))
PRINT_FIELD(
OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 6dd886367302..cf03fd682143 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -131,6 +131,38 @@ def MIMGMIPMappingTable : GenericTable {
let PrimaryKeyName = "getMIMGMIPMappingInfo";
}
+class MIMGBiasMapping<MIMGBaseOpcode bias, MIMGBaseOpcode nobias> {
+ MIMGBaseOpcode Bias = bias;
+ MIMGBaseOpcode NoBias = nobias;
+}
+
+def MIMGBiasMappingTable : GenericTable {
+ let FilterClass = "MIMGBiasMapping";
+ let CppTypeName = "MIMGBiasMappingInfo";
+ let Fields = ["Bias", "NoBias"];
+ string TypeOf_Bias = "MIMGBaseOpcode";
+ string TypeOf_NoBias = "MIMGBaseOpcode";
+
+ let PrimaryKey = ["Bias"];
+ let PrimaryKeyName = "getMIMGBiasMappingInfo";
+}
+
+class MIMGOffsetMapping<MIMGBaseOpcode offset, MIMGBaseOpcode nooffset> {
+ MIMGBaseOpcode Offset = offset;
+ MIMGBaseOpcode NoOffset = nooffset;
+}
+
+def MIMGOffsetMappingTable : GenericTable {
+ let FilterClass = "MIMGOffsetMapping";
+ let CppTypeName = "MIMGOffsetMappingInfo";
+ let Fields = ["Offset", "NoOffset"];
+ string TypeOf_Offset = "MIMGBaseOpcode";
+ string TypeOf_NoOffset = "MIMGBaseOpcode";
+
+ let PrimaryKey = ["Offset"];
+ let PrimaryKeyName = "getMIMGOffsetMappingInfo";
+}
+
class MIMGG16Mapping<MIMGBaseOpcode g, MIMGBaseOpcode g16> {
MIMGBaseOpcode G = g;
MIMGBaseOpcode G16 = g16;
@@ -1070,6 +1102,9 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
AMDGPUDimProps Dim = I.P.Dim;
AMDGPUImageDimIntrinsicEval DimEval = AMDGPUImageDimIntrinsicEval<I.P>;
+ bits<8> NumOffsetArgs = DimEval.NumOffsetArgs;
+ bits<8> NumBiasArgs = DimEval.NumBiasArgs;
+ bits<8> NumZCompareArgs = DimEval.NumZCompareArgs;
bits<8> NumGradients = DimEval.NumGradientArgs;
bits<8> NumDmask = DimEval.NumDmaskArgs;
bits<8> NumData = DimEval.NumDataArgs;
@@ -1078,6 +1113,9 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> DMaskIndex = DimEval.DmaskArgIndex;
bits<8> VAddrStart = DimEval.VAddrArgIndex;
+ bits<8> OffsetIndex = DimEval.OffsetArgIndex;
+ bits<8> BiasIndex = DimEval.BiasArgIndex;
+ bits<8> ZCompareIndex = DimEval.ZCompareArgIndex;
bits<8> GradientStart = DimEval.GradientArgIndex;
bits<8> CoordStart = DimEval.CoordArgIndex;
bits<8> LodIndex = DimEval.LodArgIndex;
@@ -1089,6 +1127,8 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
bits<8> TexFailCtrlIndex = DimEval.TexFailCtrlArgIndex;
bits<8> CachePolicyIndex = DimEval.CachePolicyArgIndex;
+ bits<8> BiasTyArg = !add(I.P.NumRetAndDataAnyTypes,
+ !if(!eq(NumOffsetArgs, 0), 0, I.P.ExtraAddrArgs[0].Type.isAny));
bits<8> GradientTyArg = !add(I.P.NumRetAndDataAnyTypes,
!foldl(0, I.P.ExtraAddrArgs, cnt, arg, !add(cnt, arg.Type.isAny)));
bits<8> CoordTyArg = !add(GradientTyArg, !if(I.P.Gradients, 1, 0));
@@ -1096,10 +1136,10 @@ class ImageDimIntrinsicInfo<AMDGPUImageDimIntrinsic I> {
def ImageDimIntrinsicTable : GenericTable {
let FilterClass = "ImageDimIntrinsicInfo";
- let Fields = ["Intr", "BaseOpcode", "Dim", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs",
- "DMaskIndex", "VAddrStart", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd",
+ let Fields = ["Intr", "BaseOpcode", "Dim", "NumOffsetArgs", "NumBiasArgs", "NumZCompareArgs", "NumGradients", "NumDmask", "NumData", "NumVAddrs", "NumArgs",
+ "DMaskIndex", "VAddrStart", "OffsetIndex", "BiasIndex", "ZCompareIndex", "GradientStart", "CoordStart", "LodIndex", "MipIndex", "VAddrEnd",
"RsrcIndex", "SampIndex", "UnormIndex", "TexFailCtrlIndex", "CachePolicyIndex",
- "GradientTyArg", "CoordTyArg"];
+ "BiasTyArg", "GradientTyArg", "CoordTyArg"];
string TypeOf_BaseOpcode = "MIMGBaseOpcode";
string TypeOf_Dim = "MIMGDim";
@@ -1132,6 +1172,66 @@ def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>;
def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>;
def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>;
+// Bias to NoBias Optimization Mapping
+def : MIMGBiasMapping<IMAGE_SAMPLE_B, IMAGE_SAMPLE>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_CL, IMAGE_SAMPLE_CL>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B, IMAGE_SAMPLE_C>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_CL, IMAGE_SAMPLE_C_CL>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_O, IMAGE_SAMPLE_O>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_B_CL_O, IMAGE_SAMPLE_CL_O>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_O, IMAGE_SAMPLE_C_O>;
+def : MIMGBiasMapping<IMAGE_SAMPLE_C_B_CL_O, IMAGE_SAMPLE_C_CL_O>;
+def : MIMGBiasMapping<IMAGE_GATHER4_B, IMAGE_GATHER4>;
+def : MIMGBiasMapping<IMAGE_GATHER4_B_CL, IMAGE_GATHER4_CL>;
+def : MIMGBiasMapping<IMAGE_GATHER4_C_B, IMAGE_GATHER4_C>;
+def : MIMGBiasMapping<IMAGE_GATHER4_C_B_CL, IMAGE_GATHER4_C_CL>;
+def : MIMGBiasMapping<IMAGE_GATHER4_B_O, IMAGE_GATHER4_O>;
+def : MIMGBiasMapping<IMAGE_GATHER4_B_CL_O, IMAGE_GATHER4_CL_O>;
+def : MIMGBiasMapping<IMAGE_GATHER4_C_B_O, IMAGE_GATHER4_C_O>;
+def : MIMGBiasMapping<IMAGE_GATHER4_C_B_CL_O, IMAGE_GATHER4_C_CL_O>;
+
+// Offset to NoOffset Optimization Mapping
+def : MIMGOffsetMapping<IMAGE_SAMPLE_O, IMAGE_SAMPLE>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CL_O, IMAGE_SAMPLE_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_D_O, IMAGE_SAMPLE_D>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_D_CL_O, IMAGE_SAMPLE_D_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_D_O_G16, IMAGE_SAMPLE_D_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_D_CL_O_G16, IMAGE_SAMPLE_D_CL_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_L_O, IMAGE_SAMPLE_L>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_B_O, IMAGE_SAMPLE_B>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_B_CL_O, IMAGE_SAMPLE_B_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_LZ_O, IMAGE_SAMPLE_LZ>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_O, IMAGE_SAMPLE_C>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CL_O, IMAGE_SAMPLE_C_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_O, IMAGE_SAMPLE_C_D>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_CL_O, IMAGE_SAMPLE_C_D_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_O_G16, IMAGE_SAMPLE_C_D_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_D_CL_O_G16, IMAGE_SAMPLE_C_D_CL_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_L_O, IMAGE_SAMPLE_C_L>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_B_CL_O, IMAGE_SAMPLE_C_B_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_B_O, IMAGE_SAMPLE_C_B>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_LZ_O, IMAGE_SAMPLE_C_LZ>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_O, IMAGE_GATHER4>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_CL_O, IMAGE_GATHER4_CL>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_L_O, IMAGE_GATHER4_L>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_B_O, IMAGE_GATHER4_B>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_B_CL_O, IMAGE_GATHER4_B_CL>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_LZ_O, IMAGE_GATHER4_LZ>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_C_O, IMAGE_GATHER4_C>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_C_CL_O, IMAGE_GATHER4_C_CL>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_L>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_C_B_O, IMAGE_GATHER4_C_B>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_C_B_CL_O, IMAGE_GATHER4_C_B_CL>;
+def : MIMGOffsetMapping<IMAGE_GATHER4_C_LZ_O, IMAGE_GATHER4_C_LZ>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_O, IMAGE_SAMPLE_CD>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_CL_O, IMAGE_SAMPLE_CD_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_O, IMAGE_SAMPLE_C_CD>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_CL_O, IMAGE_SAMPLE_C_CD_CL>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_O_G16, IMAGE_SAMPLE_CD_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_CD_CL_O_G16, IMAGE_SAMPLE_CD_CL_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_O_G16, IMAGE_SAMPLE_C_CD_G16>;
+def : MIMGOffsetMapping<IMAGE_SAMPLE_C_CD_CL_O_G16, IMAGE_SAMPLE_C_CD_CL_G16>;
+
// G to G16 Optimization Mapping
def : MIMGG16Mapping<IMAGE_SAMPLE_D, IMAGE_SAMPLE_D_G16>;
def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL, IMAGE_SAMPLE_D_CL_G16>;
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index f9a9a6127322..1e75a0432ec3 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -19,7 +19,6 @@
namespace llvm {
-class R600InstrInfo;
class R600Subtarget;
class R600TargetLowering final : public AMDGPUTargetLowering {
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.h b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
index fc567f1a1fca..bc8a4786df77 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.h
@@ -29,7 +29,6 @@ enum : uint64_t {
};
}
-class AMDGPUTargetMachine;
class DFAPacketizer;
class MachineFunction;
class MachineInstr;
diff --git a/llvm/lib/Target/AMDGPU/R600Subtarget.h b/llvm/lib/Target/AMDGPU/R600Subtarget.h
index 94403b88f21a..92d559b1f8e6 100644
--- a/llvm/lib/Target/AMDGPU/R600Subtarget.h
+++ b/llvm/lib/Target/AMDGPU/R600Subtarget.h
@@ -21,12 +21,6 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-namespace llvm {
-
-class MCInstrInfo;
-
-} // namespace llvm
-
#define GET_SUBTARGETINFO_HEADER
#include "R600GenSubtargetInfo.inc"
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 397b2f873515..b81fac36fc95 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -245,6 +245,12 @@ Value *SIAnnotateControlFlow::handleLoopCondition(
return CallInst::Create(IfBreak, Args, "", Insert);
}
+ if (isa<Argument>(Cond)) {
+ Instruction *Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime();
+ Value *Args[] = { Cond, Broken };
+ return CallInst::Create(IfBreak, Args, "", Insert);
+ }
+
llvm_unreachable("Unhandled loop condition!");
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 580e4bc417a4..107ee5ed5532 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -379,6 +379,8 @@ enum Id { // HwRegCode, (6) [5:0]
ID_FLAT_SCR_LO = 20,
ID_FLAT_SCR_HI = 21,
ID_XNACK_MASK = 22,
+ ID_HW_ID1 = 23,
+ ID_HW_ID2 = 24,
ID_POPS_PACKER = 25,
ID_SHADER_CYCLES = 29,
ID_SYMBOLIC_FIRST_GFX1030_ = ID_SHADER_CYCLES,
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 1f93284fc7ee..33954e11d6c6 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -300,6 +300,13 @@ static bool updateOperand(FoldCandidate &Fold,
assert(!Fold.needsShrink() && "not handled");
if (Fold.isImm()) {
+ if (Old.isTied()) {
+ int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(MI->getOpcode());
+ if (NewMFMAOpc == -1)
+ return false;
+ MI->setDesc(TII.get(NewMFMAOpc));
+ MI->untieRegOperand(0);
+ }
Old.ChangeToImmediate(Fold.ImmToFold);
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index d4fe74ecb96e..6078f4a0577a 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1195,7 +1195,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
}
} else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
TII->isLoadFromStackSlot(MI, FrameIndex))
- NonVGPRSpillFIs.set(FrameIndex);
+ if (!MFI.isFixedObjectIndex(FrameIndex))
+ NonVGPRSpillFIs.set(FrameIndex);
}
}
@@ -1320,16 +1321,14 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
const BitVector AllSavedRegs = SavedRegs;
SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
- // If clearing VGPRs changed the mask, we will have some CSR VGPR spills.
- const bool HaveAnyCSRVGPR = SavedRegs != AllSavedRegs;
-
// We have to anticipate introducing CSR VGPR spills or spill of caller
// save VGPR reserved for SGPR spills as we now always create stack entry
- // for it, if we don't have any stack objects already, since we require
- // an FP if there is a call and stack.
+ // for it, if we don't have any stack objects already, since we require an FP
+ // if there is a call and stack. We will allocate a VGPR for SGPR spills if
+ // there are any SGPR spills, whether they are CSR spills or otherwise.
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
const bool WillHaveFP =
- FrameInfo.hasCalls() && (HaveAnyCSRVGPR || MFI->VGPRReservedForSGPRSpill);
+ FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
// FP will be specially managed like SP.
if (WillHaveFP || hasFP(MF))
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 56fbb875ffd9..7949dcfa6632 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -13,11 +13,6 @@
namespace llvm {
-class SIInstrInfo;
-class SIMachineFunctionInfo;
-class SIRegisterInfo;
-class GCNSubtarget;
-
class SIFrameLowering final : public AMDGPUFrameLowering {
public:
SIFrameLowering(StackDirection D, Align StackAl, int LAO,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9f138136e6e9..561866b5a398 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -45,10 +45,6 @@ static cl::opt<bool> DisableLoopAlignment(
cl::desc("Do not align and prefetch loops"),
cl::init(false));
-static cl::opt<bool> VGPRReserveforSGPRSpill(
- "amdgpu-reserve-vgpr-for-sgpr-spill",
- cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
-
static cl::opt<bool> UseDivergentRegisterIndexing(
"amdgpu-use-divergent-register-indexing",
cl::Hidden,
@@ -138,6 +134,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
+ addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
+ addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -273,7 +271,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
- MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {
+ MVT::v8i16, MVT::v8f16, MVT::v16i64, MVT::v16f64,
+ MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -615,7 +614,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (STI.hasMadF16())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
- for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
+ for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
+ MVT::v8f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -677,6 +677,21 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v4f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+ setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
+
+ setOperationAction(ISD::STORE, MVT::v4i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
+ setOperationAction(ISD::STORE, MVT::v4f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
+
+ setOperationAction(ISD::STORE, MVT::v8i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v8f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
+
setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
@@ -686,6 +701,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Expand);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Expand);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Expand);
+
if (!Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
@@ -703,9 +722,20 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMINNUM_IEEE, MVT::v8f16, Custom);
+ setOperationAction(ISD::FMAXNUM_IEEE, MVT::v8f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
+ setOperationAction(ISD::FMINNUM, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMAXNUM, MVT::v8f16, Expand);
+
+ for (MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand);
+ }
}
if (Subtarget->hasVOP3PInsts()) {
@@ -739,34 +769,42 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f16, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
- setOperationAction(ISD::SHL, MVT::v4i16, Custom);
- setOperationAction(ISD::SRA, MVT::v4i16, Custom);
- setOperationAction(ISD::SRL, MVT::v4i16, Custom);
- setOperationAction(ISD::ADD, MVT::v4i16, Custom);
- setOperationAction(ISD::SUB, MVT::v4i16, Custom);
- setOperationAction(ISD::MUL, MVT::v4i16, Custom);
+ for (MVT VT : { MVT::v4i16, MVT::v8i16 }) {
+ // Split vector operations.
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
- setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
- setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
- setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
- setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
+ setOperationAction(ISD::SMIN, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Custom);
+ setOperationAction(ISD::UMIN, VT, Custom);
+ setOperationAction(ISD::UMAX, VT, Custom);
- setOperationAction(ISD::UADDSAT, MVT::v4i16, Custom);
- setOperationAction(ISD::SADDSAT, MVT::v4i16, Custom);
- setOperationAction(ISD::USUBSAT, MVT::v4i16, Custom);
- setOperationAction(ISD::SSUBSAT, MVT::v4i16, Custom);
+ setOperationAction(ISD::UADDSAT, VT, Custom);
+ setOperationAction(ISD::SADDSAT, VT, Custom);
+ setOperationAction(ISD::USUBSAT, VT, Custom);
+ setOperationAction(ISD::SSUBSAT, VT, Custom);
+ }
- setOperationAction(ISD::FADD, MVT::v4f16, Custom);
- setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
- setOperationAction(ISD::FMA, MVT::v4f16, Custom);
+ for (MVT VT : { MVT::v4f16, MVT::v8f16 }) {
+ // Split vector operations.
+ setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::FMUL, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Custom);
+ setOperationAction(ISD::FCANONICALIZE, VT, Custom);
+ }
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
- setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
@@ -803,7 +841,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FABS, MVT::v2f16, Custom);
}
- for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
+ for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
+ MVT::v8i16, MVT::v8f16 }) {
setOperationAction(ISD::SELECT, VT, Custom);
}
@@ -2776,6 +2815,7 @@ void SITargetLowering::passSpecialInputs(
SelectionDAG &DAG = CLI.DAG;
const SDLoc &DL = CLI.DL;
+ const Function &F = DAG.getMachineFunction().getFunction();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
@@ -2887,11 +2927,16 @@ void SITargetLowering::passSpecialInputs(
// If incoming ids are not packed we need to pack them.
if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
- NeedWorkItemIDX)
- InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
+ NeedWorkItemIDX) {
+ if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
+ InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
+ } else {
+ InputReg = DAG.getConstant(0, DL, MVT::i32);
+ }
+ }
if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
- NeedWorkItemIDY) {
+ NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
DAG.getShiftAmountConstant(10, MVT::i32, SL));
@@ -2900,7 +2945,7 @@ void SITargetLowering::passSpecialInputs(
}
if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
- NeedWorkItemIDZ) {
+ NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
DAG.getShiftAmountConstant(20, MVT::i32, SL));
@@ -2909,13 +2954,21 @@ void SITargetLowering::passSpecialInputs(
}
if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
- // Workitem ids are already packed, any of present incoming arguments
- // will carry all required fields.
- ArgDescriptor IncomingArg = ArgDescriptor::createArg(
- IncomingArgX ? *IncomingArgX :
- IncomingArgY ? *IncomingArgY :
- *IncomingArgZ, ~0u);
- InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
+ if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
+ // We're in a situation where the outgoing function requires the workitem
+ // ID, but the calling function does not have it (e.g. a graphics function
+ // calling a C calling convention function). This is illegal, but we need
+ // to produce something.
+ InputReg = DAG.getUNDEF(MVT::i32);
+ } else {
+ // Workitem ids are already packed, so any of the present incoming arguments
+ // will carry all required fields.
+ ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+ IncomingArgX ? *IncomingArgX :
+ IncomingArgY ? *IncomingArgY :
+ *IncomingArgZ, ~0u);
+ InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
+ }
}
if (OutgoingArg->isRegister()) {
@@ -4600,7 +4653,8 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
- VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
+ VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8f32 ||
+ VT == MVT::v16f32 || VT == MVT::v32f32);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4621,21 +4675,26 @@ SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
- VT == MVT::v8f32 || VT == MVT::v16f32 || VT == MVT::v32f32);
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
+ VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v8f32 ||
+ VT == MVT::v16f32 || VT == MVT::v32f32);
SDValue Lo0, Hi0;
- std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ SDValue Op0 = Op.getOperand(0);
+ std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
+ ? DAG.SplitVectorOperand(Op.getNode(), 0)
+ : std::make_pair(Op0, Op0);
SDValue Lo1, Hi1;
std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
SDValue Lo2, Hi2;
std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
SDLoc SL(Op);
+ auto ResVT = DAG.GetSplitDestVTs(VT);
- SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,
+ SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());
- SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,
+ SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
@@ -5297,7 +5356,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
if (IsIEEEMode)
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
- if (VT == MVT::v4f16)
+ if (VT == MVT::v4f16 || VT == MVT::v8f16)
return splitBinaryVectorOp(Op, DAG);
return Op;
}
@@ -5501,6 +5560,22 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
MachineMemOperand::MOInvariant);
}
+/// Return true if the value is a known valid address, such that a null check is
+/// not necessary.
+static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
+ const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
+ if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
+ isa<BasicBlockSDNode>(Val))
+ return true;
+
+ if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
+ return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
+
+ // TODO: Search through arithmetic, handle arguments and loads
+ // marked nonnull.
+ return false;
+}
+
SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SelectionDAG &DAG) const {
SDLoc SL(Op);
@@ -5508,48 +5583,64 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
SDValue Src = ASC->getOperand(0);
SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
+ unsigned SrcAS = ASC->getSrcAddressSpace();
const AMDGPUTargetMachine &TM =
static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
// flat -> local/private
- if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
+ if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
unsigned DestAS = ASC->getDestAddressSpace();
if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
+ SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+
+ if (isKnownNonNull(Src, DAG, TM, SrcAS))
+ return Ptr;
+
unsigned NullVal = TM.getNullPointerValue(DestAS);
SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
- SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
- return DAG.getNode(ISD::SELECT, SL, MVT::i32,
- NonNull, Ptr, SegmentNullPtr);
+ return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
+ SegmentNullPtr);
}
}
// local/private -> flat
if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
- unsigned SrcAS = ASC->getSrcAddressSpace();
-
if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
+
+ SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
+ SDValue CvtPtr =
+ DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
+ CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
+
+ if (isKnownNonNull(Src, DAG, TM, SrcAS))
+ return CvtPtr;
+
unsigned NullVal = TM.getNullPointerValue(SrcAS);
SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
SDValue NonNull
= DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
- SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
- SDValue CvtPtr
- = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
-
- return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
- DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
+ return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
FlatNullPtr);
}
}
+ if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+ Op.getValueType() == MVT::i64) {
+ const SIMachineFunctionInfo *Info =
+ DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
+ SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
+ return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
+ }
+
if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
Src.getValueType() == MVT::i64)
return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
@@ -5676,7 +5767,6 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
EVT VecVT = Vec.getValueType();
unsigned VecSize = VecVT.getSizeInBits();
EVT EltVT = VecVT.getVectorElementType();
- assert(VecSize <= 64);
DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
@@ -5687,6 +5777,28 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
+ if (VecSize == 128) {
+ SDValue Lo, Hi;
+ EVT LoVT, HiVT;
+ SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
+ Lo =
+ DAG.getBitcast(LoVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
+ V2, DAG.getConstant(0, SL, MVT::i32)));
+ Hi =
+ DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64,
+ V2, DAG.getConstant(1, SL, MVT::i32)));
+ EVT IdxVT = Idx.getValueType();
+ unsigned NElem = VecVT.getVectorNumElements();
+ assert(isPowerOf2_32(NElem));
+ SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
+ SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
+ SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
+ }
+
+ assert(VecSize <= 64);
+
unsigned EltSize = EltVT.getSizeInBits();
assert(isPowerOf2_32(EltSize));
@@ -5769,20 +5881,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SDLoc SL(Op);
EVT VT = Op.getValueType();
- if (VT == MVT::v4i16 || VT == MVT::v4f16) {
- EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
+ if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v8i16 || VT == MVT::v8f16) {
+ EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
+ VT.getVectorNumElements() / 2);
+ MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
// Turn into pair of packed build_vectors.
// TODO: Special case for constants that can be materialized with s_mov_b64.
- SDValue Lo = DAG.getBuildVector(HalfVT, SL,
- { Op.getOperand(0), Op.getOperand(1) });
- SDValue Hi = DAG.getBuildVector(HalfVT, SL,
- { Op.getOperand(2), Op.getOperand(3) });
+ SmallVector<SDValue, 4> LoOps, HiOps;
+ for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
+ LoOps.push_back(Op.getOperand(I));
+ HiOps.push_back(Op.getOperand(I + E));
+ }
+ SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
+ SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
- SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
- SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
+ SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
+ SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
- SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
+ SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
+ { CastLo, CastHi });
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
@@ -6155,10 +6274,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
- const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
- AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
- const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
- AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
unsigned IntrOpcode = Intr->BaseOpcode;
bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
@@ -6246,28 +6361,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
SmallVector<SDValue, 4> VAddrs;
- // Optimize _L to _LZ when _L is zero
- if (LZMappingInfo) {
- if (auto *ConstantLod = dyn_cast<ConstantFPSDNode>(
- Op.getOperand(ArgOffset + Intr->LodIndex))) {
- if (ConstantLod->isZero() || ConstantLod->isNegative()) {
- IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
- VAddrEnd--; // remove 'lod'
- }
- }
- }
-
- // Optimize _mip away, when 'lod' is zero
- if (MIPMappingInfo) {
- if (auto *ConstantLod = dyn_cast<ConstantSDNode>(
- Op.getOperand(ArgOffset + Intr->MipIndex))) {
- if (ConstantLod->isZero()) {
- IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
- VAddrEnd--; // remove 'mip'
- }
- }
- }
-
// Check for 16 bit addresses or derivatives and pack if true.
MVT VAddrVT =
Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
@@ -6283,12 +6376,18 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
// Push back extra arguments.
for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
+ assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
// Special handling of bias when A16 is on. Bias is of type half but
// occupies full 32-bit.
- SDValue bias = DAG.getBuildVector( MVT::v2f16, DL, {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
- VAddrs.push_back(bias);
- } else
+ SDValue Bias = DAG.getBuildVector(
+ MVT::v2f16, DL,
+ {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
+ VAddrs.push_back(Bias);
+ } else {
+ assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
+ "Bias needs to be converted to 16 bit in A16 mode");
VAddrs.push_back(Op.getOperand(ArgOffset + I));
+ }
}
if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
@@ -6731,14 +6830,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
case Intrinsic::amdgcn_workitem_id_x:
+ if (Subtarget->getMaxWorkitemID(MF.getFunction(), 0) == 0)
+ return DAG.getConstant(0, DL, MVT::i32);
+
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDX);
case Intrinsic::amdgcn_workitem_id_y:
+ if (Subtarget->getMaxWorkitemID(MF.getFunction(), 1) == 0)
+ return DAG.getConstant(0, DL, MVT::i32);
+
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
+ if (Subtarget->getMaxWorkitemID(MF.getFunction(), 2) == 0)
+ return DAG.getConstant(0, DL, MVT::i32);
+
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDZ);
@@ -6899,9 +7007,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getConstant(1, SL, MVT::i32));
return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
}
- case Intrinsic::amdgcn_alignbit:
- return DAG.getNode(ISD::FSHR, DL, VT,
- Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
case Intrinsic::amdgcn_perm:
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
@@ -8408,21 +8513,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
+ if (VT.getSizeInBits() == 128)
+ return splitTernaryVectorOp(Op, DAG);
+
assert(VT.getSizeInBits() == 64);
SDLoc DL(Op);
SDValue Cond = Op.getOperand(0);
- if (Subtarget->hasScalarCompareEq64() && Op->getOperand(0)->hasOneUse() &&
- !Op->isDivergent()) {
- if (VT == MVT::i64)
- return Op;
- SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(1));
- SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(2));
- return DAG.getNode(ISD::BITCAST, DL, VT,
- DAG.getSelect(DL, MVT::i64, Cond, LHS, RHS));
- }
-
SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
SDValue One = DAG.getConstant(1, DL, MVT::i32);
@@ -9550,6 +9648,9 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
SDValue SITargetLowering::performXorCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
+ if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
+ return RV;
+
EVT VT = N->getValueType(0);
if (VT != MVT::i64)
return SDValue();
@@ -10462,6 +10563,9 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
+ if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
+ return SDValue();
+
unsigned Opc = N->getOpcode();
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -10483,12 +10587,6 @@ SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
if (Op1->isDivergent())
std::swap(Op1, Op2);
- // If either operand is constant this will conflict with
- // DAGCombiner::ReassociateOps().
- if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
- DAG.isConstantIntBuildVectorOrConstantInt(Op1))
- return SDValue();
-
SDLoc SL(N);
SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
return DAG.getNode(Opc, SL, VT, Add1, Op2);
@@ -11130,7 +11228,9 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
- Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
+ Node->getConstantOperandVal(LWEIdx))
+ ? true
+ : false;
unsigned TFCLane = 0;
bool HasChain = Node->getNumValues() > 1;
@@ -11719,25 +11819,51 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
return std::make_pair(0U, RC);
}
- if (Constraint.size() > 1) {
- if (Constraint[1] == 'v') {
+ if (Constraint.startswith("{") && Constraint.endswith("}")) {
+ StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
+ if (RegName.consume_front("v")) {
RC = &AMDGPU::VGPR_32RegClass;
- } else if (Constraint[1] == 's') {
+ } else if (RegName.consume_front("s")) {
RC = &AMDGPU::SGPR_32RegClass;
- } else if (Constraint[1] == 'a') {
+ } else if (RegName.consume_front("a")) {
RC = &AMDGPU::AGPR_32RegClass;
}
if (RC) {
uint32_t Idx;
- bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
- if (!Failed && Idx < RC->getNumRegs())
- return std::make_pair(RC->getRegister(Idx), RC);
+ if (RegName.consume_front("[")) {
+ uint32_t End;
+ bool Failed = RegName.consumeInteger(10, Idx);
+ Failed |= !RegName.consume_front(":");
+ Failed |= RegName.consumeInteger(10, End);
+ Failed |= !RegName.consume_back("]");
+ if (!Failed) {
+ uint32_t Width = (End - Idx + 1) * 32;
+ MCRegister Reg = RC->getRegister(Idx);
+ if (SIRegisterInfo::isVGPRClass(RC))
+ RC = TRI->getVGPRClassForBitWidth(Width);
+ else if (SIRegisterInfo::isSGPRClass(RC))
+ RC = TRI->getSGPRClassForBitWidth(Width);
+ else if (SIRegisterInfo::isAGPRClass(RC))
+ RC = TRI->getAGPRClassForBitWidth(Width);
+ if (RC) {
+ Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
+ return std::make_pair(Reg, RC);
+ }
+ }
+ } else {
+ bool Failed = RegName.getAsInteger(10, Idx);
+ if (!Failed && Idx < RC->getNumRegs())
+ return std::make_pair(RC->getRegister(Idx), RC);
+ }
}
}
- // FIXME: Returns VS_32 for physical SGPR constraints
- return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
+ if (Ret.first)
+ Ret.second = TRI->getPhysRegClass(Ret.first);
+
+ return Ret;
}
static bool isImmConstraint(StringRef Constraint) {
@@ -11975,13 +12101,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
}
TargetLoweringBase::finalizeLowering(MF);
-
- // Allocate a VGPR for future SGPR Spill if
- // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
- // FIXME: We won't need this hack if we split SGPR allocation from VGPR
- if (VGPRReserveforSGPRSpill && TRI->spillSGPRToVGPR() &&
- !Info->VGPRReservedForSGPRSpill && !Info->isEntryFunction())
- Info->reserveVGPRforSGPRSpills(MF);
}
void SITargetLowering::computeKnownBitsForFrameIndex(
@@ -12441,17 +12560,10 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
for (auto &TC : TargetConstraints) {
if (TC.Type == InlineAsm::isOutput) {
ComputeConstraintToUse(TC, SDValue());
- unsigned AssignedReg;
- const TargetRegisterClass *RC;
- std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
- SIRI, TC.ConstraintCode, TC.ConstraintVT);
- if (RC) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
- return true;
- else if (SIRI->isSGPRClass(RC))
- return true;
- }
+ const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
+ SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
+ if (RC && SIRI->isSGPRClass(RC))
+ return true;
}
}
}
@@ -12475,3 +12587,27 @@ SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
Cost.first += (Size + 255) / 256;
return Cost;
}
+
+bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
+ SDNode::use_iterator I = N->use_begin(), E = N->use_end();
+ for (; I != E; ++I) {
+ if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
+ if (getBasePtrIndex(M) == I.getOperandNo())
+ return true;
+ }
+ }
+ return false;
+}
+
+bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
+ SDValue N1) const {
+ if (!N0.hasOneUse())
+ return false;
+ // Take the opportunity to keep N0 uniform.
+ if (N0->isDivergent() || !N1->isDivergent())
+ return true;
+ // Check if we have a good chance to form the memory access pattern with the
+ // base and offset
+ return (DAG.isBaseWithConstantOffset(N0) &&
+ hasMemSDNodeUser(*N0->use_begin()));
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1315cc15dd02..bf81e082b478 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -449,6 +449,11 @@ public:
bool isSDNodeSourceOfDivergence(const SDNode *N,
FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override;
+ bool hasMemSDNodeUser(SDNode *N) const;
+
+ bool isReassocProfitable(SelectionDAG &DAG, SDValue N0,
+ SDValue N1) const override;
+
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
bool isCanonicalized(Register Reg, MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6fbe5d45ce0a..f8a10bc8ef6f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -863,7 +863,7 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
Wait.ExpCnt = ~0u;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI << "New Instr: " << *WaitcntInstr
+ << "Old Instr: " << *MI << "New Instr: " << *WaitcntInstr
<< '\n');
} else {
WaitcntInstr->eraseFromParent();
@@ -886,7 +886,7 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
Wait.VsCnt = ~0u;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI
+ << "Old Instr: " << *MI
<< "New Instr: " << *WaitcntVsCntInstr << '\n');
} else {
WaitcntVsCntInstr->eraseFromParent();
@@ -1382,7 +1382,6 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
for (auto T : inst_counter_types()) {
// Merge event flags for this counter
- const bool OldOutOfOrder = counterOutOfOrder(T);
const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
if (OtherEvents & ~OldEvents)
@@ -1425,7 +1424,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
}
}
- if (RegStrictDom && !OldOutOfOrder)
+ if (RegStrictDom)
StrictDom = true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1755b93538ce..0a2f9381e71f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -130,10 +130,24 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
return false;
}
+static bool readsExecAsData(const MachineInstr &MI) {
+ if (MI.isCompare())
+ return true;
+
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ case AMDGPU::V_READFIRSTLANE_B32:
+ return true;
+ }
+
+ return false;
+}
+
bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
// Any implicit use of exec by VALU is not a real register read.
return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
- isVALU(*MO.getParent());
+ isVALU(*MO.getParent()) && !readsExecAsData(*MO.getParent());
}
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
@@ -3184,10 +3198,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
+ int NewMFMAOpc = -1;
switch (Opc) {
default:
- return nullptr;
+ NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
+ if (NewMFMAOpc == -1)
+ return nullptr;
+ break;
case AMDGPU::V_MAC_F16_e64:
case AMDGPU::V_FMAC_F16_e64:
IsF16 = true;
@@ -3216,6 +3234,19 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
}
}
+ MachineInstrBuilder MIB;
+ MachineBasicBlock &MBB = *MI.getParent();
+
+ if (NewMFMAOpc != -1) {
+ MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+ MIB.add(MI.getOperand(I));
+ updateLiveVariables(LV, MI, *MIB);
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *MIB);
+ return MIB;
+ }
+
const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
const MachineOperand *Src0Mods =
@@ -3226,8 +3257,6 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
- MachineInstrBuilder MIB;
- MachineBasicBlock &MBB = *MI.getParent();
if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 &&
// If we have an SGPR input, we will violate the constant bus restriction.
@@ -4520,6 +4549,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
+ const MachineOperand &SrcOp = MI.getOperand(1);
+ if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
+ ErrInfo = "pseudo expects only physical SGPRs";
+ return false;
+ }
+ }
+
return true;
}
@@ -6122,11 +6159,8 @@ MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
continue;
case AMDGPU::S_CSELECT_B32:
- lowerSelect32(Worklist, Inst, MDT);
- Inst.eraseFromParent();
- continue;
case AMDGPU::S_CSELECT_B64:
- splitSelect64(Worklist, Inst, MDT);
+ lowerSelect(Worklist, Inst, MDT);
Inst.eraseFromParent();
continue;
case AMDGPU::S_CMP_EQ_I32:
@@ -6304,8 +6338,8 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
return std::make_pair(false, nullptr);
}
-void SIInstrInfo::lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT) const {
+void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6380,95 +6414,6 @@ void SIInstrInfo::lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::splitSelect64(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT) const {
- // Split S_CSELECT_B64 into a pair of S_CSELECT_B32 and lower them
- // further.
- const DebugLoc &DL = Inst.getDebugLoc();
- MachineBasicBlock::iterator MII = Inst;
- MachineBasicBlock &MBB = *Inst.getParent();
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-
- // Get the original operands.
- MachineOperand &Dest = Inst.getOperand(0);
- MachineOperand &Src0 = Inst.getOperand(1);
- MachineOperand &Src1 = Inst.getOperand(2);
- MachineOperand &Cond = Inst.getOperand(3);
-
- Register SCCSource = Cond.getReg();
- bool IsSCC = (SCCSource == AMDGPU::SCC);
-
- // If this is a trivial select where the condition is effectively not SCC
- // (SCCSource is a source of copy to SCC), then the select is semantically
- // equivalent to copying SCCSource. Hence, there is no need to create
- // V_CNDMASK, we can just use that and bail out.
- if (!IsSCC && (Src0.isImm() && Src0.getImm() == -1) &&
- (Src1.isImm() && Src1.getImm() == 0)) {
- MRI.replaceRegWith(Dest.getReg(), SCCSource);
- return;
- }
-
- // Prepare the split destination.
- Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
- Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- // Split the source operands.
- const TargetRegisterClass *Src0RC = nullptr;
- const TargetRegisterClass *Src0SubRC = nullptr;
- if (Src0.isReg()) {
- Src0RC = MRI.getRegClass(Src0.getReg());
- Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
- }
- const TargetRegisterClass *Src1RC = nullptr;
- const TargetRegisterClass *Src1SubRC = nullptr;
- if (Src1.isReg()) {
- Src1RC = MRI.getRegClass(Src1.getReg());
- Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
- }
- // Split lo.
- MachineOperand SrcReg0Sub0 =
- buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
- MachineOperand SrcReg1Sub0 =
- buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
- // Split hi.
- MachineOperand SrcReg0Sub1 =
- buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
- MachineOperand SrcReg1Sub1 =
- buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
- // Select the lo part.
- MachineInstr *LoHalf =
- BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub0)
- .add(SrcReg0Sub0)
- .add(SrcReg1Sub0);
- // Replace the condition operand with the original one.
- LoHalf->getOperand(3).setReg(SCCSource);
- Worklist.insert(LoHalf);
- // Select the hi part.
- MachineInstr *HiHalf =
- BuildMI(MBB, MII, DL, get(AMDGPU::S_CSELECT_B32), DestSub1)
- .add(SrcReg0Sub1)
- .add(SrcReg1Sub1);
- // Replace the condition operand with the original one.
- HiHalf->getOperand(3).setReg(SCCSource);
- Worklist.insert(HiHalf);
- // Merge them back to the original 64-bit one.
- BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
- .addReg(DestSub0)
- .addImm(AMDGPU::sub0)
- .addReg(DestSub1)
- .addImm(AMDGPU::sub1);
- MRI.replaceRegWith(Dest.getReg(), FullDestReg);
-
- // Try to legalize the operands in case we need to swap the order to keep
- // it valid.
- legalizeOperands(*LoHalf, MDT);
- legalizeOperands(*HiHalf, MDT);
-
- // Move all users of this moved value.
- addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
-}
-
void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -7820,6 +7765,12 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
}
}
+ if (isMAI(Opcode)) {
+ int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
+ if (MFMAOp != -1)
+ Opcode = MFMAOp;
+ }
+
int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
// -1 means that Opcode is already a native instruction.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index dd9ea2b53ca2..e551d6c7223f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -78,11 +78,8 @@ private:
moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
- void lowerSelect32(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT = nullptr) const;
-
- void splitSelect64(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT = nullptr) const;
+ void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT = nullptr) const;
void lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const;
@@ -1249,6 +1246,10 @@ namespace AMDGPU {
LLVM_READONLY
int getFlatScratchInstSVfromSS(uint16_t Opcode);
+ /// \returns earlyclobber version of a MAC MFMA if one exists.
+ LLVM_READONLY
+ int getMFMAEarlyClobberOp(uint16_t Opcode);
+
const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index dda92d3d25ff..713a08907e99 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2588,6 +2588,14 @@ def getFlatScratchInstSVfromSS : InstrMapping {
let ValueCols = [["SV"]];
}
+def getMFMAEarlyClobberOp : InstrMapping {
+ let FilterClass = "MFMATable";
+ let RowFields = ["FMAOp"];
+ let ColFields = ["IsMac"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
+}
+
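The InstrMapping added above is what backs AMDGPU::getMFMAEarlyClobberOp() used in pseudoToMCOpcode() earlier in this patch. As a rough mental model only (the real lookup is generated by TableGen; the struct, opcode values and row below are made up for illustration), the mapping pairs the tied-operand MAC form of an MFMA with its earlyclobber counterpart and returns the IsMac==0 entry for an IsMac==1 key:

  struct MFMARow {
    int MacOpcode; // IsMac == 1 column (tied "$vdst = $src2" form)
    int FmaOpcode; // IsMac == 0 column (earlyclobber $vdst form)
  };

  // Hypothetical table; the real one is emitted by TableGen from the
  // MFMATable records attached to each MAIInst instantiation.
  static const MFMARow MFMARows[] = {
      {/*V_MFMA_..._mac_e64*/ 1001, /*V_MFMA_..._e64*/ 1000},
  };

  int getMFMAEarlyClobberOpModel(int Opcode) {
    for (const MFMARow &R : MFMARows)
      if (R.MacOpcode == Opcode)
        return R.FmaOpcode;
    return -1; // not a MAC MFMA, keep the opcode as is
  }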
include "SIInstructions.td"
include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 636337ede000..7be63ae6964b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1011,7 +1011,7 @@ def : GCNPat <
}
def : GCNPat <
- (i32 (ctpop i32:$popcnt)),
+ (i32 (DivergentUnaryFrag<ctpop> i32:$popcnt)),
(V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))
>;
@@ -1020,6 +1020,14 @@ def : GCNPat <
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
+def : GCNPat <
+ (i64 (DivergentUnaryFrag<ctpop> i64:$src)),
+ (REG_SEQUENCE VReg_64,
+ (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub1)),
+ (i32 (V_BCNT_U32_B32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0)))), sub0,
+ (i32 (V_MOV_B32_e32 (i32 0))), sub1)
+>;
+
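As a sanity check on the new 64-bit divergent pattern, here is a scalar model of what it computes (nothing target-specific is assumed beyond the V_BCNT semantics visible above): V_BCNT_U32_B32 returns popcount(src0) + src1, so chaining the two halves accumulates the full 64-bit count into the low dword while the high dword is set to zero.

  #include <bit>
  #include <cassert>
  #include <cstdint>

  uint64_t ctpop64_model(uint64_t Src) {
    uint32_t Lo = uint32_t(Src);
    uint32_t Hi = uint32_t(Src >> 32);
    // bcnt(a, b) == popcount(a) + b, matching V_BCNT_U32_B32.
    uint32_t Inner = std::popcount(Lo) + 0u;    // V_BCNT_U32_B32 lo, 0
    uint32_t Outer = std::popcount(Hi) + Inner; // V_BCNT_U32_B32 hi, inner
    return uint64_t(Outer);                     // sub1 <- V_MOV_B32 0
  }

  int main() { assert(ctpop64_model(~0ull) == 64); }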
/********** ============================================ **********/
/********** Extraction, Insertion, Building and Casting **********/
/********** ============================================ **********/
@@ -1184,6 +1192,26 @@ def : Pat <
(v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
>;
+def : Pat <
+ (extract_subvector v8i16:$vec, (i32 0)),
+ (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1))
+>;
+
+def : Pat <
+ (extract_subvector v8i16:$vec, (i32 4)),
+ (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3))
+>;
+
+def : Pat <
+ (extract_subvector v8f16:$vec, (i32 0)),
+ (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1))
+>;
+
+def : Pat <
+ (extract_subvector v8f16:$vec, (i32 4)),
+ (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3))
+>;
+
foreach Index = 0-31 in {
def Extract_Element_v32i32_#Index : Extract_Element <
i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -1279,6 +1307,26 @@ def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
def : BitConvert <v4f32, v2i64, VReg_128>;
def : BitConvert <v2i64, v4f32, VReg_128>;
+def : BitConvert <v8i16, v4i32, SReg_128>;
+def : BitConvert <v4i32, v8i16, SReg_128>;
+def : BitConvert <v8f16, v4f32, VReg_128>;
+def : BitConvert <v8f16, v4i32, VReg_128>;
+def : BitConvert <v4f32, v8f16, VReg_128>;
+def : BitConvert <v4i32, v8f16, VReg_128>;
+def : BitConvert <v8i16, v8f16, VReg_128>;
+def : BitConvert <v8f16, v8i16, VReg_128>;
+def : BitConvert <v4f32, v8i16, VReg_128>;
+def : BitConvert <v8i16, v4f32, VReg_128>;
+def : BitConvert <v8i16, v8f16, SReg_128>;
+def : BitConvert <v8i16, v2i64, SReg_128>;
+def : BitConvert <v8i16, v2f64, SReg_128>;
+def : BitConvert <v8f16, v2i64, SReg_128>;
+def : BitConvert <v8f16, v2f64, SReg_128>;
+def : BitConvert <v8f16, v8i16, SReg_128>;
+def : BitConvert <v2i64, v8i16, SReg_128>;
+def : BitConvert <v2f64, v8i16, SReg_128>;
+def : BitConvert <v2i64, v8f16, SReg_128>;
+def : BitConvert <v2f64, v8f16, SReg_128>;
// 160-bit bitcast
def : BitConvert <v5i32, v5f32, SReg_160>;
@@ -1762,44 +1810,44 @@ def BFIImm32 : PatFrag<
// (y & x) | (z & ~x)
def : AMDGPUPat <
(DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
- (V_BFI_B32_e64 $x, $y, $z)
+ (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
>;
// (y & C) | (z & ~C)
def : AMDGPUPat <
(BFIImm32 i32:$x, i32:$y, i32:$z),
- (V_BFI_B32_e64 $x, $y, $z)
+ (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
>;
// 64-bit version
def : AMDGPUPat <
(DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
- (REG_SEQUENCE SReg_64,
- (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
- (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0,
- (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)),
- (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1)
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
// SHA-256 Ch function
// z ^ (x & (y ^ z))
def : AMDGPUPat <
(DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
- (V_BFI_B32_e64 $x, $y, $z)
+ (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
>;
// 64-bit version
def : AMDGPUPat <
(DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
- (REG_SEQUENCE SReg_64,
- (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub0)),
- (i32 (EXTRACT_SUBREG SReg_64:$z, sub0))), sub0,
- (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub1)),
- (i32 (EXTRACT_SUBREG SReg_64:$z, sub1))), sub1)
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
+ (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
def : AMDGPUPat <
@@ -2725,21 +2773,21 @@ def : AMDGPUPat <
def : AMDGPUPat <
(DivergentBinFrag<or> (and i32:$x, i32:$z),
(and i32:$y, (or i32:$x, i32:$z))),
- (V_BFI_B32_e64 (V_XOR_B32_e64 i32:$x, i32:$y), i32:$z, i32:$y)
+ (V_BFI_B32_e64 (V_XOR_B32_e64 VSrc_b32:$x, VSrc_b32:$y), VSrc_b32:$z, VSrc_b32:$y)
>;
def : AMDGPUPat <
(DivergentBinFrag<or> (and i64:$x, i64:$z),
(and i64:$y, (or i64:$x, i64:$z))),
- (REG_SEQUENCE SReg_64,
- (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub0)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))),
- (i32 (EXTRACT_SUBREG SReg_64:$z, sub0)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub0))), sub0,
- (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG SReg_64:$x, sub1)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))),
- (i32 (EXTRACT_SUBREG SReg_64:$z, sub1)),
- (i32 (EXTRACT_SUBREG SReg_64:$y, sub1))), sub1)
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0,
+ (V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1)
>;
multiclass IntMed3Pat<Instruction med3Inst,
@@ -2825,6 +2873,15 @@ class AMDGPUGenericInstruction : GenericInstruction {
let Namespace = "AMDGPU";
}
+// Convert a wave address to a swizzled vector address (i.e. this is
+// for copying the stack pointer to a vector address appropriate to
+// use in the offset field of mubuf instructions).
+def G_AMDGPU_WAVE_ADDRESS : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
// Returns -1 if the input is zero.
def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
@@ -3027,6 +3084,16 @@ def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
let mayStore = 1;
}
+def G_AMDGPU_INTRIN_IMAGE_LOAD_D16 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins unknown:$intrin, variable_ops);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+
+ // FIXME: Use separate opcode for atomics.
+ let mayStore = 1;
+}
+
// This is equivalent to the G_INTRINSIC*, but the operands may have
// been legalized depending on the subtarget requirements.
def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
@@ -3036,6 +3103,13 @@ def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
let mayStore = 1;
}
+def G_AMDGPU_INTRIN_IMAGE_STORE_D16 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins unknown:$intrin, variable_ops);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}
+
def G_AMDGPU_INTRIN_BVH_INTERSECT_RAY : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins unknown:$intrin, variable_ops);
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index f4d9002e930e..c18637bdbc43 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -105,6 +105,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
unsigned DMask;
InstClassEnum InstClass;
unsigned CPol = 0;
+ bool IsAGPR;
bool UseST64;
int AddrIdx[MaxAddressRegs];
const MachineOperand *AddrReg[MaxAddressRegs];
@@ -158,8 +159,7 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
return true;
}
- void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
- const GCNSubtarget &STM);
+ void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
};
struct BaseRegisters {
@@ -484,15 +484,16 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
}
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
- const SIInstrInfo &TII,
- const GCNSubtarget &STM) {
+ const SILoadStoreOptimizer &LSO) {
I = MI;
unsigned Opc = MI->getOpcode();
- InstClass = getInstClass(Opc, TII);
+ InstClass = getInstClass(Opc, *LSO.TII);
if (InstClass == UNKNOWN)
return;
+ IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
+
switch (InstClass) {
case DS_READ:
EltSize =
@@ -505,7 +506,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
: 4;
break;
case S_BUFFER_LOAD_IMM:
- EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
+ EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
break;
default:
EltSize = 4;
@@ -513,7 +514,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
}
if (InstClass == MIMG) {
- DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
+ DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
// Offset is not considered for MIMG instructions.
Offset = 0;
} else {
@@ -522,17 +523,17 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
}
if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
- Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
+ Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
- Width = getOpcodeWidth(*I, TII);
+ Width = getOpcodeWidth(*I, *LSO.TII);
if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
Offset &= 0xffff;
} else if (InstClass != MIMG) {
- CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
+ CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
}
- AddressRegs Regs = getRegs(Opc, TII);
+ AddressRegs Regs = getRegs(Opc, *LSO.TII);
NumAddresses = 0;
for (unsigned J = 0; J < Regs.NumVAddrs; J++)
@@ -910,19 +911,10 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
}
const unsigned InstSubclass = getInstSubclass(Opc, *TII);
- // Do not merge VMEM buffer instructions with "swizzled" bit set.
- int Swizzled =
- AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
- if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
- return false;
-
DenseSet<Register> RegDefsToMove;
DenseSet<Register> PhysRegUsesToMove;
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
- const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
- bool IsAGPR = TRI->hasAGPRs(DataRC);
-
MachineBasicBlock::iterator E = std::next(Paired.I);
MachineBasicBlock::iterator MBBI = std::next(CI.I);
MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
@@ -971,15 +963,6 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
continue;
}
- // Don't merge volatiles.
- if (MBBI->hasOrderedMemoryRef())
- return false;
-
- int Swizzled =
- AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
- if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
- return false;
-
// Handle a case like
// DS_WRITE_B32 addr, v, idx0
// w = DS_READ_B32 addr, idx0
@@ -991,17 +974,6 @@ bool SILoadStoreOptimizer::checkAndPrepareMerge(
continue;
if (&*MBBI == &*Paired.I) {
- if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
- return false;
- // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
- // operands. However we are reporting that ds_write2 shall have
- // only VGPR data so that machine copy propagation does not
- // create an illegal instruction with a VGPR and AGPR sources.
- // Consequenctially if we create such instruction the verifier
- // will complain.
- if (IsAGPR && CI.InstClass == DS_WRITE)
- return false;
-
// We need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
@@ -1542,49 +1514,36 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
const CombineInfo &Paired) {
-
- assert(CI.Width != 0 && Paired.Width != 0 && "Width cannot be zero");
-
bool ReverseOrder;
if (CI.InstClass == MIMG) {
assert(
(countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
"No overlaps");
ReverseOrder = CI.DMask > Paired.DMask;
- } else
+ } else {
ReverseOrder = CI.Offset > Paired.Offset;
+ }
unsigned Idx0;
unsigned Idx1;
- if (CI.Width + Paired.Width > 4) {
- assert(CI.Width == 4 && Paired.Width == 4);
+ static const unsigned Idxs[5][4] = {
+ {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+ {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
+ {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
+ {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
+ {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
+ };
- if (ReverseOrder) {
- Idx1 = AMDGPU::sub0_sub1_sub2_sub3;
- Idx0 = AMDGPU::sub4_sub5_sub6_sub7;
- } else {
- Idx0 = AMDGPU::sub0_sub1_sub2_sub3;
- Idx1 = AMDGPU::sub4_sub5_sub6_sub7;
- }
+ assert(CI.Width >= 1 && CI.Width <= 4);
+ assert(Paired.Width >= 1 && Paired.Width <= 4);
+
+ if (ReverseOrder) {
+ Idx1 = Idxs[0][Paired.Width - 1];
+ Idx0 = Idxs[Paired.Width][CI.Width - 1];
} else {
- static const unsigned Idxs[4][4] = {
- {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
- {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
- {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
- {AMDGPU::sub3, 0, 0, 0},
- };
-
- assert(CI.Width >= 1 && CI.Width <= 3);
- assert(Paired.Width >= 1 && Paired.Width <= 3);
-
- if (ReverseOrder) {
- Idx1 = Idxs[0][Paired.Width - 1];
- Idx0 = Idxs[Paired.Width][CI.Width - 1];
- } else {
- Idx0 = Idxs[0][CI.Width - 1];
- Idx1 = Idxs[CI.Width][Paired.Width - 1];
- }
+ Idx0 = Idxs[0][CI.Width - 1];
+ Idx1 = Idxs[CI.Width][Paired.Width - 1];
}
return std::make_pair(Idx0, Idx1);
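A quick illustration of how the unified table composes the two subregister indices (string names only, not the real AMDGPU enums; the arithmetic mirrors the code above): the first operand is taken from row 0 with its own width, and the second operand starts at row CI.Width, i.e. immediately after the first.

  #include <cstdio>

  static const char *IdxNames[5][4] = {
      {"sub0", "sub0_sub1", "sub0_sub1_sub2", "sub0_sub1_sub2_sub3"},
      {"sub1", "sub1_sub2", "sub1_sub2_sub3", "sub1_sub2_sub3_sub4"},
      {"sub2", "sub2_sub3", "sub2_sub3_sub4", "sub2_sub3_sub4_sub5"},
      {"sub3", "sub3_sub4", "sub3_sub4_sub5", "sub3_sub4_sub5_sub6"},
      {"sub4", "sub4_sub5", "sub4_sub5_sub6", "sub4_sub5_sub6_sub7"},
  };

  int main() {
    unsigned W0 = 2, W1 = 2; // e.g. merging two 2-dword buffer loads
    // Non-reversed order: first operand at row 0, second right after it.
    std::printf("%s %s\n", IdxNames[0][W0 - 1], IdxNames[W0][W1 - 1]);
    // Prints: sub0_sub1 sub2_sub3
  }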
@@ -1847,7 +1806,8 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
return false;
- if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
+ if (MI.mayLoad() &&
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
return false;
if (AnchorList.count(&MI))
@@ -1988,6 +1948,7 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
std::list<std::list<CombineInfo> > &MergeableInsts) const {
for (std::list<CombineInfo> &AddrList : MergeableInsts) {
if (AddrList.front().InstClass == CI.InstClass &&
+ AddrList.front().IsAGPR == CI.IsAGPR &&
AddrList.front().hasSameBaseAddress(*CI.I)) {
AddrList.emplace_back(CI);
return;
@@ -2030,13 +1991,29 @@ SILoadStoreOptimizer::collectMergeableInsts(
if (InstClass == UNKNOWN)
continue;
+ // Do not merge VMEM buffer instructions with "swizzled" bit set.
+ int Swizzled =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
+ if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
+ continue;
+
CombineInfo CI;
- CI.setMI(MI, *TII, *STM);
+ CI.setMI(MI, *this);
CI.Order = Order++;
if (!CI.hasMergeableAddress(*MRI))
continue;
+ if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
+ // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
+ // operands. However, we are reporting that ds_write2 shall have
+ // only VGPR data so that machine copy propagation does not
+ // create an illegal instruction with VGPR and AGPR sources.
+ // Consequently, if we create such an instruction the verifier
+ // will complain.
+ continue;
+ }
+
LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
addInstToMergeableList(CI, MergeableInsts);
@@ -2144,54 +2121,54 @@ SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
case DS_READ: {
MachineBasicBlock::iterator NewMI =
mergeRead2Pair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
break;
}
case DS_WRITE: {
MachineBasicBlock::iterator NewMI =
mergeWrite2Pair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
break;
}
case S_BUFFER_LOAD_IMM: {
MachineBasicBlock::iterator NewMI =
mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
OptimizeListAgain |= (CI.Width + Paired.Width) < 8;
break;
}
case BUFFER_LOAD: {
MachineBasicBlock::iterator NewMI =
mergeBufferLoadPair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case BUFFER_STORE: {
MachineBasicBlock::iterator NewMI =
mergeBufferStorePair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case MIMG: {
MachineBasicBlock::iterator NewMI =
mergeImagePair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case TBUFFER_LOAD: {
MachineBasicBlock::iterator NewMI =
mergeTBufferLoadPair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case TBUFFER_STORE: {
MachineBasicBlock::iterator NewMI =
mergeTBufferStorePair(CI, Paired, InstsToMove);
- CI.setMI(NewMI, *TII, *STM);
+ CI.setMI(NewMI, *this);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 3168bcd53eda..e1018bdfde46 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -56,6 +56,7 @@
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -90,6 +91,8 @@ private:
unsigned OrSaveExecOpc;
unsigned Exec;
+ bool EnableOptimizeEndCf = false;
+
bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);
void emitIf(MachineInstr &MI);
@@ -579,10 +582,10 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
void SILowerControlFlow::optimizeEndCf() {
// If the only instruction immediately following this END_CF is another
// END_CF in the only successor, we can avoid emitting the exec mask restore here.
- if (!RemoveRedundantEndcf)
+ if (!EnableOptimizeEndCf)
return;
- for (MachineInstr *MI : LoweredEndCf) {
+ for (MachineInstr *MI : reverse(LoweredEndCf)) {
MachineBasicBlock &MBB = *MI->getParent();
auto Next =
skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
@@ -807,6 +810,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
+ EnableOptimizeEndCf =
+ RemoveRedundantEndcf && MF.getTarget().getOptLevel() > CodeGenOpt::None;
// This doesn't actually need LiveIntervals, but we can preserve them.
LIS = getAnalysisIfAvailable<LiveIntervals>();
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 55196fe334e6..0fbdbef6fcce 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -127,7 +127,7 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
// FIXME: Just emit the readlane/writelane directly
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CI : reverse(CSI)) {
- unsigned Reg = CI.getReg();
+ Register Reg = CI.getReg();
const TargetRegisterClass *RC =
TRI->getMinimalPhysRegClass(Reg, MVT::i32);
@@ -239,50 +239,6 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
return false;
}
-// Find lowest available VGPR and use it as VGPR reserved for SGPR spills.
-static bool lowerShiftReservedVGPR(MachineFunction &MF,
- const GCNSubtarget &ST) {
- SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- const Register PreReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill;
- // Early out if pre-reservation of a VGPR for SGPR spilling is disabled.
- if (!PreReservedVGPR)
- return false;
-
- // If there are no free lower VGPRs available, default to using the
- // pre-reserved register instead.
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- Register LowestAvailableVGPR =
- TRI->findUnusedRegister(MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF);
- if (!LowestAvailableVGPR)
- LowestAvailableVGPR = PreReservedVGPR;
-
- MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- // Create a stack object for a possible spill in the function prologue.
- // Note Non-CSR VGPR also need this as we may overwrite inactive lanes.
- Optional<int> FI = FrameInfo.CreateSpillStackObject(4, Align(4));
-
- // Find saved info about the pre-reserved register.
- const auto *ReservedVGPRInfoItr =
- llvm::find_if(FuncInfo->getSGPRSpillVGPRs(),
- [PreReservedVGPR](const auto &SpillRegInfo) {
- return SpillRegInfo.VGPR == PreReservedVGPR;
- });
-
- assert(ReservedVGPRInfoItr != FuncInfo->getSGPRSpillVGPRs().end());
- auto Index =
- std::distance(FuncInfo->getSGPRSpillVGPRs().begin(), ReservedVGPRInfoItr);
-
- FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, Index);
-
- for (MachineBasicBlock &MBB : MF) {
- assert(LowestAvailableVGPR.isValid() && "Did not find an available VGPR");
- MBB.addLiveIn(LowestAvailableVGPR);
- MBB.sortUniqueLiveIns();
- }
-
- return true;
-}
-
bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
@@ -304,11 +260,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
if (!MFI.hasStackObjects() && !HasCSRs) {
SaveBlocks.clear();
RestoreBlocks.clear();
- if (FuncInfo->VGPRReservedForSGPRSpill) {
- // Free the reserved VGPR for later possible use by frame lowering.
- FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
- MRI.freezeReservedRegs(MF);
- }
return false;
}
@@ -326,8 +277,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// This operates under the assumption that only other SGPR spills are users
// of the frame index.
- lowerShiftReservedVGPR(MF, ST);
-
// To track the spill frame indices handled in this pass.
BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
@@ -375,8 +324,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
FuncInfo->removeDeadFrameIndices(MFI);
MadeChange = true;
- } else if (FuncInfo->VGPRReservedForSGPRSpill) {
- FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
}
SaveBlocks.clear();
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 3ce368ef4db9..cca8565c9ff9 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -118,10 +118,12 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
WorkItemIDX = true;
- if (!F.hasFnAttribute("amdgpu-no-workitem-id-y"))
+ if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
+ ST.getMaxWorkitemID(F, 1) != 0)
WorkItemIDY = true;
- if (!F.hasFnAttribute("amdgpu-no-workitem-id-z"))
+ if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
+ ST.getMaxWorkitemID(F, 2) != 0)
WorkItemIDZ = true;
if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
@@ -274,7 +276,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned WaveSize = ST.getWavefrontSize();
- SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
unsigned Size = FrameInfo.getObjectSize(FI);
unsigned NumLanes = Size / 4;
@@ -291,16 +292,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
Register LaneVGPR;
unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
- // Reserve a VGPR (when NumVGPRSpillLanes = 0, WaveSize, 2*WaveSize, ..) and
- // when one of the two conditions is true:
- // 1. One reserved VGPR being tracked by VGPRReservedForSGPRSpill is not yet
- // reserved.
- // 2. All spill lanes of reserved VGPR(s) are full and another spill lane is
- // required.
- if (FuncInfo->VGPRReservedForSGPRSpill && NumVGPRSpillLanes < WaveSize) {
- assert(FuncInfo->VGPRReservedForSGPRSpill == SpillVGPRs.back().VGPR);
- LaneVGPR = FuncInfo->VGPRReservedForSGPRSpill;
- } else if (VGPRIndex == 0) {
+ if (VGPRIndex == 0) {
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we will not
@@ -308,6 +300,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
SGPRToVGPRSpills.erase(FI);
NumVGPRSpillLanes -= I;
+ // FIXME: We can run out of free registers with split allocation if
+ // IPRA is enabled and a called function already uses every VGPR.
#if 0
DiagnosticInfoResourceLimit DiagOutOfRegs(MF.getFunction(),
"VGPRs for SGPR spilling",
@@ -340,21 +334,6 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
return true;
}
-/// Reserve a VGPR for spilling of SGPRs
-bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-
- Register LaneVGPR = TRI->findUnusedRegister(
- MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true);
- if (LaneVGPR == Register())
- return false;
- SpillVGPRs.push_back(SGPRSpillVGPR(LaneVGPR, None));
- FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR;
- return true;
-}
-
/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either AGPR is spilled to VGPR or vice versa.
/// Returns true if a \p FI can be eliminated completely.
@@ -616,24 +595,6 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
return false;
}
-// Remove VGPR which was reserved for SGPR spills if there are no spilled SGPRs
-bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR,
- MachineFunction &MF) {
- for (auto *i = SpillVGPRs.begin(); i < SpillVGPRs.end(); i++) {
- if (i->VGPR == ReservedVGPR) {
- SpillVGPRs.erase(i);
-
- for (MachineBasicBlock &MBB : MF) {
- MBB.removeLiveIn(ReservedVGPR);
- MBB.sortUniqueLiveIns();
- }
- this->VGPRReservedForSGPRSpill = AMDGPU::NoRegister;
- return true;
- }
- }
- return false;
-}
-
bool SIMachineFunctionInfo::usesAGPRs(const MachineFunction &MF) const {
if (UsesAGPRs)
return *UsesAGPRs;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 8accbf611c5f..8e821274bb77 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -502,7 +502,6 @@ public: // FIXME
Register SGPRForBPSaveRestoreCopy;
Optional<int> BasePointerSaveIndex;
- Register VGPRReservedForSGPRSpill;
bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg);
public:
@@ -528,7 +527,6 @@ public:
void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) {
SpillVGPRs[Index].VGPR = NewVGPR;
SpillVGPRs[Index].FI = newFI;
- VGPRReservedForSGPRSpill = NewVGPR;
}
bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF);
@@ -556,7 +554,6 @@ public:
bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
unsigned NumLane) const;
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
- bool reserveVGPRforSGPRSpills(MachineFunction &MF);
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
void removeDeadFrameIndices(MachineFrameInfo &MFI);
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index 69eab762f05c..24a8879b5684 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -188,7 +188,7 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask);
unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset);
unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
- BuildMI(MBB, MI, 0, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+ BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32))
.addImm(Value)
.addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
(Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index 6bf6c45d8cf6..e13e33ed5457 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -155,6 +155,11 @@ public:
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::IsSSA);
}
+
+ MachineFunctionProperties getClearedProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoPHIs);
+ }
};
} // end anonymous namespace
@@ -366,47 +371,42 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters(
// Re-calculate the liveness of \p Reg in the THEN-region
void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion(
Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const {
-
- SmallPtrSet<MachineBasicBlock *, 16> PHIIncoming;
-
- MachineBasicBlock *ThenEntry = nullptr;
- for (auto *Succ : If->successors()) {
- if (Succ != Flow) {
- ThenEntry = Succ;
- break;
+ SetVector<MachineBasicBlock *> Blocks;
+ SmallVector<MachineBasicBlock *> WorkList({If});
+
+ // Collect all successors until we see the flow block, where we should
+ // reconverge.
+ while (!WorkList.empty()) {
+ auto *MBB = WorkList.pop_back_val();
+ for (auto *Succ : MBB->successors()) {
+ if (Succ != Flow && !Blocks.contains(Succ)) {
+ WorkList.push_back(Succ);
+ Blocks.insert(Succ);
+ }
}
}
- assert(ThenEntry && "No successor in Then region?");
LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg);
- df_iterator_default_set<MachineBasicBlock *, 16> Visited;
-
- for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
- if (MBB == Flow)
- break;
-
+ for (MachineBasicBlock *MBB : Blocks) {
// Clear Live bit, as we will recalculate afterwards
LLVM_DEBUG(dbgs() << "Clear AliveBlock " << printMBBReference(*MBB)
<< '\n');
OldVarInfo.AliveBlocks.reset(MBB->getNumber());
}
+ SmallPtrSet<MachineBasicBlock *, 4> PHIIncoming;
+
// Get the blocks the Reg should be alive through
for (auto I = MRI->use_nodbg_begin(Reg), E = MRI->use_nodbg_end(); I != E;
++I) {
auto *UseMI = I->getParent();
if (UseMI->isPHI() && I->readsReg()) {
- if (Visited.contains(UseMI->getParent()))
+ if (Blocks.contains(UseMI->getParent()))
PHIIncoming.insert(UseMI->getOperand(I.getOperandNo() + 1).getMBB());
}
}
- Visited.clear();
-
- for (MachineBasicBlock *MBB : depth_first_ext(ThenEntry, Visited)) {
- if (MBB == Flow)
- break;
-
+ for (MachineBasicBlock *MBB : Blocks) {
SmallVector<MachineInstr *> Uses;
// PHI instructions have been processed before.
findNonPHIUsesInBlock(Reg, MBB, Uses);
@@ -433,7 +433,7 @@ void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion(
// Set the isKilled flag if we get new Kills in the THEN region.
for (auto *MI : OldVarInfo.Kills) {
- if (Visited.contains(MI->getParent()))
+ if (Blocks.contains(MI->getParent()))
MI->addRegisterKilled(Reg, TRI);
}
}
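The rewritten updateLiveRangeInThenRegion above replaces the depth-first walk with an explicit worklist that gathers every block reachable from the If block's successors without crossing the Flow (reconvergence) block. A standalone sketch of that collection step, with illustrative names and a minimal block interface standing in for MachineBasicBlock:

  #include <set>
  #include <vector>

  // Minimal stand-in for MachineBasicBlock; only successors() is needed.
  struct Block {
    std::vector<Block *> Succs;
    const std::vector<Block *> &successors() const { return Succs; }
  };

  std::set<Block *> collectThenBlocks(Block *If, Block *Flow) {
    std::set<Block *> Blocks;
    std::vector<Block *> WorkList{If};
    while (!WorkList.empty()) {
      Block *MBB = WorkList.back();
      WorkList.pop_back();
      for (Block *Succ : MBB->successors())
        if (Succ != Flow && Blocks.insert(Succ).second)
          WorkList.push_back(Succ);
    }
    return Blocks; // note: the If block itself is deliberately excluded
  }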
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 340e2b48e5cd..eb9452f4b85e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -617,7 +617,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16
let HasSGPR = 1;
}
-def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
+def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16], 32,
(add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
@@ -784,7 +784,7 @@ multiclass SRegClass<int numRegs, int priority,
}
defm "" : SRegClass<3, 14, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64], SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<4, 15, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], SGPR_128Regs, TTMP_128Regs>;
defm "" : SRegClass<5, 16, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
defm "" : SRegClass<6, 17, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
defm "" : SRegClass<7, 18, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
@@ -824,7 +824,7 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
(add VGPR_64)>;
defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
-defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64], (add VGPR_128)>;
+defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add VGPR_128)>;
defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;
@@ -846,7 +846,7 @@ multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
defm AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
(add AGPR_64)>;
defm AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
-defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
+defm AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16], (add AGPR_128)>;
defm AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
defm AReg_192 : ARegClass<6, [v6i32, v6f32, v3i64, v3f64], (add AGPR_192)>;
defm AReg_224 : ARegClass<7, [v7i32, v7f32], (add AGPR_224)>;
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 77ee3c0ff0e4..46efb3c605c6 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -861,12 +861,16 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
MachineInstr *VcmpMI;
const MachineOperand &Op0 = MI.getOperand(0);
const MachineOperand &Op1 = MI.getOperand(1);
+
+ // VCC represents lanes killed.
+ Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
+
if (TRI->isVGPR(*MRI, Op0.getReg())) {
Opcode = AMDGPU::getVOPe32(Opcode);
VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
} else {
VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
- .addReg(AMDGPU::VCC, RegState::Define)
+ .addReg(VCC, RegState::Define)
.addImm(0) // src0 modifiers
.add(Op1)
.addImm(0) // src1 modifiers
@@ -874,9 +878,6 @@ MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
.addImm(0); // omod
}
- // VCC represents lanes killed.
- Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
-
MachineInstr *MaskUpdateMI =
BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
.addReg(LiveMaskReg)
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 1713586dcf5b..3f7837f7dbf1 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -246,10 +246,10 @@ let Defs = [SCC] in {
def S_BCNT0_I32_B32 : SOP1_32 <"s_bcnt0_i32_b32">;
def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
- [(set i32:$sdst, (ctpop i32:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<ctpop> i32:$src0))]
>;
def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64",
- [(set i32:$sdst, (ctpop i64:$src0))]
+ [(set i32:$sdst, (UniformUnaryFrag<ctpop> i64:$src0))]
>;
} // End Defs = [SCC]
@@ -518,10 +518,9 @@ let Uses = [SCC] in {
def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
[(set i32:$sdst, (SelectPat<select> i32:$src0, i32:$src1))]
>;
- def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64",
- [(set i64:$sdst, (SelectPat<select> i64:$src0, i64:$src1))]
- >;
}
+
+ def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
} // End Uses = [SCC]
let Defs = [SCC] in {
@@ -551,11 +550,11 @@ def S_XOR_B64 : SOP2_64 <"s_xor_b64",
>;
def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
- [(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))]
+ [(set i32:$sdst, (UniformUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1)))]
>;
def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
- [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
+ [(set i64:$sdst, (UniformUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1)))]
>;
def S_NAND_B32 : SOP2_32 <"s_nand_b32",
@@ -1371,7 +1370,7 @@ def : GCNPat <
>;
def : GCNPat <
- (i64 (ctpop i64:$src)),
+ (i64 (UniformUnaryFrag<ctpop> i64:$src)),
(i64 (REG_SEQUENCE SReg_64,
(i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
(S_MOV_B32 (i32 0)), sub1))
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 0bee9022975e..18c348d1cf89 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -79,8 +79,8 @@ const char* const IdSymbolic[] = {
"HW_REG_FLAT_SCR_LO",
"HW_REG_FLAT_SCR_HI",
"HW_REG_XNACK_MASK",
- nullptr, // HW_ID1, no predictable values
- nullptr, // HW_ID2, no predictable values
+ "HW_REG_HW_ID1",
+ "HW_REG_HW_ID2",
"HW_REG_POPS_PACKER",
nullptr,
nullptr,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index d20eaaaa65e8..1e96266eb06c 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -132,6 +132,8 @@ bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) {
#define GET_MIMGInfoTable_IMPL
#define GET_MIMGLZMappingTable_IMPL
#define GET_MIMGMIPMappingTable_IMPL
+#define GET_MIMGBiasMappingTable_IMPL
+#define GET_MIMGOffsetMappingTable_IMPL
#define GET_MIMGG16MappingTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
@@ -410,7 +412,7 @@ void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) {
}
std::string AMDGPUTargetID::toString() const {
- std::string StringRep = "";
+ std::string StringRep;
raw_string_ostream StreamRep(StringRep);
auto TargetTriple = STI.getTargetTriple();
@@ -421,7 +423,7 @@ std::string AMDGPUTargetID::toString() const {
<< TargetTriple.getOSName() << '-'
<< TargetTriple.getEnvironmentName() << '-';
- std::string Processor = "";
+ std::string Processor;
// TODO: Following else statement is present here because we used various
// alias names for GPUs up until GFX9 (e.g. 'fiji' is same as 'gfx803').
// Remove once all aliases are removed from GCNProcessors.td.
@@ -432,7 +434,7 @@ std::string AMDGPUTargetID::toString() const {
Twine(Version.Stepping))
.str();
- std::string Features = "";
+ std::string Features;
if (Optional<uint8_t> HsaAbiVersion = getHsaAbiVersion(&STI)) {
switch (*HsaAbiVersion) {
case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
@@ -1018,9 +1020,18 @@ static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) {
}
bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) {
- return
- ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
- IdSymbolic[Id] && (Id != ID_XNACK_MASK || !AMDGPU::isGFX10_BEncoding(STI));
+ switch (Id) {
+ case ID_HW_ID:
+ return isSI(STI) || isCI(STI) || isVI(STI) || isGFX9(STI);
+ case ID_HW_ID1:
+ case ID_HW_ID2:
+ return isGFX10Plus(STI);
+ case ID_XNACK_MASK:
+ return isGFX10(STI) && !AMDGPU::isGFX10_BEncoding(STI);
+ default:
+ return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
+ IdSymbolic[Id];
+ }
}
bool isValidHwreg(int64_t Id) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 061c74c0ace6..89f928eb8b92 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -64,6 +64,7 @@ struct GcnBufferFormatInfo {
#define GET_MIMGEncoding_DECL
#define GET_MIMGLZMapping_DECL
#define GET_MIMGMIPMapping_DECL
+#define GET_MIMGBiASMapping_DECL
#include "AMDGPUGenSearchableTables.inc"
namespace IsaInfo {
@@ -330,6 +331,16 @@ struct MIMGMIPMappingInfo {
MIMGBaseOpcode NONMIP;
};
+struct MIMGBiasMappingInfo {
+ MIMGBaseOpcode Bias;
+ MIMGBaseOpcode NoBias;
+};
+
+struct MIMGOffsetMappingInfo {
+ MIMGBaseOpcode Offset;
+ MIMGBaseOpcode NoOffset;
+};
+
struct MIMGG16MappingInfo {
MIMGBaseOpcode G;
MIMGBaseOpcode G16;
@@ -342,6 +353,12 @@ LLVM_READONLY
const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP);
LLVM_READONLY
+const MIMGBiasMappingInfo *getMIMGBiasMappingInfo(unsigned Bias);
+
+LLVM_READONLY
+const MIMGOffsetMappingInfo *getMIMGOffsetMappingInfo(unsigned Offset);
+
+LLVM_READONLY
const MIMGG16MappingInfo *getMIMGG16MappingInfo(unsigned G);
LLVM_READONLY
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 8d232ffe4114..b9ff814a4dc5 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -637,9 +637,9 @@ class divergent_i64_BinOp <SDPatternOperator Op, Instruction Inst> :
)
>;
-def : divergent_i64_BinOp <and, V_AND_B32_e32>;
-def : divergent_i64_BinOp <or, V_OR_B32_e32>;
-def : divergent_i64_BinOp <xor, V_XOR_B32_e32>;
+def : divergent_i64_BinOp <and, V_AND_B32_e64>;
+def : divergent_i64_BinOp <or, V_OR_B32_e64>;
+def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
let SubtargetPredicate = Has16BitInsts in {
@@ -688,6 +688,36 @@ let SubtargetPredicate = HasDLInsts in {
let isReMaterializable = 1 in
defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>;
+def : GCNPat<
+ (i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))),
+ (i32 (V_XNOR_B32_e64 $src0, $src1))
+>;
+
+def : GCNPat<
+ (i32 (DivergentBinFrag<xor_oneuse> (not i32:$src0), i32:$src1)),
+ (i32 (V_XNOR_B32_e64 $src0, $src1))
+>;
+
+def : GCNPat<
+ (i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))),
+ (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64
+ (i32 (EXTRACT_SUBREG $src0, sub0)),
+ (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
+ (i32 (V_XNOR_B32_e64
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
+>;
+
+def : GCNPat<
+ (i64 (DivergentBinFrag<xor_oneuse> (not i64:$src0), i64:$src1)),
+ (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64
+ (i32 (EXTRACT_SUBREG $src0, sub0)),
+ (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
+ (i32 (V_XNOR_B32_e64
+ (i32 (EXTRACT_SUBREG $src0, sub1)),
+ (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
+>;
+
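The four patterns above all reduce ~(a ^ b) on divergent values to V_XNOR_B32, splitting 64-bit operands into their 32-bit halves. A scalar model showing that the per-half split is equivalent (nothing AMDGPU-specific is assumed here):

  #include <cassert>
  #include <cstdint>

  uint64_t xnor64_split(uint64_t A, uint64_t B) {
    uint32_t Lo = ~(uint32_t(A) ^ uint32_t(B));              // V_XNOR_B32 on sub0
    uint32_t Hi = ~(uint32_t(A >> 32) ^ uint32_t(B >> 32));  // V_XNOR_B32 on sub1
    return (uint64_t(Hi) << 32) | Lo;                        // REG_SEQUENCE
  }

  int main() {
    uint64_t A = 0x0123456789abcdefULL, B = 0xfedcba9876543210ULL;
    assert(xnor64_split(A, B) == ~(A ^ B));
  }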
let Constraints = "$vdst = $src2",
DisableEncoding = "$src2",
isConvertibleToThreeAddress = 1,
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 32222b3eb93c..707475ceccee 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -388,6 +388,12 @@ class VOPProfileMAI<VOPProfile P, RegisterOperand _SrcRC, RegisterOperand _DstRC
let HasModifiers = 0;
let Asm64 = "$vdst, $src0, $src1, $src2$cbsz$abid$blgp";
let Ins64 = (ins Src0RC64:$src0, Src1RC64:$src1, Src2RC64:$src2, cbsz:$cbsz, abid:$abid, blgp:$blgp);
+ // Dst and SrcC cannot partially overlap if SrcC/Dst is bigger than 4 VGPRs.
+ // We then create two versions of the instruction: with tied dst and src2
+ // and with the earlyclobber flag on the dst. This is stricter than the
+ // actual HW restriction. In particular, earlyclobber also affects src0 and
+ // src1 allocation, which is not required.
+ bit NoDstOverlap = !gt(DstVT.Size, 128);
}
def VOPProfileMAI_F32_F32_X4 : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, AISrc_128_f32, ADst_128>;
@@ -426,6 +432,11 @@ def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F
def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>;
def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>;
+class MFMATable <bit is_mac, string Name> {
+ bit IsMac = is_mac;
+ string FMAOp = Name;
+}
+
let Predicates = [HasMAIInsts] in {
let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
@@ -435,13 +446,31 @@ let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
} // End isMoveImm = 1
} // End isAsCheapAsAMove = 1, isReMaterializable = 1
-multiclass MAIInst<string OpName, string P, SDPatternOperator node> {
+multiclass MAIInst<string OpName, string P, SDPatternOperator node,
+ bit NoDstOverlap = !cast<VOPProfileMAI>("VOPProfileMAI_" # P).NoDstOverlap> {
let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
- defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>;
-
- let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in
- defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>;
+ let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
+ defm "" : VOP3Inst<OpName, !cast<VOPProfileMAI>("VOPProfileMAI_" # P), !if(NoDstOverlap, null_frag, node)>,
+ MFMATable<0, NAME # "_e64">;
+
+ let SubtargetPredicate = isGFX90APlus, Mnemonic = OpName in
+ defm _vgprcd : VOP3Inst<OpName # "_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>,
+ MFMATable<0, NAME # "_vgprcd_e64">;
+ }
+
+ foreach _ = BoolToList<NoDstOverlap>.ret in {
+ let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),
+ isConvertibleToThreeAddress = NoDstOverlap,
+ Mnemonic = OpName in {
+ defm "_mac" : VOP3Inst<OpName # "_mac", !cast<VOPProfileMAI>("VOPProfileMAI_" # P), node>,
+ MFMATable<1, NAME # "_e64">;
+
+ let SubtargetPredicate = isGFX90APlus in
+ defm _mac_vgprcd : VOP3Inst<OpName # "_mac_vgprcd", !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD")>,
+ MFMATable<1, NAME # "_vgprcd_e64">;
+ }
+ }
} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
}
@@ -517,6 +546,7 @@ multiclass VOP3P_Real_MAI<bits<7> op> {
}
}
+let Constraints = "" in {
multiclass VOP3P_Real_MFMA_gfx90a<bits<7> op> {
let SubtargetPredicate = isGFX90AOnly,
AssemblerPredicate = isGFX90AOnly, DecoderNamespace = "GFX90A" in {
@@ -536,6 +566,7 @@ multiclass VOP3P_Real_MFMA<bits<7> op> :
let DecoderNamespace = "GFX8";
}
}
+}
defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>;
defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x01>;
diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h
index 1d5e45aec06c..979371bf7cf6 100644
--- a/llvm/lib/Target/ARM/ARM.h
+++ b/llvm/lib/Target/ARM/ARM.h
@@ -25,12 +25,9 @@ class ARMAsmPrinter;
class ARMBaseTargetMachine;
class ARMRegisterBankInfo;
class ARMSubtarget;
-struct BasicBlockInfo;
class Function;
class FunctionPass;
class InstructionSelector;
-class MachineBasicBlock;
-class MachineFunction;
class MachineInstr;
class MCInst;
class PassRegistry;
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 8173fe4036a8..4efbdbb2abc8 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -512,8 +512,7 @@ def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true",
"Support ARM v7 instructions",
- [HasV6T2Ops, FeaturePerfMon,
- FeatureV7Clrex]>;
+ [HasV6T2Ops, FeatureV7Clrex]>;
def HasV8MMainlineOps :
SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true",
@@ -522,7 +521,7 @@ def HasV8MMainlineOps :
def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true",
"Support ARM v8 instructions",
- [HasV7Ops, FeatureAcquireRelease]>;
+ [HasV7Ops, FeaturePerfMon, FeatureAcquireRelease]>;
def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
"Support ARM v8.1a instructions",
@@ -553,6 +552,10 @@ def HasV8_7aOps : SubtargetFeature<"v8.7a", "HasV8_7aOps", "true",
"Support ARM v8.7a instructions",
[HasV8_6aOps]>;
+def HasV8_8aOps : SubtargetFeature<"v8.8a", "HasV8_8aOps", "true",
+ "Support ARM v8.8a instructions",
+ [HasV8_7aOps]>;
+
def HasV9_0aOps : SubtargetFeature<"v9a", "HasV9_0aOps", "true",
"Support ARM v9a instructions",
[HasV8_5aOps]>;
@@ -565,6 +568,10 @@ def HasV9_2aOps : SubtargetFeature<"v9.2a", "HasV9_2aOps", "true",
"Support ARM v9.2a instructions",
[HasV8_7aOps, HasV9_1aOps]>;
+def HasV9_3aOps : SubtargetFeature<"v9.3a", "HasV9_3aOps", "true",
+ "Support ARM v9.3a instructions",
+ [HasV8_8aOps, HasV9_2aOps]>;
+
def HasV8_1MMainlineOps : SubtargetFeature<
"v8.1m.main", "HasV8_1MMainlineOps", "true",
"Support ARM v8-1M Mainline instructions",
@@ -757,7 +764,8 @@ def ARMv7a : Architecture<"armv7-a", "ARMv7a", [HasV7Ops,
FeatureNEON,
FeatureDB,
FeatureDSP,
- FeatureAClass]>;
+ FeatureAClass,
+ FeaturePerfMon]>;
def ARMv7ve : Architecture<"armv7ve", "ARMv7ve", [HasV7Ops,
FeatureNEON,
@@ -766,13 +774,15 @@ def ARMv7ve : Architecture<"armv7ve", "ARMv7ve", [HasV7Ops,
FeatureTrustZone,
FeatureMP,
FeatureVirtualization,
- FeatureAClass]>;
+ FeatureAClass,
+ FeaturePerfMon]>;
def ARMv7r : Architecture<"armv7-r", "ARMv7r", [HasV7Ops,
FeatureDB,
FeatureDSP,
FeatureHWDivThumb,
- FeatureRClass]>;
+ FeatureRClass,
+ FeaturePerfMon]>;
def ARMv7m : Architecture<"armv7-m", "ARMv7m", [HasV7Ops,
FeatureThumb2,
@@ -894,6 +904,19 @@ def ARMv87a : Architecture<"armv8.7-a", "ARMv87a", [HasV8_7aOps,
FeatureCRC,
FeatureRAS,
FeatureDotProd]>;
+def ARMv88a : Architecture<"armv8.8-a", "ARMv88a", [HasV8_8aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureRAS,
+ FeatureDotProd]>;
def ARMv9a : Architecture<"armv9-a", "ARMv9a", [HasV9_0aOps,
FeatureAClass,
@@ -931,6 +954,19 @@ def ARMv92a : Architecture<"armv9.2-a", "ARMv92a", [HasV9_2aOps,
FeatureCRC,
FeatureRAS,
FeatureDotProd]>;
+def ARMv93a : Architecture<"armv9.3-a", "ARMv93a", [HasV9_3aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureRAS,
+ FeatureDotProd]>;
def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
FeatureRClass,
@@ -1425,8 +1461,7 @@ def : ProcNoItin<"neoverse-n1", [ARMv82a,
def : ProcNoItin<"neoverse-n2", [ARMv85a,
FeatureBF16,
- FeatureMatMulInt8,
- FeaturePerfMon]>;
+ FeatureMatMulInt8]>;
def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift,
FeatureHasRetAddrStack,
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 884f38ff6c58..cde715880376 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -4868,6 +4868,36 @@ bool ARMBaseInstrInfo::verifyInstruction(const MachineInstr &MI,
return false;
}
}
+
+ // Check the address model by taking the first Imm operand and checking it is
+ // legal for that addressing mode.
+ ARMII::AddrMode AddrMode =
+ (ARMII::AddrMode)(MI.getDesc().TSFlags & ARMII::AddrModeMask);
+ switch (AddrMode) {
+ default:
+ break;
+ case ARMII::AddrModeT2_i7:
+ case ARMII::AddrModeT2_i7s2:
+ case ARMII::AddrModeT2_i7s4:
+ case ARMII::AddrModeT2_i8:
+ case ARMII::AddrModeT2_i8pos:
+ case ARMII::AddrModeT2_i8neg:
+ case ARMII::AddrModeT2_i8s4:
+ case ARMII::AddrModeT2_i12: {
+ uint32_t Imm = 0;
+ for (auto Op : MI.operands()) {
+ if (Op.isImm()) {
+ Imm = Op.getImm();
+ break;
+ }
+ }
+ if (!isLegalAddressImm(MI.getOpcode(), Imm, this)) {
+ ErrInfo = "Incorrect AddrMode Imm for instruction";
+ return false;
+ }
+ break;
+ }
+ }
return true;
}
diff --git a/llvm/lib/Target/ARM/ARMCallLowering.cpp b/llvm/lib/Target/ARM/ARMCallLowering.cpp
index 81ec4d09a408..b15ef094d9d2 100644
--- a/llvm/lib/Target/ARM/ARMCallLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -534,7 +534,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &
MIRBuilder.buildInstr(ARM::ADJCALLSTACKUP)
.addImm(ArgAssigner.StackOffset)
- .addImm(0)
+ .addImm(-1ULL)
.add(predOps(ARMCC::AL));
return true;
diff --git a/llvm/lib/Target/ARM/ARMCallLowering.h b/llvm/lib/Target/ARM/ARMCallLowering.h
index 87b18f811747..38095617fb4f 100644
--- a/llvm/lib/Target/ARM/ARMCallLowering.h
+++ b/llvm/lib/Target/ARM/ARMCallLowering.h
@@ -23,7 +23,6 @@
namespace llvm {
class ARMTargetLowering;
-class MachineFunction;
class MachineInstrBuilder;
class MachineIRBuilder;
class Value;
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index fa244786a80d..2f083561bbd4 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1144,7 +1144,7 @@ static bool determineFPRegsToClear(const MachineInstr &MI,
if (!Op.isReg())
continue;
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
if (Op.isDef()) {
if ((Reg >= ARM::Q0 && Reg <= ARM::Q7) ||
(Reg >= ARM::D0 && Reg <= ARM::D15) ||
@@ -1356,7 +1356,7 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8(
std::vector<unsigned> NonclearedFPRegs;
for (const MachineOperand &Op : MBBI->operands()) {
if (Op.isReg() && Op.isUse()) {
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
assert(!ARM::DPRRegClass.contains(Reg) ||
ARM::DPR_VFP2RegClass.contains(Reg));
assert(!ARM::QPRRegClass.contains(Reg));
@@ -1451,9 +1451,9 @@ void ARMExpandPseudo::CMSESaveClearFPRegsV8(
// restore FPSCR from stack and clear bits 0-4, 7, 28-31
// The other bits are program global according to the AAPCS
if (passesFPReg) {
- BuildMI(MBB, MBBI, DL, TII->get(ARM::t2LDRi8), SpareReg)
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tLDRspi), SpareReg)
.addReg(ARM::SP)
- .addImm(0x40)
+ .addImm(0x10)
.add(predOps(ARMCC::AL));
BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), SpareReg)
.addReg(SpareReg)
@@ -1543,7 +1543,7 @@ void ARMExpandPseudo::CMSERestoreFPRegsV8(
std::vector<unsigned> NonclearedFPRegs;
for (const MachineOperand &Op : MBBI->operands()) {
if (Op.isReg() && Op.isDef()) {
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
assert(!ARM::DPRRegClass.contains(Reg) ||
ARM::DPR_VFP2RegClass.contains(Reg));
assert(!ARM::QPRRegClass.contains(Reg));
@@ -1663,7 +1663,7 @@ static bool definesOrUsesFPReg(const MachineInstr &MI) {
for (const MachineOperand &Op : MI.operands()) {
if (!Op.isReg())
continue;
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
if ((Reg >= ARM::Q0 && Reg <= ARM::Q7) ||
(Reg >= ARM::D0 && Reg <= ARM::D15) ||
(Reg >= ARM::S0 && Reg <= ARM::S31))
@@ -2201,7 +2201,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
}
case ARM::tBLXNS_CALL: {
DebugLoc DL = MBBI->getDebugLoc();
- unsigned JumpReg = MBBI->getOperand(0).getReg();
+ Register JumpReg = MBBI->getOperand(0).getReg();
// Figure out which registers are live at the point immediately before the
// call. When we indiscriminately push a set of registers, the live
diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp
index 28a076edd6dc..5d94b99d4c5d 100644
--- a/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -319,7 +319,7 @@ unsigned ARMFastISel::fastEmitInst_r(unsigned MachineInstOpcode,
unsigned ARMFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, unsigned Op1) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
// Make sure the input operands are sufficiently constrained to be legal
@@ -346,7 +346,7 @@ unsigned ARMFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
unsigned ARMFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
unsigned Op0, uint64_t Imm) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
// Make sure the input operand is sufficiently constrained to be legal
@@ -371,7 +371,7 @@ unsigned ARMFastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
unsigned ARMFastISel::fastEmitInst_i(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
uint64_t Imm) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
if (II.getNumDefs() >= 1) {
@@ -392,7 +392,7 @@ unsigned ARMFastISel::fastEmitInst_i(unsigned MachineInstOpcode,
unsigned ARMFastISel::ARMMoveToFPReg(MVT VT, unsigned SrcReg) {
if (VT == MVT::f64) return 0;
- unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT));
+ Register MoveReg = createResultReg(TLI.getRegClassFor(VT));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VMOVSR), MoveReg)
.addReg(SrcReg));
@@ -402,7 +402,7 @@ unsigned ARMFastISel::ARMMoveToFPReg(MVT VT, unsigned SrcReg) {
unsigned ARMFastISel::ARMMoveToIntReg(MVT VT, unsigned SrcReg) {
if (VT == MVT::i64) return 0;
- unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT));
+ Register MoveReg = createResultReg(TLI.getRegClassFor(VT));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VMOVRS), MoveReg)
.addReg(SrcReg));
@@ -428,7 +428,7 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
Imm = ARM_AM::getFP32Imm(Val);
Opc = ARM::FCONSTS;
}
- unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+ Register DestReg = createResultReg(TLI.getRegClassFor(VT));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), DestReg).addImm(Imm));
return DestReg;
@@ -440,7 +440,7 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
// MachineConstantPool wants an explicit alignment.
Align Alignment = DL.getPrefTypeAlign(CFP->getType());
unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Alignment);
- unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+ Register DestReg = createResultReg(TLI.getRegClassFor(VT));
unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS;
// The extra reg is for addrmode5.
@@ -462,7 +462,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
unsigned Opc = isThumb2 ? ARM::t2MOVi16 : ARM::MOVi16;
const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass :
&ARM::GPRRegClass;
- unsigned ImmReg = createResultReg(RC);
+ Register ImmReg = createResultReg(RC);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ImmReg)
.addImm(CI->getZExtValue()));
@@ -478,7 +478,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
unsigned Opc = isThumb2 ? ARM::t2MVNi : ARM::MVNi;
const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass :
&ARM::GPRRegClass;
- unsigned ImmReg = createResultReg(RC);
+ Register ImmReg = createResultReg(RC);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ImmReg)
.addImm(Imm));
@@ -531,7 +531,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
bool IsIndirect = Subtarget->isGVIndirectSymbol(GV);
const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass
: &ARM::GPRRegClass;
- unsigned DestReg = createResultReg(RC);
+ Register DestReg = createResultReg(RC);
// FastISel TLS support on non-MachO is broken, punt to SelectionDAG.
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
@@ -589,7 +589,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
if (IsPositionIndependent) {
unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD;
- unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+ Register NewDestReg = createResultReg(TLI.getRegClassFor(VT));
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
DbgLoc, TII.get(Opc), NewDestReg)
@@ -605,7 +605,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
(Subtarget->isTargetMachO() && IsIndirect) ||
Subtarget->genLongCalls()) {
MachineInstrBuilder MIB;
- unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+ Register NewDestReg = createResultReg(TLI.getRegClassFor(VT));
if (isThumb2)
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::t2LDRi12), NewDestReg)
@@ -657,7 +657,7 @@ unsigned ARMFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
if (SI != FuncInfo.StaticAllocaMap.end()) {
unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
const TargetRegisterClass* RC = TLI.getRegClassFor(VT);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
ResultReg = constrainOperandRegClass(TII.get(Opc), ResultReg, 0);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -832,7 +832,7 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) {
if (needsLowering && Addr.BaseType == Address::FrameIndexBase) {
const TargetRegisterClass *RC = isThumb2 ? &ARM::tGPRRegClass
: &ARM::GPRRegClass;
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
unsigned Opc = isThumb2 ? ARM::t2ADDri : ARM::ADDri;
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg)
@@ -991,7 +991,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, Register &ResultReg, Address &Addr,
// If we had an unaligned load of a float we've converted it to a regular
// load. Now we must move from the GPR to the FP register.
if (needVMOV) {
- unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+ Register MoveReg = createResultReg(TLI.getRegClassFor(MVT::f32));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VMOVSR), MoveReg)
.addReg(ResultReg));
@@ -1044,7 +1044,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
// This is mostly going to be Neon/vector support.
default: return false;
case MVT::i1: {
- unsigned Res = createResultReg(isThumb2 ? &ARM::tGPRRegClass
+ Register Res = createResultReg(isThumb2 ? &ARM::tGPRRegClass
: &ARM::GPRRegClass);
unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
SrcReg = constrainOperandRegClass(TII.get(Opc), SrcReg, 1);
@@ -1095,7 +1095,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
if (!Subtarget->hasVFP2Base()) return false;
// Unaligned stores need special handling. Floats require word-alignment.
if (Alignment && Alignment < 4) {
- unsigned MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+ Register MoveReg = createResultReg(TLI.getRegClassFor(MVT::i32));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VMOVRS), MoveReg)
.addReg(SrcReg));
@@ -1257,7 +1257,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
(isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) {
unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
- unsigned OpReg = getRegForValue(TI->getOperand(0));
+ Register OpReg = getRegForValue(TI->getOperand(0));
OpReg = constrainOperandRegClass(TII.get(TstOpc), OpReg, 0);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TstOpc))
@@ -1284,7 +1284,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
return true;
}
- unsigned CmpReg = getRegForValue(BI->getCondition());
+ Register CmpReg = getRegForValue(BI->getCondition());
if (CmpReg == 0) return false;
// We've been divorced from our compare! Our block was split, and
@@ -1315,7 +1315,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
}
bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
- unsigned AddrReg = getRegForValue(I->getOperand(0));
+ Register AddrReg = getRegForValue(I->getOperand(0));
if (AddrReg == 0) return false;
unsigned Opc = isThumb2 ? ARM::tBRIND : ARM::BX;
@@ -1406,7 +1406,7 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
break;
}
- unsigned SrcReg1 = getRegForValue(Src1Value);
+ Register SrcReg1 = getRegForValue(Src1Value);
if (SrcReg1 == 0) return false;
unsigned SrcReg2 = 0;
@@ -1468,7 +1468,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
unsigned MovCCOpc = isThumb2 ? ARM::t2MOVCCi : ARM::MOVCCi;
const TargetRegisterClass *RC = isThumb2 ? &ARM::rGPRRegClass
: &ARM::GPRRegClass;
- unsigned DestReg = createResultReg(RC);
+ Register DestReg = createResultReg(RC);
Constant *Zero = ConstantInt::get(Type::getInt32Ty(*Context), 0);
unsigned ZeroReg = fastMaterializeConstant(Zero);
// ARMEmitCmp emits a FMSTAT when necessary, so it's always safe to use CPSR.
@@ -1488,10 +1488,10 @@ bool ARMFastISel::SelectFPExt(const Instruction *I) {
if (!I->getType()->isDoubleTy() ||
!V->getType()->isFloatTy()) return false;
- unsigned Op = getRegForValue(V);
+ Register Op = getRegForValue(V);
if (Op == 0) return false;
- unsigned Result = createResultReg(&ARM::DPRRegClass);
+ Register Result = createResultReg(&ARM::DPRRegClass);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VCVTDS), Result)
.addReg(Op));
@@ -1507,10 +1507,10 @@ bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
if (!(I->getType()->isFloatTy() &&
V->getType()->isDoubleTy())) return false;
- unsigned Op = getRegForValue(V);
+ Register Op = getRegForValue(V);
if (Op == 0) return false;
- unsigned Result = createResultReg(&ARM::SPRRegClass);
+ Register Result = createResultReg(&ARM::SPRRegClass);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::VCVTSD), Result)
.addReg(Op));
@@ -1535,7 +1535,7 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8)
return false;
- unsigned SrcReg = getRegForValue(Src);
+ Register SrcReg = getRegForValue(Src);
if (SrcReg == 0) return false;
// Handle sign-extension.
@@ -1556,7 +1556,7 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
Opc = isSigned ? ARM::VSITOD : ARM::VUITOD;
else return false;
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg).addReg(FP));
updateValueMap(I, ResultReg);
@@ -1572,7 +1572,7 @@ bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
if (!isTypeLegal(RetTy, DstVT))
return false;
- unsigned Op = getRegForValue(I->getOperand(0));
+ Register Op = getRegForValue(I->getOperand(0));
if (Op == 0) return false;
unsigned Opc;
@@ -1583,7 +1583,7 @@ bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
else return false;
// f64->s32/u32 or f32->s32/u32 both need an intermediate f32 reg.
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg).addReg(Op));
@@ -1604,9 +1604,9 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
// Things need to be register sized for register moves.
if (VT != MVT::i32) return false;
- unsigned CondReg = getRegForValue(I->getOperand(0));
+ Register CondReg = getRegForValue(I->getOperand(0));
if (CondReg == 0) return false;
- unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ Register Op1Reg = getRegForValue(I->getOperand(1));
if (Op1Reg == 0) return false;
// Check to see if we can use an immediate in the conditional move.
@@ -1649,7 +1649,7 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
else
MovCCOpc = isThumb2 ? ARM::t2MVNCCi : ARM::MVNCCi;
}
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
if (!UseImm) {
Op2Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op2Reg, 1);
Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 2);
@@ -1752,15 +1752,15 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
break;
}
- unsigned SrcReg1 = getRegForValue(I->getOperand(0));
+ Register SrcReg1 = getRegForValue(I->getOperand(0));
if (SrcReg1 == 0) return false;
// TODO: Often the 2nd operand is an immediate, which can be encoded directly
// in the instruction, rather than materializing the value in a register.
- unsigned SrcReg2 = getRegForValue(I->getOperand(1));
+ Register SrcReg2 = getRegForValue(I->getOperand(1));
if (SrcReg2 == 0) return false;
- unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
+ Register ResultReg = createResultReg(&ARM::GPRnopcRegClass);
SrcReg1 = constrainOperandRegClass(TII.get(Opc), SrcReg1, 1);
SrcReg2 = constrainOperandRegClass(TII.get(Opc), SrcReg2, 2);
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -1803,13 +1803,13 @@ bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
Opc = is64bit ? ARM::VMULD : ARM::VMULS;
break;
}
- unsigned Op1 = getRegForValue(I->getOperand(0));
+ Register Op1 = getRegForValue(I->getOperand(0));
if (Op1 == 0) return false;
- unsigned Op2 = getRegForValue(I->getOperand(1));
+ Register Op2 = getRegForValue(I->getOperand(1));
if (Op2 == 0) return false;
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg)
.addReg(Op1).addReg(Op2));
@@ -2022,7 +2022,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs,
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(AdjStackUp))
- .addImm(NumBytes).addImm(0));
+ .addImm(NumBytes).addImm(-1ULL));
// Now the return value.
if (RetVT != MVT::isVoid) {
@@ -2101,7 +2101,7 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
F.isVarArg()));
const Value *RV = Ret->getOperand(0);
- unsigned Reg = getRegForValue(RV);
+ Register Reg = getRegForValue(RV);
if (Reg == 0)
return false;
@@ -2226,7 +2226,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
ArgVTs.reserve(I->getNumOperands());
ArgFlags.reserve(I->getNumOperands());
for (Value *Op : I->operands()) {
- unsigned Arg = getRegForValue(Op);
+ Register Arg = getRegForValue(Op);
if (Arg == 0) return false;
Type *ArgTy = Op->getType();
@@ -2588,7 +2588,7 @@ bool ARMFastISel::SelectTrunc(const Instruction *I) {
if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
return false;
- unsigned SrcReg = getRegForValue(Op);
+ Register SrcReg = getRegForValue(Op);
if (!SrcReg) return false;
// Because the high bits are undefined, a truncate doesn't generate
@@ -2744,7 +2744,7 @@ bool ARMFastISel::SelectIntExt(const Instruction *I) {
Type *SrcTy = Src->getType();
bool isZExt = isa<ZExtInst>(I);
- unsigned SrcReg = getRegForValue(Src);
+ Register SrcReg = getRegForValue(Src);
if (!SrcReg) return false;
EVT SrcEVT, DestEVT;
@@ -2788,7 +2788,7 @@ bool ARMFastISel::SelectShift(const Instruction *I,
}
Value *Src1Value = I->getOperand(0);
- unsigned Reg1 = getRegForValue(Src1Value);
+ Register Reg1 = getRegForValue(Src1Value);
if (Reg1 == 0) return false;
unsigned Reg2 = 0;
@@ -2797,7 +2797,7 @@ bool ARMFastISel::SelectShift(const Instruction *I,
if (Reg2 == 0) return false;
}
- unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
+ Register ResultReg = createResultReg(&ARM::GPRnopcRegClass);
if(ResultReg == 0) return false;
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -2975,7 +2975,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, MVT VT) {
MIB.add(predOps(ARMCC::AL));
// Fix the address by adding pc.
- unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+ Register DestReg = createResultReg(TLI.getRegClassFor(VT));
Opc = Subtarget->isThumb() ? ARM::tPICADD : UseGOT_PREL ? ARM::PICLDR
: ARM::PICADD;
DestReg = constrainOperandRegClass(TII.get(Opc), DestReg, 0);
@@ -2987,7 +2987,7 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, MVT VT) {
MIB.add(predOps(ARMCC::AL));
if (UseGOT_PREL && Subtarget->isThumb()) {
- unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+ Register NewDestReg = createResultReg(TLI.getRegClassFor(VT));
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(ARM::t2LDRi12), NewDestReg)
.addReg(DestReg)
@@ -3057,11 +3057,11 @@ bool ARMFastISel::fastLowerArguments() {
for (const Argument &Arg : F->args()) {
unsigned ArgNo = Arg.getArgNo();
unsigned SrcReg = GPRArgRegs[ArgNo];
- unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+ Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
// Without this, EmitLiveInCopies may eliminate the livein if its only
// use is a bitcast (which isn't turned into an instruction).
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY),
ResultReg).addReg(DstReg, getKillRegState(true));
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 4b59f9cb94ce..1f2f6f7497e0 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -516,7 +516,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// Determine spill area sizes.
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
int FI = I.getFrameIdx();
switch (Reg) {
case ARM::R8:
@@ -751,7 +751,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock::iterator Pos = std::next(GPRCS1Push);
int CFIIndex;
for (const auto &Entry : CSI) {
- unsigned Reg = Entry.getReg();
+ Register Reg = Entry.getReg();
int FI = Entry.getFrameIdx();
switch (Reg) {
case ARM::R8:
@@ -784,7 +784,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
if (GPRCS2Size > 0) {
MachineBasicBlock::iterator Pos = std::next(GPRCS2Push);
for (const auto &Entry : CSI) {
- unsigned Reg = Entry.getReg();
+ Register Reg = Entry.getReg();
int FI = Entry.getFrameIdx();
switch (Reg) {
case ARM::R8:
@@ -794,7 +794,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
case ARM::R12:
if (STI.splitFramePushPop(MF)) {
unsigned DwarfReg = MRI->getDwarfRegNum(
- Reg == ARM::R12 ? (unsigned)ARM::RA_AUTH_CODE : Reg, true);
+ Reg == ARM::R12 ? ARM::RA_AUTH_CODE : Reg, true);
unsigned Offset = MFI.getObjectOffset(FI);
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
@@ -812,7 +812,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// instructions in the prologue.
MachineBasicBlock::iterator Pos = std::next(LastPush);
for (const auto &Entry : CSI) {
- unsigned Reg = Entry.getReg();
+ Register Reg = Entry.getReg();
int FI = Entry.getFrameIdx();
if ((Reg >= ARM::D0 && Reg <= ARM::D31) &&
(Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) {
@@ -1144,7 +1144,7 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
while (i != 0) {
unsigned LastReg = 0;
for (; i != 0; --i) {
- unsigned Reg = CSI[i-1].getReg();
+ Register Reg = CSI[i-1].getReg();
if (!(Func)(Reg, STI.splitFramePushPop(MF))) continue;
// D-registers in the aligned area DPRCS2 are NOT spilled here.
@@ -1237,7 +1237,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
bool DeleteRet = false;
for (; i != 0; --i) {
CalleeSavedInfo &Info = CSI[i-1];
- unsigned Reg = Info.getReg();
+ Register Reg = Info.getReg();
if (!(Func)(Reg, STI.splitFramePushPop(MF))) continue;
// The aligned reloads from area DPRCS2 are not inserted here.
@@ -1812,7 +1812,7 @@ bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
// shrinkwrapping can cause clobbering of r12 when the PAC code is
// generated. A follow-up patch will fix this in a more performant manner.
if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(
- false /*SpillsLR */))
+ true /* SpillsLR */))
return false;
return true;
@@ -2353,7 +2353,7 @@ bool ARMFrameLowering::assignCalleeSavedSpillSlots(
// LR, R7, R6, R5, R4, <R12>, R11, R10, R9, R8, D15-D8
CSI.insert(find_if(CSI,
[=](const auto &CS) {
- unsigned Reg = CS.getReg();
+ Register Reg = CS.getReg();
return Reg == ARM::R10 || Reg == ARM::R11 ||
Reg == ARM::R8 || Reg == ARM::R9 ||
ARM::DPRRegClass.contains(Reg);
diff --git a/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp b/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
index f083fa6662e9..0d201a67af46 100644
--- a/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/llvm/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -164,7 +164,7 @@ static bool getBaseOffset(const MachineInstr &MI, const MachineOperand *&BaseOp,
ARMBankConflictHazardRecognizer::ARMBankConflictHazardRecognizer(
const ScheduleDAG *DAG, int64_t CPUBankMask, bool CPUAssumeITCMConflict)
- : ScheduleHazardRecognizer(), MF(DAG->MF), DL(DAG->MF.getDataLayout()),
+ : MF(DAG->MF), DL(DAG->MF.getDataLayout()),
DataMask(DataBankMask.getNumOccurrences() ? int64_t(DataBankMask)
: CPUBankMask),
AssumeITCMBankConflict(AssumeITCMConflict.getNumOccurrences()
diff --git a/llvm/lib/Target/ARM/ARMHazardRecognizer.h b/llvm/lib/Target/ARM/ARMHazardRecognizer.h
index c1f1bcd0a629..66a1477e5e08 100644
--- a/llvm/lib/Target/ARM/ARMHazardRecognizer.h
+++ b/llvm/lib/Target/ARM/ARMHazardRecognizer.h
@@ -34,7 +34,7 @@ class ARMHazardRecognizerFPMLx : public ScheduleHazardRecognizer {
unsigned FpMLxStalls = 0;
public:
- ARMHazardRecognizerFPMLx() : ScheduleHazardRecognizer() { MaxLookAhead = 1; }
+ ARMHazardRecognizerFPMLx() { MaxLookAhead = 1; }
HazardType getHazardType(SUnit *SU, int Stalls) override;
void Reset() override;
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index bb2859c766c2..98c8133282a2 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -3227,7 +3227,7 @@ bool ARMDAGToDAGISel::transformFixedFloatingPointConversion(SDNode *N,
if (!ImmAPF.getExactInverse(&ToConvert))
return false;
}
- APSInt Converted(64, 0);
+ APSInt Converted(64, false);
bool IsExact;
ToConvert.convertToInteger(Converted, llvm::RoundingMode::NearestTiesToEven,
&IsExact);
@@ -5737,8 +5737,7 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
// them into a GPRPair.
SDLoc dl(N);
- SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1)
- : SDValue(nullptr,0);
+ SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps - 1) : SDValue();
SmallVector<bool, 8> OpChanged;
// Glue node will be appended late.
@@ -5801,8 +5800,8 @@ bool ARMDAGToDAGISel::tryInlineAsm(SDNode *N){
assert((i+2 < NumOps) && "Invalid number of operands in inline asm");
SDValue V0 = N->getOperand(i+1);
SDValue V1 = N->getOperand(i+2);
- unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg();
- unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg();
+ Register Reg0 = cast<RegisterSDNode>(V0)->getReg();
+ Register Reg1 = cast<RegisterSDNode>(V1)->getReg();
SDValue PairedReg;
MachineRegisterInfo &MRI = MF->getRegInfo();
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 3d45db349644..fe4e6b24367a 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2899,7 +2899,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
unsigned Bytes = Arg.getValueSizeInBits() / 8;
int FI = std::numeric_limits<int>::max();
if (Arg.getOpcode() == ISD::CopyFromReg) {
- unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+ Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
if (!Register::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
@@ -4018,7 +4018,7 @@ SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
assert(Mask && "Missing call preserved mask for calling convention");
// Mark LR an implicit live-in.
- unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
+ Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
SDValue ReturnAddress =
DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
@@ -4272,7 +4272,7 @@ SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
RC = &ARM::GPRRegClass;
// Transform the arguments stored in physical registers into virtual ones.
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
SDValue ArgValue2;
@@ -4342,7 +4342,7 @@ int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
- unsigned VReg = MF.addLiveIn(Reg, RC);
+ Register VReg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo(OrigArg, 4 * i));
@@ -4527,7 +4527,7 @@ SDValue ARMTargetLowering::LowerFormalArguments(
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
// Transform the arguments in physical registers into virtual ones.
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
// If this value is passed in r0 and has the returned attribute (e.g.
@@ -6065,7 +6065,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
}
// Return LR, which contains the return address. Mark it an implicit live-in.
- unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
+ Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
@@ -14682,7 +14682,9 @@ static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// Check that N is CMPZ(CSINC(0, 0, CC, X)), return X if valid.
+// Check that N is CMPZ(CSINC(0, 0, CC, X)) or
+// CMPZ(CMOV(1, 0, CC, $cpsr, X)), returning X if valid.
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
return SDValue();
@@ -14696,12 +14698,24 @@ static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
CSInc = CSInc.getOperand(0);
- if (CSInc.getOpcode() != ARMISD::CSINC ||
- !isNullConstant(CSInc.getOperand(0)) ||
- !isNullConstant(CSInc.getOperand(1)) || !CSInc->hasOneUse())
- return SDValue();
- CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
- return CSInc.getOperand(3);
+ if (CSInc.getOpcode() == ARMISD::CSINC &&
+ isNullConstant(CSInc.getOperand(0)) &&
+ isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
+ CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
+ return CSInc.getOperand(3);
+ }
+ if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
+ isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
+ CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
+ return CSInc.getOperand(4);
+ }
+ if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
+ isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
+ CC = ARMCC::getOppositeCondition(
+ (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
+ return CSInc.getOperand(4);
+ }
+ return SDValue();
}
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
@@ -15412,13 +15426,13 @@ static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
return SDValue();
SDLoc DL(Trunc);
- if (isVMOVNTruncMask(N->getMask(), VT, 0))
+ if (isVMOVNTruncMask(N->getMask(), VT, false))
return DAG.getNode(
ARMISD::VMOVN, DL, VT,
DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
DAG.getConstant(1, DL, MVT::i32));
- else if (isVMOVNTruncMask(N->getMask(), VT, 1))
+ else if (isVMOVNTruncMask(N->getMask(), VT, true))
return DAG.getNode(
ARMISD::VMOVN, DL, VT,
DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
@@ -18218,13 +18232,13 @@ SDValue ARMTargetLowering::PerformMVETruncCombine(
SmallVector<int, 8> Mask(S0->getMask().begin(), S0->getMask().end());
Mask.append(S1->getMask().begin(), S1->getMask().end());
- if (isVMOVNTruncMask(Mask, VT, 0))
+ if (isVMOVNTruncMask(Mask, VT, false))
return DAG.getNode(
ARMISD::VMOVN, DL, VT,
DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
DAG.getConstant(1, DL, MVT::i32));
- if (isVMOVNTruncMask(Mask, VT, 1))
+ if (isVMOVNTruncMask(Mask, VT, true))
return DAG.getNode(
ARMISD::VMOVN, DL, VT,
DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
@@ -20775,10 +20789,10 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.memVT = MVT::getVT(PtrTy->getPointerElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+ Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
@@ -20787,10 +20801,10 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.memVT = MVT::getVT(PtrTy->getPointerElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+ Info.align = DL.getABITypeAlign(PtrTy->getPointerElementType());
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
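
Earlier in this file, IsCMPZCSINC is widened to also accept CMPZ(CMOV(1, 0, CC, $cpsr, X)); when the constants appear swapped as (0, 1), the condition is flipped with ARMCC::getOppositeCondition before X is returned. A hedged sketch of just that operand-order/condition-inversion idea, using made-up node and condition-code types rather than the SelectionDAG API:

#include <optional>
#include <utility>

// Toy condition codes and a toy 0/1-select node; illustrative stand-ins only,
// not ARMCC or ARMISD types.
enum class CondCode { EQ, NE, GE, LT };

static CondCode invert(CondCode CC) {
  switch (CC) {
  case CondCode::EQ: return CondCode::NE;
  case CondCode::NE: return CondCode::EQ;
  case CondCode::GE: return CondCode::LT;
  case CondCode::LT: return CondCode::GE;
  }
  return CC;
}

struct CMovNode {
  int Op0 = 0, Op1 = 0;        // The two constant inputs being selected between.
  CondCode CC = CondCode::EQ;  // Condition the select is taken on.
  int Flags = 0;               // Stand-in for the flag-producing operand X.
};

// Match CMOV(1, 0, CC, X) directly, or CMOV(0, 1, CC, X) with the condition
// inverted; otherwise report no match.
static std::optional<std::pair<int, CondCode>>
matchZeroOneSelect(const CMovNode &N) {
  if (N.Op0 == 1 && N.Op1 == 0)
    return std::make_pair(N.Flags, N.CC);
  if (N.Op0 == 0 && N.Op1 == 1)
    return std::make_pair(N.Flags, invert(N.CC));
  return std::nullopt;
}
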
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/llvm/lib/Target/ARM/ARMInstrInfo.cpp
index 5dee5e04af81..00db13f2eb52 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -28,8 +28,7 @@
#include "llvm/MC/MCInst.h"
using namespace llvm;
-ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
- : ARMBaseInstrInfo(STI), RI() {}
+ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI) : ARMBaseInstrInfo(STI) {}
/// Return the noop instruction to use for a noop.
MCInst ARMInstrInfo::getNop() const {
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index aaf3280ea150..357aa6d062e9 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -4526,64 +4526,48 @@ let Predicates = [HasNEON, HasV8_1a] in {
defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s",
null_frag>;
- def : Pat<(v4i16 (saddsat
- (v4i16 DPR:$src1),
- (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn),
- (v4i16 DPR:$Vm))))),
+ def : Pat<(v4i16 (int_arm_neon_vqrdmlah (v4i16 DPR:$src1), (v4i16 DPR:$Vn),
+ (v4i16 DPR:$Vm))),
(v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v2i32 (saddsat
- (v2i32 DPR:$src1),
- (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn),
- (v2i32 DPR:$Vm))))),
+ def : Pat<(v2i32 (int_arm_neon_vqrdmlah (v2i32 DPR:$src1), (v2i32 DPR:$Vn),
+ (v2i32 DPR:$Vm))),
(v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v8i16 (saddsat
- (v8i16 QPR:$src1),
- (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn),
- (v8i16 QPR:$Vm))))),
+ def : Pat<(v8i16 (int_arm_neon_vqrdmlah (v8i16 QPR:$src1), (v8i16 QPR:$Vn),
+ (v8i16 QPR:$Vm))),
(v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
- def : Pat<(v4i32 (saddsat
- (v4i32 QPR:$src1),
- (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn),
- (v4i32 QPR:$Vm))))),
+ def : Pat<(v4i32 (int_arm_neon_vqrdmlah (v4i32 QPR:$src1), (v4i32 QPR:$Vn),
+ (v4i32 QPR:$Vm))),
(v4i32 (VQRDMLAHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s",
null_frag>;
- def : Pat<(v4i16 (saddsat
- (v4i16 DPR:$src1),
- (v4i16 (int_arm_neon_vqrdmulh
+ def : Pat<(v4i16 (int_arm_neon_vqrdmlah (v4i16 DPR:$src1),
(v4i16 DPR:$Vn),
(v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
- imm:$lane)))))),
+ imm:$lane)))),
(v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm,
imm:$lane))>;
- def : Pat<(v2i32 (saddsat
- (v2i32 DPR:$src1),
- (v2i32 (int_arm_neon_vqrdmulh
+ def : Pat<(v2i32 (int_arm_neon_vqrdmlah (v2i32 DPR:$src1),
(v2i32 DPR:$Vn),
(v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
- imm:$lane)))))),
+ imm:$lane)))),
(v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
- def : Pat<(v8i16 (saddsat
- (v8i16 QPR:$src1),
- (v8i16 (int_arm_neon_vqrdmulh
+ def : Pat<(v8i16 (int_arm_neon_vqrdmlah (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
(v8i16 (ARMvduplane (v8i16 QPR:$src3),
- imm:$lane)))))),
+ imm:$lane)))),
(v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
(v4i16 (EXTRACT_SUBREG
QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
- def : Pat<(v4i32 (saddsat
- (v4i32 QPR:$src1),
- (v4i32 (int_arm_neon_vqrdmulh
+ def : Pat<(v4i32 (int_arm_neon_vqrdmlah (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
(v4i32 (ARMvduplane (v4i32 QPR:$src3),
- imm:$lane)))))),
+ imm:$lane)))),
(v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
(v2i32 (EXTRACT_SUBREG
@@ -4596,63 +4580,47 @@ let Predicates = [HasNEON, HasV8_1a] in {
defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s",
null_frag>;
- def : Pat<(v4i16 (ssubsat
- (v4i16 DPR:$src1),
- (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn),
- (v4i16 DPR:$Vm))))),
+ def : Pat<(v4i16 (int_arm_neon_vqrdmlsh (v4i16 DPR:$src1), (v4i16 DPR:$Vn),
+ (v4i16 DPR:$Vm))),
(v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v2i32 (ssubsat
- (v2i32 DPR:$src1),
- (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn),
- (v2i32 DPR:$Vm))))),
+ def : Pat<(v2i32 (int_arm_neon_vqrdmlsh (v2i32 DPR:$src1), (v2i32 DPR:$Vn),
+ (v2i32 DPR:$Vm))),
(v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>;
- def : Pat<(v8i16 (ssubsat
- (v8i16 QPR:$src1),
- (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn),
- (v8i16 QPR:$Vm))))),
+ def : Pat<(v8i16 (int_arm_neon_vqrdmlsh (v8i16 QPR:$src1), (v8i16 QPR:$Vn),
+ (v8i16 QPR:$Vm))),
(v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
- def : Pat<(v4i32 (ssubsat
- (v4i32 QPR:$src1),
- (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn),
- (v4i32 QPR:$Vm))))),
+ def : Pat<(v4i32 (int_arm_neon_vqrdmlsh (v4i32 QPR:$src1), (v4i32 QPR:$Vn),
+ (v4i32 QPR:$Vm))),
(v4i32 (VQRDMLSHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>;
defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D,
IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s",
null_frag>;
- def : Pat<(v4i16 (ssubsat
- (v4i16 DPR:$src1),
- (v4i16 (int_arm_neon_vqrdmulh
+ def : Pat<(v4i16 (int_arm_neon_vqrdmlsh (v4i16 DPR:$src1),
(v4i16 DPR:$Vn),
(v4i16 (ARMvduplane (v4i16 DPR_8:$Vm),
- imm:$lane)))))),
+ imm:$lane)))),
(v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>;
- def : Pat<(v2i32 (ssubsat
- (v2i32 DPR:$src1),
- (v2i32 (int_arm_neon_vqrdmulh
+ def : Pat<(v2i32 (int_arm_neon_vqrdmlsh (v2i32 DPR:$src1),
(v2i32 DPR:$Vn),
(v2i32 (ARMvduplane (v2i32 DPR_VFP2:$Vm),
- imm:$lane)))))),
+ imm:$lane)))),
(v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm,
imm:$lane))>;
- def : Pat<(v8i16 (ssubsat
- (v8i16 QPR:$src1),
- (v8i16 (int_arm_neon_vqrdmulh
+ def : Pat<(v8i16 (int_arm_neon_vqrdmlsh (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
(v8i16 (ARMvduplane (v8i16 QPR:$src3),
- imm:$lane)))))),
+ imm:$lane)))),
(v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1),
(v8i16 QPR:$src2),
(v4i16 (EXTRACT_SUBREG
QPR:$src3,
(DSubReg_i16_reg imm:$lane))),
(SubReg_i16_lane imm:$lane)))>;
- def : Pat<(v4i32 (ssubsat
- (v4i32 QPR:$src1),
- (v4i32 (int_arm_neon_vqrdmulh
+ def : Pat<(v4i32 (int_arm_neon_vqrdmlsh (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
(v4i32 (ARMvduplane (v4i32 QPR:$src3),
- imm:$lane)))))),
+ imm:$lane)))),
(v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1),
(v4i32 QPR:$src2),
(v2i32 (EXTRACT_SUBREG
diff --git a/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
index 8be4e3f160e3..188b5562cac9 100644
--- a/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -171,8 +171,8 @@ createARMInstructionSelector(const ARMBaseTargetMachine &TM,
ARMInstructionSelector::ARMInstructionSelector(const ARMBaseTargetMachine &TM,
const ARMSubtarget &STI,
const ARMRegisterBankInfo &RBI)
- : InstructionSelector(), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI), STI(STI), Opcodes(STI),
+ : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), TM(TM), RBI(RBI),
+ STI(STI), Opcodes(STI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "ARMGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
index 6649750bb388..ff4647dd46fd 100644
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -15,4 +15,4 @@ using namespace llvm;
void ARMRegisterInfo::anchor() { }
-ARMRegisterInfo::ARMRegisterInfo() : ARMBaseRegisterInfo() {}
+ARMRegisterInfo::ARMRegisterInfo() {}
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.h b/llvm/lib/Target/ARM/ARMRegisterInfo.h
index 87c0f322d3b3..2971b765a6fc 100644
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.h
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.h
@@ -17,8 +17,6 @@
namespace llvm {
-class ARMSubtarget;
-
struct ARMRegisterInfo : public ARMBaseRegisterInfo {
virtual void anchor();
public:
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index 36c4bbaafcbf..2dd25234dc50 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -15,7 +15,6 @@
#include "ARMCallLowering.h"
#include "ARMLegalizerInfo.h"
#include "ARMRegisterBankInfo.h"
-#include "ARMSubtarget.h"
#include "ARMFrameLowering.h"
#include "ARMInstrInfo.h"
#include "ARMSubtarget.h"
@@ -35,6 +34,7 @@
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ARMTargetParser.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index e61b90af31b0..1c2b7ee6ba35 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -121,6 +121,7 @@ protected:
ARMv85a,
ARMv86a,
ARMv87a,
+ ARMv88a,
ARMv8a,
ARMv8mBaseline,
ARMv8mMainline,
@@ -129,6 +130,7 @@ protected:
ARMv9a,
ARMv91a,
ARMv92a,
+ ARMv93a,
};
public:
@@ -174,10 +176,12 @@ protected:
bool HasV8_4aOps = false;
bool HasV8_5aOps = false;
bool HasV8_6aOps = false;
+ bool HasV8_8aOps = false;
bool HasV8_7aOps = false;
bool HasV9_0aOps = false;
bool HasV9_1aOps = false;
bool HasV9_2aOps = false;
+ bool HasV9_3aOps = false;
bool HasV8MBaselineOps = false;
bool HasV8MMainlineOps = false;
bool HasV8_1MMainlineOps = false;
@@ -635,9 +639,11 @@ public:
bool hasV8_5aOps() const { return HasV8_5aOps; }
bool hasV8_6aOps() const { return HasV8_6aOps; }
bool hasV8_7aOps() const { return HasV8_7aOps; }
+ bool hasV8_8aOps() const { return HasV8_8aOps; }
bool hasV9_0aOps() const { return HasV9_0aOps; }
bool hasV9_1aOps() const { return HasV9_1aOps; }
bool hasV9_2aOps() const { return HasV9_2aOps; }
+ bool hasV9_3aOps() const { return HasV9_3aOps; }
bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
bool hasV8_1MMainlineOps() const { return HasV8_1MMainlineOps; }
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 0b314ac2a41e..c38970f8e341 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -43,6 +43,7 @@
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ARMTargetParser.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index 8c5438f7093b..936cae17f004 100644
--- a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -54,9 +54,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
}
}
-const MCRegister ARMElfTargetObjectFile::getStaticBase() const {
- return ARM::R9;
-}
+MCRegister ARMElfTargetObjectFile::getStaticBase() const { return ARM::R9; }
const MCExpr *ARMElfTargetObjectFile::
getIndirectSymViaRWPI(const MCSymbol *Sym) const {
diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.h b/llvm/lib/Target/ARM/ARMTargetObjectFile.h
index 8b13198fe144..47334b9a8a45 100644
--- a/llvm/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.h
@@ -17,14 +17,13 @@ namespace llvm {
class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
public:
- ARMElfTargetObjectFile()
- : TargetLoweringObjectFileELF() {
+ ARMElfTargetObjectFile() {
PLTRelativeVariantKind = MCSymbolRefExpr::VK_ARM_PREL31;
}
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
- const MCRegister getStaticBase() const override;
+ MCRegister getStaticBase() const override;
const MCExpr *getIndirectSymViaRWPI(const MCSymbol *Sym) const override;
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 602c6745d310..e0750a9945d2 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1116,18 +1116,6 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
return false;
- // This method is called in 2 places:
- // - from the vectorizer with a scalar type, in which case we need to get
- // this as good as we can with the limited info we have (and rely on the cost
- // model for the rest).
- // - from the masked intrinsic lowering pass with the actual vector type.
- // For MVE, we have a custom lowering pass that will already have custom
- // legalised any gathers that we can to MVE intrinsics, and want to expand all
- // the rest. The pass runs before the masked intrinsic lowering pass, so if we
- // are here, we know we want to expand.
- if (isa<VectorType>(Ty))
- return false;
-
unsigned EltWidth = Ty->getScalarSizeInBits();
return ((EltWidth == 32 && Alignment >= 4) ||
(EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
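
With the vector-type early-out moved into forceScalarizeMaskedGather (next file), the legality rule kept here reduces to the element-width/alignment combinations on the last two lines. A tiny sketch of that predicate in isolation, with Align modelled as a plain byte count for illustration:

// Illustrative only: EltWidthBits/AlignBytes stand in for the scalar size in
// bits and the llvm::Align value used above.
inline bool mveGatherElementLegal(unsigned EltWidthBits, unsigned AlignBytes) {
  return (EltWidthBits == 32 && AlignBytes >= 4) ||
         (EltWidthBits == 16 && AlignBytes >= 2) || EltWidthBits == 8;
}
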
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index a56886d4fc11..5bb84899e5ef 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -189,6 +189,18 @@ public:
return isLegalMaskedLoad(DataTy, Alignment);
}
+ bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
+ // For MVE, we have a custom lowering pass that will already have custom
+ // legalised any gathers that we can lower to MVE intrinsics, and want to
+ // expand all the rest. The pass runs before the masked intrinsic lowering
+ // pass.
+ return true;
+ }
+
+ bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
+ return forceScalarizeMaskedGather(VTy, Alignment);
+ }
+
bool isLegalMaskedGather(Type *Ty, Align Alignment);
bool isLegalMaskedScatter(Type *Ty, Align Alignment) {
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index bfe078b06861..c7734cc2cf11 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -921,7 +921,7 @@ class ARMOperand : public MCParsedAsmOperand {
};
public:
- ARMOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+ ARMOperand(KindTy K) : Kind(K) {}
/// getStartLoc - Get the location of the first token of this operand.
SMLoc getStartLoc() const override { return StartLoc; }
@@ -1870,7 +1870,7 @@ public:
}
template <int shift> bool isMemRegRQOffset() const {
- if (!isMVEMem() || Memory.OffsetImm != 0 || Memory.Alignment != 0)
+ if (!isMVEMem() || Memory.OffsetImm != nullptr || Memory.Alignment != 0)
return false;
if (!ARMMCRegisterClasses[ARM::GPRnopcRegClassID].contains(
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 851acea94022..23430dfc017a 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -1049,11 +1049,11 @@ void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
unsigned Kind = Fixup.getKind();
if (Kind >= FirstLiteralRelocationKind)
return;
- unsigned NumBytes = getFixupKindNumBytes(Kind);
MCContext &Ctx = Asm.getContext();
Value = adjustFixupValue(Asm, Fixup, Target, Value, IsResolved, Ctx, STI);
if (!Value)
return; // Doesn't change encoding.
+ const unsigned NumBytes = getFixupKindNumBytes(Kind);
unsigned Offset = Fixup.getOffset();
assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
@@ -1123,9 +1123,8 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
DenseMap<unsigned, int> RegOffsets;
int FloatRegCount = 0;
// Process each .cfi directive and build up compact unwind info.
- for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
+ for (const MCCFIInstruction &Inst : Instrs) {
unsigned Reg;
- const MCCFIInstruction &Inst = Instrs[i];
switch (Inst.getOperation()) {
case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa
CFARegisterOffset = Inst.getOffset();
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 37d81e4b0af1..df8f54d14a86 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -87,7 +87,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
if (IsPCRel) {
switch (Fixup.getTargetKind()) {
default:
- Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
+ Ctx.reportError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
case FK_Data_4:
switch (Modifier) {
@@ -159,7 +159,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
}
switch (Kind) {
default:
- Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
+ Ctx.reportError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
case FK_Data_1:
switch (Modifier) {
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index e060e59e3759..16bc0ca179a7 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -264,10 +264,8 @@ void ARMTargetAsmStreamer::emitInst(uint32_t Inst, char Suffix) {
void ARMTargetAsmStreamer::emitUnwindRaw(int64_t Offset,
const SmallVectorImpl<uint8_t> &Opcodes) {
OS << "\t.unwind_raw " << Offset;
- for (SmallVectorImpl<uint8_t>::const_iterator OCI = Opcodes.begin(),
- OCE = Opcodes.end();
- OCI != OCE; ++OCI)
- OS << ", 0x" << Twine::utohexstr(*OCI);
+ for (uint8_t Opcode : Opcodes)
+ OS << ", 0x" << Twine::utohexstr(Opcode);
OS << '\n';
}
@@ -788,6 +786,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
case ARM::ArchKind::ARMV9A:
case ARM::ArchKind::ARMV9_1A:
case ARM::ArchKind::ARMV9_2A:
+ case ARM::ArchKind::ARMV9_3A:
S.setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
S.setAttributeItem(ARM_ISA_use, Allowed, false);
S.setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 05e5a473a3c6..17ca1866cf95 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -338,8 +338,8 @@ void ARM_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) {
{codeview::RegisterId::ARM_NQ14, ARM::Q14},
{codeview::RegisterId::ARM_NQ15, ARM::Q15},
};
- for (unsigned I = 0; I < array_lengthof(RegMap); ++I)
- MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg));
+ for (const auto &I : RegMap)
+ MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg));
}
static MCRegisterInfo *createARMMCRegisterInfo(const Triple &Triple) {
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 7ccdc6f85500..5c8f9bfdca08 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -36,8 +36,6 @@ class MCTargetStreamer;
class StringRef;
class Target;
class Triple;
-class raw_ostream;
-class raw_pwrite_stream;
namespace ARM_MC {
std::string ParseARMTriple(const Triple &TT, StringRef CPU);
diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index 54e80a095dd4..71a82a1e3271 100644
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -167,7 +167,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
DebugLoc dl;
Register FramePtr = RegInfo->getFrameRegister(MF);
- unsigned BasePtr = RegInfo->getBaseRegister();
+ Register BasePtr = RegInfo->getBaseRegister();
int CFAOffset = 0;
// Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
@@ -206,7 +206,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
}
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
int FI = I.getFrameIdx();
switch (Reg) {
case ARM::R8:
@@ -267,7 +267,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlags(MachineInstr::FrameSetup);
}
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
int FI = I.getFrameIdx();
switch (Reg) {
case ARM::R8:
@@ -348,7 +348,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
// Emit call frame information for the callee-saved high registers.
for (auto &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
int FI = I.getFrameIdx();
switch (Reg) {
case ARM::R8:
@@ -376,7 +376,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
// at this point in the prologue, so pick one.
unsigned ScratchRegister = ARM::NoRegister;
for (auto &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
ScratchRegister = Reg;
break;
@@ -531,7 +531,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
unsigned ScratchRegister = ARM::NoRegister;
bool HasFP = hasFP(MF);
for (auto &I : MFI.getCalleeSavedInfo()) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (isARMLowRegister(Reg) && !(HasFP && Reg == FramePtr)) {
ScratchRegister = Reg;
break;
@@ -825,7 +825,7 @@ bool Thumb1FrameLowering::spillCalleeSavedRegisters(
// LoRegs for saving HiRegs.
for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) {
LoRegsToSave[Reg] = true;
@@ -949,7 +949,7 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters(
ARMRegSet CopyRegs;
for (CalleeSavedInfo I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR) {
LoRegsToRestore[Reg] = true;
@@ -1022,7 +1022,7 @@ bool Thumb1FrameLowering::restoreCalleeSavedRegisters(
bool NeedsPop = false;
for (CalleeSavedInfo &Info : llvm::reverse(CSI)) {
- unsigned Reg = Info.getReg();
+ Register Reg = Info.getReg();
// High registers (excluding lr) have already been dealt with
if (!(ARM::tGPRRegClass.contains(Reg) || Reg == ARM::LR))
diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
index 4b18f5e20d40..1a36c2ca9152 100644
--- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -21,7 +21,7 @@
using namespace llvm;
Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
- : ARMBaseInstrInfo(STI), RI() {}
+ : ARMBaseInstrInfo(STI) {}
/// Return the noop instruction to use for a noop.
MCInst Thumb1InstrInfo::getNop() const {
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index e6d51796ba4d..a83ff5e51004 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -18,7 +18,6 @@
namespace llvm {
class ARMSubtarget;
-class ScheduleHazardRecognizer;
class Thumb2InstrInfo : public ARMBaseInstrInfo {
ThumbRegisterInfo RI;
diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
index 1164b6ebbac3..1cc5422523f1 100644
--- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -1147,9 +1147,8 @@ bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
// predecessors.
ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
bool Modified = false;
- for (ReversePostOrderTraversal<MachineFunction*>::rpo_iterator
- I = RPOT.begin(), E = RPOT.end(); I != E; ++I)
- Modified |= ReduceMBB(**I);
+ for (MachineBasicBlock *MBB : RPOT)
+ Modified |= ReduceMBB(*MBB);
return Modified;
}
diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
index 4da6f6ab6994..5d2bc4ebe191 100644
--- a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -37,7 +37,7 @@ extern cl::opt<bool> ReuseFrameIndexVals;
using namespace llvm;
-ThumbRegisterInfo::ThumbRegisterInfo() : ARMBaseRegisterInfo() {}
+ThumbRegisterInfo::ThumbRegisterInfo() {}
const TargetRegisterClass *
ThumbRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
diff --git a/llvm/lib/Target/AVR/AVR.h b/llvm/lib/Target/AVR/AVR.h
index 143c339c0664..0b512172ba10 100644
--- a/llvm/lib/Target/AVR/AVR.h
+++ b/llvm/lib/Target/AVR/AVR.h
@@ -28,7 +28,6 @@ FunctionPass *createAVRISelDag(AVRTargetMachine &TM,
FunctionPass *createAVRExpandPseudoPass();
FunctionPass *createAVRFrameAnalyzerPass();
FunctionPass *createAVRRelaxMemPass();
-FunctionPass *createAVRDynAllocaSRPass();
FunctionPass *createAVRBranchSelectionPass();
void initializeAVRShiftExpandPass(PassRegistry &);
@@ -39,17 +38,56 @@ void initializeAVRRelaxMemPass(PassRegistry &);
namespace AVR {
/// An integer that identifies all of the supported AVR address spaces.
-enum AddressSpace { DataMemory, ProgramMemory };
+enum AddressSpace {
+ DataMemory,
+ ProgramMemory,
+ ProgramMemory1,
+ ProgramMemory2,
+ ProgramMemory3,
+ ProgramMemory4,
+ ProgramMemory5,
+ NumAddrSpaces,
+};
/// Checks if a given type is a pointer to program memory.
template <typename T> bool isProgramMemoryAddress(T *V) {
- return cast<PointerType>(V->getType())->getAddressSpace() == ProgramMemory;
+ auto *PT = cast<PointerType>(V->getType());
+ assert(PT != nullptr && "expected a pointer type");
+ return PT->getAddressSpace() == ProgramMemory ||
+ PT->getAddressSpace() == ProgramMemory1 ||
+ PT->getAddressSpace() == ProgramMemory2 ||
+ PT->getAddressSpace() == ProgramMemory3 ||
+ PT->getAddressSpace() == ProgramMemory4 ||
+ PT->getAddressSpace() == ProgramMemory5;
+}
+
+template <typename T> AddressSpace getAddressSpace(T *V) {
+ auto *PT = cast<PointerType>(V->getType());
+ assert(PT != nullptr && "expected a pointer type");
+ unsigned AS = PT->getAddressSpace();
+ if (AS < NumAddrSpaces)
+ return static_cast<AddressSpace>(AS);
+ return NumAddrSpaces;
}
inline bool isProgramMemoryAccess(MemSDNode const *N) {
- auto V = N->getMemOperand()->getValue();
+ auto *V = N->getMemOperand()->getValue();
+ if (V != nullptr && isProgramMemoryAddress(V))
+ return true;
+ return false;
+}
- return (V != nullptr) ? isProgramMemoryAddress(V) : false;
+// Get the index of the program memory bank.
+// -1: not program memory
+// 0: ordinary program memory
+// 1~5: extended program memory
+inline int getProgramMemoryBank(MemSDNode const *N) {
+ auto *V = N->getMemOperand()->getValue();
+ if (V == nullptr || !isProgramMemoryAddress(V))
+ return -1;
+ AddressSpace AS = getAddressSpace(V);
+ assert(ProgramMemory <= AS && AS <= ProgramMemory5);
+ return static_cast<int>(AS - ProgramMemory);
}
} // end of namespace AVR
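A minimal standalone sketch of the bank mapping introduced above, not taken from the patch; it only assumes the AddressSpace numbering in this header (DataMemory == 0, ProgramMemory == 1, ..., ProgramMemory5 == 6), so a program-memory address space N maps to bank N - 1 and everything else maps to -1.

#include <cstdio>

// Hypothetical standalone mirror of AVR::getProgramMemoryBank().
static int bankForAddressSpace(unsigned AS) {
  if (AS < 1 || AS > 6)              // 0 is data memory; >6 is not an AVR space
    return -1;                       // not program memory
  return static_cast<int>(AS) - 1;   // 0 = ordinary flash, 1..5 = extended banks
}

int main() {
  for (unsigned AS = 0; AS <= 7; ++AS)
    std::printf("addrspace(%u) -> bank %d\n", AS, bankForAddressSpace(AS));
  return 0;
}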
diff --git a/llvm/lib/Target/AVR/AVRCallingConv.td b/llvm/lib/Target/AVR/AVRCallingConv.td
index 87874c5c50b2..b4bc35e191c0 100644
--- a/llvm/lib/Target/AVR/AVRCallingConv.td
+++ b/llvm/lib/Target/AVR/AVRCallingConv.td
@@ -36,4 +36,4 @@ def ArgCC_AVR_Vararg : CallingConv<[
//===----------------------------------------------------------------------===//
def CSR_Normal : CalleeSavedRegs<(add R29, R28, (sequence "R%u", 17, 2))>;
-def CSR_Interrupts : CalleeSavedRegs<(add(sequence "R%u", 31, 0))>;
+def CSR_Interrupts : CalleeSavedRegs<(add(sequence "R%u", 31, 2))>;
diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
index cb85d73772c5..144ae2b320f9 100644
--- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -92,6 +92,7 @@ private:
/// Specific shift implementation.
bool expandLSLB7Rd(Block &MBB, BlockIt MBBI);
bool expandLSRB7Rd(Block &MBB, BlockIt MBBI);
+ bool expandASRB6Rd(Block &MBB, BlockIt MBBI);
bool expandASRB7Rd(Block &MBB, BlockIt MBBI);
bool expandLSLW4Rd(Block &MBB, BlockIt MBBI);
bool expandLSRW4Rd(Block &MBB, BlockIt MBBI);
@@ -101,6 +102,9 @@ private:
bool expandLSLW12Rd(Block &MBB, BlockIt MBBI);
bool expandLSRW12Rd(Block &MBB, BlockIt MBBI);
+ // Common implementation of LPMWRdZ and ELPMWRdZ.
+ bool expandLPMWELPMW(Block &MBB, BlockIt MBBI, bool IsExt);
+
/// Scavenges a free GPR8 register for use.
Register scavengeGPR8(MachineInstr &MI);
};
@@ -808,18 +812,25 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) {
return true;
}
-template <>
-bool AVRExpandPseudo::expand<AVR::LPMWRdZ>(Block &MBB, BlockIt MBBI) {
+bool AVRExpandPseudo::expandLPMWELPMW(Block &MBB, BlockIt MBBI, bool IsExt) {
MachineInstr &MI = *MBBI;
Register DstLoReg, DstHiReg;
Register DstReg = MI.getOperand(0).getReg();
Register TmpReg = 0; // 0 for no temporary register
Register SrcReg = MI.getOperand(1).getReg();
bool SrcIsKill = MI.getOperand(1).isKill();
- unsigned OpLo = AVR::LPMRdZPi;
- unsigned OpHi = AVR::LPMRdZ;
+ unsigned OpLo = IsExt ? AVR::ELPMRdZPi : AVR::LPMRdZPi;
+ unsigned OpHi = IsExt ? AVR::ELPMRdZ : AVR::LPMRdZ;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+ // Set the I/O register RAMPZ for ELPM.
+ if (IsExt) {
+ const AVRSubtarget &STI = MBB.getParent()->getSubtarget<AVRSubtarget>();
+ Register Bank = MI.getOperand(2).getReg();
+ // out RAMPZ, rtmp
+ buildMI(MBB, MBBI, AVR::OUTARr).addImm(STI.getIORegRAMPZ()).addReg(Bank);
+ }
+
// Use a temporary register if src and dst registers are the same.
if (DstReg == SrcReg)
TmpReg = scavengeGPR8(MI);
@@ -857,8 +868,51 @@ bool AVRExpandPseudo::expand<AVR::LPMWRdZ>(Block &MBB, BlockIt MBBI) {
}
template <>
+bool AVRExpandPseudo::expand<AVR::LPMWRdZ>(Block &MBB, BlockIt MBBI) {
+ return expandLPMWELPMW(MBB, MBBI, false);
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ELPMWRdZ>(Block &MBB, BlockIt MBBI) {
+ return expandLPMWELPMW(MBB, MBBI, true);
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ELPMBRdZ>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register BankReg = MI.getOperand(2).getReg();
+ bool SrcIsKill = MI.getOperand(1).isKill();
+ const AVRSubtarget &STI = MBB.getParent()->getSubtarget<AVRSubtarget>();
+
+ // Set the I/O register RAMPZ for ELPM (out RAMPZ, rtmp).
+ buildMI(MBB, MBBI, AVR::OUTARr).addImm(STI.getIORegRAMPZ()).addReg(BankReg);
+
+ // Load byte.
+ auto MILB = buildMI(MBB, MBBI, AVR::ELPMRdZ)
+ .addReg(DstReg, RegState::Define)
+ .addReg(SrcReg, getKillRegState(SrcIsKill));
+
+ MILB.setMemRefs(MI.memoperands());
+
+ MI.eraseFromParent();
+ return true;
+}
+
+template <>
bool AVRExpandPseudo::expand<AVR::LPMWRdZPi>(Block &MBB, BlockIt MBBI) {
- llvm_unreachable("wide LPMPi is unimplemented");
+ llvm_unreachable("16-bit LPMPi is unimplemented");
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ELPMBRdZPi>(Block &MBB, BlockIt MBBI) {
+ llvm_unreachable("byte ELPMPi is unimplemented");
+}
+
+template <>
+bool AVRExpandPseudo::expand<AVR::ELPMWRdZPi>(Block &MBB, BlockIt MBBI) {
+ llvm_unreachable("16-bit ELPMPi is unimplemented");
}
template <typename Func>
@@ -1411,6 +1465,30 @@ bool AVRExpandPseudo::expand<AVR::LSLWRd>(Block &MBB, BlockIt MBBI) {
return true;
}
+template <>
+bool AVRExpandPseudo::expand<AVR::LSLWHiRd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(2).isDead();
+ TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+ // add hireg, hireg <==> lsl hireg
+ auto MILSL =
+ buildMI(MBB, MBBI, AVR::ADDRdRr)
+ .addReg(DstHiReg, RegState::Define, getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, getKillRegState(DstIsKill))
+ .addReg(DstHiReg, getKillRegState(DstIsKill));
+
+ if (ImpIsDead)
+ MILSL->getOperand(3).setIsDead();
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AVRExpandPseudo::expandLSLW4Rd(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register DstLoReg, DstHiReg;
@@ -1586,6 +1664,29 @@ bool AVRExpandPseudo::expand<AVR::LSRWRd>(Block &MBB, BlockIt MBBI) {
return true;
}
+template <>
+bool AVRExpandPseudo::expand<AVR::LSRWLoRd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(2).isDead();
+ TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+ // lsr loreg
+ auto MILSR =
+ buildMI(MBB, MBBI, AVR::LSRRd)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill));
+
+ if (ImpIsDead)
+ MILSR->getOperand(2).setIsDead();
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AVRExpandPseudo::expandLSRW4Rd(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register DstLoReg, DstHiReg;
@@ -1773,6 +1874,29 @@ bool AVRExpandPseudo::expand<AVR::ASRWRd>(Block &MBB, BlockIt MBBI) {
return true;
}
+template <>
+bool AVRExpandPseudo::expand<AVR::ASRWLoRd>(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+ bool ImpIsDead = MI.getOperand(2).isDead();
+ TRI->splitReg(DstReg, DstLoReg, DstHiReg);
+
+ // asr loreg
+ auto MIASR =
+ buildMI(MBB, MBBI, AVR::ASRRd)
+ .addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstLoReg, getKillRegState(DstIsKill));
+
+ if (ImpIsDead)
+ MIASR->getOperand(2).setIsDead();
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AVRExpandPseudo::expandASRW8Rd(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register DstLoReg, DstHiReg;
@@ -1921,6 +2045,44 @@ bool AVRExpandPseudo::expand<AVR::LSRBNRd>(Block &MBB, BlockIt MBBI) {
}
}
+bool AVRExpandPseudo::expandASRB6Rd(Block &MBB, BlockIt MBBI) {
+ MachineInstr &MI = *MBBI;
+ Register DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+ bool DstIsKill = MI.getOperand(1).isKill();
+
+ // bst r24, 6
+ // lsl r24
+ // sbc r24, r24
+ // bld r24, 0
+
+ buildMI(MBB, MBBI, AVR::BST)
+ .addReg(DstReg)
+ .addImm(6)
+ ->getOperand(2)
+ .setIsUndef(true);
+
+ buildMI(MBB, MBBI, AVR::ADDRdRr) // LSL Rd <==> ADD Rd, Rd
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ .addReg(DstReg, getKillRegState(DstIsKill));
+
+ buildMI(MBB, MBBI, AVR::SBCRdRr)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ .addReg(DstReg, getKillRegState(DstIsKill));
+
+ buildMI(MBB, MBBI, AVR::BLD)
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstReg, getKillRegState(DstIsKill))
+ .addImm(0)
+ ->getOperand(3)
+ .setIsKill();
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AVRExpandPseudo::expandASRB7Rd(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
Register DstReg = MI.getOperand(0).getReg();
@@ -1957,6 +2119,8 @@ bool AVRExpandPseudo::expand<AVR::ASRBNRd>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
unsigned Imm = MI.getOperand(2).getImm();
switch (Imm) {
+ case 6:
+ return expandASRB6Rd(MBB, MBBI);
case 7:
return expandASRB7Rd(MBB, MBBI);
default:
@@ -2158,6 +2322,10 @@ bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) {
EXPAND(AVR::LDDWRdPtrQ);
EXPAND(AVR::LPMWRdZ);
EXPAND(AVR::LPMWRdZPi);
+ EXPAND(AVR::ELPMBRdZ);
+ EXPAND(AVR::ELPMWRdZ);
+ EXPAND(AVR::ELPMBRdZPi);
+ EXPAND(AVR::ELPMWRdZPi);
EXPAND(AVR::AtomicLoad8);
EXPAND(AVR::AtomicLoad16);
EXPAND(AVR::AtomicStore8);
@@ -2189,6 +2357,9 @@ bool AVRExpandPseudo::expandMI(Block &MBB, BlockIt MBBI) {
EXPAND(AVR::RORWRd);
EXPAND(AVR::ROLWRd);
EXPAND(AVR::ASRWRd);
+ EXPAND(AVR::LSLWHiRd);
+ EXPAND(AVR::LSRWLoRd);
+ EXPAND(AVR::ASRWLoRd);
EXPAND(AVR::LSLWNRd);
EXPAND(AVR::LSRWNRd);
EXPAND(AVR::ASRWNRd);
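A host-side check, not part of the patch, that the bst/lsl/sbc/bld sequence emitted by expandASRB6Rd really is an 8-bit arithmetic shift right by 6: bit 0 of the result is the original bit 6 and every other bit is a copy of the original sign bit.

#include <cassert>
#include <cstdint>

// Emulates bst r,6 / lsl r / sbc r,r / bld r,0 and compares it with an
// arithmetic shift right by 6 for every 8-bit input.
static uint8_t emulateAsrb6(uint8_t Rd) {
  const unsigned T = (Rd >> 6) & 1;  // bst Rd, 6: T <- bit 6
  const unsigned C = (Rd >> 7) & 1;  // lsl Rd: C <- bit 7 (the shifted value
                                     // itself is overwritten by sbc)
  Rd = C ? 0xFF : 0x00;              // sbc Rd, Rd: Rd <- 0 - C (sign mask)
  Rd = static_cast<uint8_t>((Rd & 0xFE) | T); // bld Rd, 0: bit 0 <- T
  return Rd;
}

int main() {
  for (int V = 0; V < 256; ++V) {
    const uint8_t Expected =
        static_cast<uint8_t>(static_cast<int8_t>(V) >> 6);
    assert(emulateAsrb6(static_cast<uint8_t>(V)) == Expected);
  }
  return 0;
}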
diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp
index 543d94875037..b3bc9ede205e 100644
--- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp
@@ -79,11 +79,6 @@ void AVRFrameLowering::emitPrologue(MachineFunction &MF,
.addReg(AVR::R0, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, DL, TII.get(AVR::EORRdRr))
- .addReg(AVR::R0, RegState::Define)
- .addReg(AVR::R0, RegState::Kill)
- .addReg(AVR::R0, RegState::Kill)
- .setMIFlag(MachineInstr::FrameSetup);
- BuildMI(MBB, MBBI, DL, TII.get(AVR::EORRdRr))
.addReg(AVR::R1, RegState::Define)
.addReg(AVR::R1, RegState::Kill)
.addReg(AVR::R1, RegState::Kill)
@@ -176,7 +171,7 @@ void AVRFrameLowering::emitEpilogue(MachineFunction &MF,
const AVRInstrInfo &TII = *STI.getInstrInfo();
// Early exit if there is no need to restore the frame pointer.
- if (!FrameSize) {
+ if (!FrameSize && !MF.getFrameInfo().hasVarSizedObjects()) {
restoreStatusRegister(MF, MBB);
return;
}
@@ -193,22 +188,24 @@ void AVRFrameLowering::emitEpilogue(MachineFunction &MF,
--MBBI;
}
- unsigned Opcode;
+ if (FrameSize) {
+ unsigned Opcode;
- // Select the optimal opcode depending on how big it is.
- if (isUInt<6>(FrameSize)) {
- Opcode = AVR::ADIWRdK;
- } else {
- Opcode = AVR::SUBIWRdK;
- FrameSize = -FrameSize;
- }
+ // Select the optimal opcode depending on how big it is.
+ if (isUInt<6>(FrameSize)) {
+ Opcode = AVR::ADIWRdK;
+ } else {
+ Opcode = AVR::SUBIWRdK;
+ FrameSize = -FrameSize;
+ }
- // Restore the frame pointer by doing FP += <size>.
- MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opcode), AVR::R29R28)
- .addReg(AVR::R29R28, RegState::Kill)
- .addImm(FrameSize);
- // The SREG implicit def is dead.
- MI->getOperand(3).setIsDead();
+ // Restore the frame pointer by doing FP += <size>.
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opcode), AVR::R29R28)
+ .addReg(AVR::R29R28, RegState::Kill)
+ .addImm(FrameSize);
+ // The SREG implicit def is dead.
+ MI->getOperand(3).setIsDead();
+ }
// Write back R29R28 to SP and temporarily disable interrupts.
BuildMI(MBB, MBBI, DL, TII.get(AVR::SPWRITE), AVR::SP)
@@ -230,7 +227,8 @@ bool AVRFrameLowering::hasFP(const MachineFunction &MF) const {
const AVRMachineFunctionInfo *FuncInfo = MF.getInfo<AVRMachineFunctionInfo>();
return (FuncInfo->getHasSpills() || FuncInfo->getHasAllocas() ||
- FuncInfo->getHasStackArgs());
+ FuncInfo->getHasStackArgs() ||
+ MF.getFrameInfo().hasVarSizedObjects());
}
bool AVRFrameLowering::spillCalleeSavedRegisters(
@@ -248,7 +246,7 @@ bool AVRFrameLowering::spillCalleeSavedRegisters(
AVRMachineFunctionInfo *AVRFI = MF.getInfo<AVRMachineFunctionInfo>();
for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
bool IsNotLiveIn = !MBB.isLiveIn(Reg);
assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 &&
@@ -286,7 +284,7 @@ bool AVRFrameLowering::restoreCalleeSavedRegisters(
const TargetInstrInfo &TII = *STI.getInstrInfo();
for (const CalleeSavedInfo &CCSI : CSI) {
- unsigned Reg = CCSI.getReg();
+ Register Reg = CCSI.getReg();
assert(TRI->getRegSizeInBits(*TRI->getMinimalPhysRegClass(Reg)) == 8 &&
"Invalid register size");
@@ -480,56 +478,4 @@ char AVRFrameAnalyzer::ID = 0;
/// Creates instance of the frame analyzer pass.
FunctionPass *createAVRFrameAnalyzerPass() { return new AVRFrameAnalyzer(); }
-/// Create the Dynalloca Stack Pointer Save/Restore pass.
-/// Insert a copy of SP before allocating the dynamic stack memory and restore
-/// it in function exit to restore the original SP state. This avoids the need
-/// of reserving a register pair for a frame pointer.
-struct AVRDynAllocaSR : public MachineFunctionPass {
- static char ID;
- AVRDynAllocaSR() : MachineFunctionPass(ID) {}
-
- bool runOnMachineFunction(MachineFunction &MF) override {
- // Early exit when there are no variable sized objects in the function.
- if (!MF.getFrameInfo().hasVarSizedObjects()) {
- return false;
- }
-
- const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
- const TargetInstrInfo &TII = *STI.getInstrInfo();
- MachineBasicBlock &EntryMBB = MF.front();
- MachineBasicBlock::iterator MBBI = EntryMBB.begin();
- DebugLoc DL = EntryMBB.findDebugLoc(MBBI);
-
- Register SPCopy =
- MF.getRegInfo().createVirtualRegister(&AVR::DREGSRegClass);
-
- // Create a copy of SP in function entry before any dynallocas are
- // inserted.
- BuildMI(EntryMBB, MBBI, DL, TII.get(AVR::COPY), SPCopy).addReg(AVR::SP);
-
- // Restore SP in all exit basic blocks.
- for (MachineBasicBlock &MBB : MF) {
- // If last instruction is a return instruction, add a restore copy.
- if (!MBB.empty() && MBB.back().isReturn()) {
- MBBI = MBB.getLastNonDebugInstr();
- DL = MBBI->getDebugLoc();
- BuildMI(MBB, MBBI, DL, TII.get(AVR::COPY), AVR::SP)
- .addReg(SPCopy, RegState::Kill);
- }
- }
-
- return true;
- }
-
- StringRef getPassName() const override {
- return "AVR dynalloca stack pointer save/restore";
- }
-};
-
-char AVRDynAllocaSR::ID = 0;
-
-/// createAVRDynAllocaSRPass - returns an instance of the dynalloca stack
-/// pointer save/restore pass.
-FunctionPass *createAVRDynAllocaSRPass() { return new AVRDynAllocaSR(); }
-
} // end of namespace llvm
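A sketch of the epilogue opcode choice above, not part of the patch. It assumes ADIW accepts only a 6-bit unsigned immediate (0..63), which is why larger frame sizes are restored through SUBIWRdK with the negated size.

#include <cstdint>
#include <cstdio>

struct Restore { const char *Opcode; int64_t Imm; };

// Mirrors the "select the optimal opcode" logic in emitEpilogue.
static Restore pickFrameRestore(int64_t FrameSize) {
  if (FrameSize >= 0 && FrameSize <= 63) // isUInt<6>(FrameSize)
    return {"ADIWRdK", FrameSize};       // FP += FrameSize directly
  return {"SUBIWRdK", -FrameSize};       // FP -= -FrameSize
}

int main() {
  for (int64_t Size : {1, 63, 64, 200}) {
    Restore R = pickFrameRestore(Size);
    std::printf("FrameSize=%lld -> %s %lld\n", (long long)Size, R.Opcode,
                (long long)R.Imm);
  }
  return 0;
}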
diff --git a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
index 7ec2629ab45d..df364cae671c 100644
--- a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -38,7 +38,7 @@ public:
bool SelectAddr(SDNode *Op, SDValue N, SDValue &Base, SDValue &Disp);
bool selectIndexedLoad(SDNode *N);
- unsigned selectIndexedProgMemLoad(const LoadSDNode *LD, MVT VT);
+ unsigned selectIndexedProgMemLoad(const LoadSDNode *LD, MVT VT, int Bank);
bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintCode,
std::vector<SDValue> &OutOps) override;
@@ -165,35 +165,31 @@ bool AVRDAGToDAGISel::selectIndexedLoad(SDNode *N) {
return true;
}
-unsigned AVRDAGToDAGISel::selectIndexedProgMemLoad(const LoadSDNode *LD,
- MVT VT) {
- ISD::MemIndexedMode AM = LD->getAddressingMode();
-
+unsigned AVRDAGToDAGISel::selectIndexedProgMemLoad(const LoadSDNode *LD, MVT VT,
+ int Bank) {
// Progmem indexed loads only work in POSTINC mode.
- if (LD->getExtensionType() != ISD::NON_EXTLOAD || AM != ISD::POST_INC) {
+ if (LD->getExtensionType() != ISD::NON_EXTLOAD ||
+ LD->getAddressingMode() != ISD::POST_INC)
return 0;
- }
+
+ // Feature ELPM is needed for loading from extended program memory.
+ assert((Bank == 0 || Subtarget->hasELPM()) &&
+ "cannot load from extended program memory on this mcu");
unsigned Opcode = 0;
int Offs = cast<ConstantSDNode>(LD->getOffset())->getSExtValue();
switch (VT.SimpleTy) {
- case MVT::i8: {
- if (Offs != 1) {
- return 0;
- }
- Opcode = AVR::LPMRdZPi;
+ case MVT::i8:
+ if (Offs == 1)
+ Opcode = Bank > 0 ? AVR::ELPMBRdZPi : AVR::LPMRdZPi;
break;
- }
- case MVT::i16: {
- if (Offs != 2) {
- return 0;
- }
- Opcode = AVR::LPMWRdZPi;
+ case MVT::i16:
+ if (Offs == 2)
+ Opcode = Bank > 0 ? AVR::ELPMWRdZPi : AVR::LPMWRdZPi;
break;
- }
default:
- return 0;
+ break;
}
return Opcode;
@@ -360,7 +356,12 @@ template <> bool AVRDAGToDAGISel::select<ISD::LOAD>(SDNode *N) {
return selectIndexedLoad(N);
}
- assert(Subtarget->hasLPM() && "cannot load from program memory on this mcu");
+ if (!Subtarget->hasLPM())
+ report_fatal_error("cannot load from program memory on this mcu");
+
+ int ProgMemBank = AVR::getProgramMemoryBank(LD);
+ if (ProgMemBank < 0 || ProgMemBank > 5)
+ report_fatal_error("unexpected program memory bank");
// This is a flash memory load, move the pointer into R31R30 and emit
// the lpm instruction.
@@ -374,25 +375,48 @@ template <> bool AVRDAGToDAGISel::select<ISD::LOAD>(SDNode *N) {
Ptr = CurDAG->getCopyFromReg(Chain, DL, AVR::R31R30, MVT::i16,
Chain.getValue(1));
- SDValue RegZ = CurDAG->getRegister(AVR::R31R30, MVT::i16);
-
// Check if the opcode can be converted into an indexed load.
- if (unsigned LPMOpc = selectIndexedProgMemLoad(LD, VT)) {
+ if (unsigned LPMOpc = selectIndexedProgMemLoad(LD, VT, ProgMemBank)) {
// It is legal to fold the load into an indexed load.
- ResNode =
- CurDAG->getMachineNode(LPMOpc, DL, VT, MVT::i16, MVT::Other, Ptr, RegZ);
- ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
+ if (ProgMemBank == 0) {
+ ResNode =
+ CurDAG->getMachineNode(LPMOpc, DL, VT, MVT::i16, MVT::Other, Ptr);
+ } else {
+ // Do not combine the LDI instruction into the ELPM pseudo instruction,
+ // since it may be reused by other ELPM pseudo instructions.
+ SDValue NC = CurDAG->getTargetConstant(ProgMemBank, DL, MVT::i8);
+ auto *NP = CurDAG->getMachineNode(AVR::LDIRdK, DL, MVT::i8, NC);
+ ResNode = CurDAG->getMachineNode(LPMOpc, DL, VT, MVT::i16, MVT::Other,
+ Ptr, SDValue(NP, 0));
+ }
} else {
// Selecting an indexed load is not legal, fallback to a normal load.
switch (VT.SimpleTy) {
case MVT::i8:
- ResNode = CurDAG->getMachineNode(AVR::LPMRdZ, DL, MVT::i8, MVT::Other,
- Ptr, RegZ);
+ if (ProgMemBank == 0) {
+ ResNode =
+ CurDAG->getMachineNode(AVR::LPMRdZ, DL, MVT::i8, MVT::Other, Ptr);
+ } else {
+ // Do not combine the LDI instruction into the ELPM pseudo instruction,
+ // since it may be reused by other ELPM pseudo instructions.
+ SDValue NC = CurDAG->getTargetConstant(ProgMemBank, DL, MVT::i8);
+ auto *NP = CurDAG->getMachineNode(AVR::LDIRdK, DL, MVT::i8, NC);
+ ResNode = CurDAG->getMachineNode(AVR::ELPMBRdZ, DL, MVT::i8, MVT::Other,
+ Ptr, SDValue(NP, 0));
+ }
break;
case MVT::i16:
- ResNode = CurDAG->getMachineNode(AVR::LPMWRdZ, DL, MVT::i16, MVT::Other,
- Ptr, RegZ);
- ReplaceUses(SDValue(N, 1), SDValue(ResNode, 1));
+ if (ProgMemBank == 0) {
+ ResNode =
+ CurDAG->getMachineNode(AVR::LPMWRdZ, DL, MVT::i16, MVT::Other, Ptr);
+ } else {
+ // Do not combine the LDI instruction into the ELPM pseudo instruction,
+ // since LDI requires the destination register in range R16~R31.
+ SDValue NC = CurDAG->getTargetConstant(ProgMemBank, DL, MVT::i8);
+ auto *NP = CurDAG->getMachineNode(AVR::LDIRdK, DL, MVT::i8, NC);
+ ResNode = CurDAG->getMachineNode(AVR::ELPMWRdZ, DL, MVT::i16,
+ MVT::Other, Ptr, SDValue(NP, 0));
+ }
break;
default:
llvm_unreachable("Unsupported VT!");
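A compact restatement of selectIndexedProgMemLoad above, not part of the patch: a post-increment opcode is only chosen for non-extending POST_INC loads whose constant offset matches the access width, and the ELPM variants are used when the value lives in an extended bank (Bank > 0).

#include <cstdio>

// Returns nullptr when no indexed form applies and the caller falls back to
// a plain LPM/ELPM load.
static const char *pickIndexedProgMemLoad(unsigned Bits, int Offs, int Bank) {
  if (Bits == 8 && Offs == 1)
    return Bank > 0 ? "ELPMBRdZPi" : "LPMRdZPi";
  if (Bits == 16 && Offs == 2)
    return Bank > 0 ? "ELPMWRdZPi" : "LPMWRdZPi";
  return nullptr;
}

int main() {
  std::printf("%s\n", pickIndexedProgMemLoad(8, 1, 0));  // LPMRdZPi
  std::printf("%s\n", pickIndexedProgMemLoad(16, 2, 3)); // ELPMWRdZPi
  return 0;
}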
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index a6f2afb87102..a58fedf6cd36 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -359,6 +359,11 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
Victim = DAG.getNode(AVRISD::LSRBN, dl, VT, Victim,
DAG.getConstant(7, dl, VT));
ShiftAmount = 0;
+ } else if (Op.getOpcode() == ISD::SRA && ShiftAmount == 6) {
+ // Optimize ASR when ShiftAmount == 6.
+ Victim = DAG.getNode(AVRISD::ASRBN, dl, VT, Victim,
+ DAG.getConstant(6, dl, VT));
+ ShiftAmount = 0;
} else if (Op.getOpcode() == ISD::SRA && ShiftAmount == 7) {
// Optimize ASR when ShiftAmount == 7.
Victim = DAG.getNode(AVRISD::ASRBN, dl, VT, Victim,
@@ -387,16 +392,22 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
Victim = DAG.getNode(AVRISD::LSLWN, dl, VT, Victim,
DAG.getConstant(8, dl, VT));
ShiftAmount -= 8;
+ // Only operate on the higher byte for remaining shift bits.
+ Opc8 = AVRISD::LSLHI;
break;
case ISD::SRL:
Victim = DAG.getNode(AVRISD::LSRWN, dl, VT, Victim,
DAG.getConstant(8, dl, VT));
ShiftAmount -= 8;
+ // Only operate on the lower byte for remaining shift bits.
+ Opc8 = AVRISD::LSRLO;
break;
case ISD::SRA:
Victim = DAG.getNode(AVRISD::ASRWN, dl, VT, Victim,
DAG.getConstant(8, dl, VT));
ShiftAmount -= 8;
+ // Only operate on the lower byte for remaining shift bits.
+ Opc8 = AVRISD::ASRLO;
break;
default:
break;
@@ -407,11 +418,22 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
Victim = DAG.getNode(AVRISD::LSLWN, dl, VT, Victim,
DAG.getConstant(12, dl, VT));
ShiftAmount -= 12;
+ // Only operate on the higher byte for remaining shift bits.
+ Opc8 = AVRISD::LSLHI;
break;
case ISD::SRL:
Victim = DAG.getNode(AVRISD::LSRWN, dl, VT, Victim,
DAG.getConstant(12, dl, VT));
ShiftAmount -= 12;
+ // Only operate on the lower byte for remaining shift bits.
+ Opc8 = AVRISD::LSRLO;
+ break;
+ case ISD::SRA:
+ Victim = DAG.getNode(AVRISD::ASRWN, dl, VT, Victim,
+ DAG.getConstant(8, dl, VT));
+ ShiftAmount -= 8;
+ // Only operate on the lower byte for remaining shift bits.
+ Opc8 = AVRISD::ASRLO;
break;
default:
break;
@@ -874,7 +896,8 @@ bool AVRTargetLowering::isLegalAddressingMode(const DataLayout &DL,
// Allow reg+<6bit> offset.
if (Offs < 0)
Offs = -Offs;
- if (AM.BaseGV == 0 && AM.HasBaseReg && AM.Scale == 0 && isUInt<6>(Offs)) {
+ if (AM.BaseGV == nullptr && AM.HasBaseReg && AM.Scale == 0 &&
+ isUInt<6>(Offs)) {
return true;
}
@@ -1169,7 +1192,7 @@ SDValue AVRTargetLowering::LowerFormalArguments(
llvm_unreachable("Unknown argument type!");
}
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
// :NOTE: Clang should not promote any i8 into i16 but for safety the
@@ -1672,6 +1695,18 @@ MachineBasicBlock *AVRTargetLowering::insertMul(MachineInstr &MI,
return BB;
}
+// Insert a read from R1, which almost always contains the value 0.
+MachineBasicBlock *
+AVRTargetLowering::insertCopyR1(MachineInstr &MI, MachineBasicBlock *BB) const {
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ MachineBasicBlock::iterator I(MI);
+ BuildMI(*BB, I, MI.getDebugLoc(), TII.get(AVR::COPY))
+ .add(MI.getOperand(0))
+ .addReg(AVR::R1);
+ MI.eraseFromParent();
+ return BB;
+}
+
MachineBasicBlock *
AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const {
@@ -1694,6 +1729,8 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case AVR::MULRdRr:
case AVR::MULSRdRr:
return insertMul(MI, MBB);
+ case AVR::CopyR1:
+ return insertCopyR1(MI, MBB);
}
assert((Opc == AVR::Select16 || Opc == AVR::Select8) &&
@@ -2012,7 +2049,7 @@ void AVRTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
- SDValue Result(0, 0);
+ SDValue Result;
SDLoc DL(Op);
EVT Ty = Op.getValueType();
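A host-side check of the shift decomposition in LowerShifts above, not part of the patch: once the word has been shifted by 8 with LSLWN, the remaining steps only need to touch the high byte, which is what the new LSLHI node expresses (LSRLO/ASRLO are the analogous low-byte forms). Plain C shift semantics stand in for the AVR pseudos here.

#include <cassert>
#include <cstdint>

// LSLWN 8: move the low byte into the high byte.
static uint16_t lslwn8(uint16_t V) { return static_cast<uint16_t>(V << 8); }

// LSLHI: shift only the high byte left by one, leaving the low byte alone.
static uint16_t lslhi(uint16_t V) {
  const uint8_t Hi = static_cast<uint8_t>((V >> 8) << 1);
  return static_cast<uint16_t>((Hi << 8) | (V & 0xFF));
}

int main() {
  // A 16-bit shl by 11 == LSLWN 8 followed by three LSLHI steps.
  for (uint32_t V = 0; V <= 0xFFFF; ++V) {
    uint16_t X = lslwn8(static_cast<uint16_t>(V));
    for (int I = 0; I < 3; ++I)
      X = lslhi(X);
    assert(X == static_cast<uint16_t>(V << 11));
  }
  return 0;
}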
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.h b/llvm/lib/Target/AVR/AVRISelLowering.h
index 3ae036b66bcb..116417b61566 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.h
+++ b/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -38,12 +38,15 @@ enum NodeType {
LSL, ///< Logical shift left.
LSLBN, ///< Byte logical shift left N bits.
LSLWN, ///< Word logical shift left N bits.
+ LSLHI, ///< Higher 8-bit of word logical shift left.
LSR, ///< Logical shift right.
LSRBN, ///< Byte logical shift right N bits.
LSRWN, ///< Word logical shift right N bits.
+ LSRLO, ///< Lower 8-bit of word logical shift right.
ASR, ///< Arithmetic shift right.
ASRBN, ///< Byte arithmetic shift right N bits.
ASRWN, ///< Word arithmetic shift right N bits.
+ ASRLO, ///< Lower 8-bit of word arithmetic shift right.
ROR, ///< Bit rotate right.
ROL, ///< Bit rotate left.
LSLLOOP, ///< A loop of single logical shift left instructions.
@@ -184,6 +187,8 @@ protected:
private:
MachineBasicBlock *insertShift(MachineInstr &MI, MachineBasicBlock *BB) const;
MachineBasicBlock *insertMul(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *insertCopyR1(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
index 51060018a5ca..ac52c47f93d5 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -304,11 +304,11 @@ bool AVRInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
}
Cond.clear();
- FBB = 0;
+ FBB = nullptr;
// Delete the JMP if it's equivalent to a fall-through.
if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
- TBB = 0;
+ TBB = nullptr;
I->eraseFromParent();
I = MBB.end();
UnCondBrIter = MBB.end();
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index c7f423292da0..2b96dc0b833a 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -60,6 +60,9 @@ def AVRlsr : SDNode<"AVRISD::LSR", SDTIntUnaryOp>;
def AVRrol : SDNode<"AVRISD::ROL", SDTIntUnaryOp>;
def AVRror : SDNode<"AVRISD::ROR", SDTIntUnaryOp>;
def AVRasr : SDNode<"AVRISD::ASR", SDTIntUnaryOp>;
+def AVRlslhi : SDNode<"AVRISD::LSLHI", SDTIntUnaryOp>;
+def AVRlsrlo : SDNode<"AVRISD::LSRLO", SDTIntUnaryOp>;
+def AVRasrlo : SDNode<"AVRISD::ASRLO", SDTIntUnaryOp>;
def AVRlslbn : SDNode<"AVRISD::LSLBN", SDTIntBinOp>;
def AVRlsrbn : SDNode<"AVRISD::LSRBN", SDTIntBinOp>;
def AVRasrbn : SDNode<"AVRISD::ASRBN", SDTIntBinOp>;
@@ -1391,7 +1394,7 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
// ldd Rd, P+q
// ldd Rd+1, P+q+1
let Constraints = "@earlyclobber $dst" in def LDDWRdPtrQ
- : Pseudo<(outs DREGS_WITHOUT_YZ_WORKAROUND
+ : Pseudo<(outs DREGS
: $dst),
(ins memri
: $memri),
@@ -1699,21 +1702,34 @@ let mayLoad = 1, hasSideEffects = 0 in {
: F16<0b1001010111011000, (outs), (ins), "elpm", []>,
Requires<[HasELPM]>;
- def ELPMRdZ : FLPMX<1, 0,
- (outs GPR8
- : $dst),
- (ins ZREG
- : $z),
+ def ELPMRdZ : FLPMX<1, 0, (outs GPR8:$dst), (ins ZREG:$z),
"elpm\t$dst, $z", []>,
Requires<[HasELPMX]>;
- let Defs = [R31R30] in def ELPMRdZPi : FLPMX<1, 1,
- (outs GPR8
- : $dst),
- (ins ZREG
- : $z),
- "elpm\t$dst, $z+", []>,
- Requires<[HasELPMX]>;
+ let Defs = [R31R30] in {
+ def ELPMRdZPi : FLPMX<1, 1, (outs GPR8:$dst), (ins ZREG:$z),
+ "elpm\t$dst, $z+", []>,
+ Requires<[HasELPMX]>;
+ }
+
+ // These pseudos are combinations of the OUT and ELPM instructions.
+ let Defs = [R31R30], hasSideEffects = 1 in {
+ def ELPMBRdZ : Pseudo<(outs GPR8:$dst), (ins ZREG:$z, LD8:$p),
+ "elpmb\t$dst, $z, $p", []>,
+ Requires<[HasELPMX]>;
+
+ def ELPMWRdZ : Pseudo<(outs DREGS:$dst), (ins ZREG:$z, LD8:$p),
+ "elpmw\t$dst, $z, $p", []>,
+ Requires<[HasELPMX]>;
+
+ def ELPMBRdZPi : Pseudo<(outs GPR8:$dst), (ins ZREG:$z, LD8:$p),
+ "elpmb\t$dst, $z+, $p", []>,
+ Requires<[HasELPMX]>;
+
+ def ELPMWRdZPi : Pseudo<(outs DREGS:$dst), (ins ZREG:$z, LD8:$p),
+ "elpmw\t$dst, $z+, $p", []>,
+ Requires<[HasELPMX]>;
+ }
}
// Store program memory operations.
@@ -1848,6 +1864,9 @@ let Constraints = "$src = $rd", Defs = [SREG] in {
: $src)),
(implicit SREG)]>;
+ def LSLWHiRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "lslwhi\t$rd",
+ [(set i16:$rd, (AVRlslhi i16:$src)), (implicit SREG)]>;
+
def LSLWNRd : Pseudo<(outs DLDREGS
: $rd),
(ins DREGS
@@ -1895,6 +1914,9 @@ let Constraints = "$src = $rd", Defs = [SREG] in {
: $src)),
(implicit SREG)]>;
+ def LSRWLoRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "lsrwlo\t$rd",
+ [(set i16:$rd, (AVRlsrlo i16:$src)), (implicit SREG)]>;
+
def LSRWNRd : Pseudo<(outs DLDREGS
: $rd),
(ins DREGS
@@ -1968,6 +1990,9 @@ let Constraints = "$src = $rd", Defs = [SREG] in {
: $src)),
(implicit SREG)]>;
+ def ASRWLoRd : Pseudo<(outs DREGS:$rd), (ins DREGS:$src), "asrwlo\t$rd",
+ [(set i16:$rd, (AVRasrlo i16:$src)), (implicit SREG)]>;
+
def ROLBRd : Pseudo<(outs GPR8
: $rd),
(ins GPR8
@@ -2365,6 +2390,10 @@ def Asr16 : ShiftPseudo<(outs DREGS
: $src, i8
: $cnt))]>;
+// Lowered to a copy from R1, which contains the value zero.
+let usesCustomInserter = 1 in
+def CopyR1 : Pseudo<(outs GPR8:$rd), (ins), "clrz\t$rd", [(set i8:$rd, 0)]>;
+
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.cpp b/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
index 1886debaf492..5dd7f5c55695 100644
--- a/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -44,10 +44,7 @@ AVRRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t *
AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
- const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
-
- return AFI->isInterruptOrSignalHandler() ? CSR_Interrupts_RegMask
- : CSR_Normal_RegMask;
+ return CSR_Normal_RegMask;
}
BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.h b/llvm/lib/Target/AVR/AVRRegisterInfo.h
index fa27d9283209..2c5647b52c1c 100644
--- a/llvm/lib/Target/AVR/AVRRegisterInfo.h
+++ b/llvm/lib/Target/AVR/AVRRegisterInfo.h
@@ -27,7 +27,7 @@ public:
public:
const uint16_t *
- getCalleeSavedRegs(const MachineFunction *MF = 0) const override;
+ getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const override;
BitVector getReservedRegs(const MachineFunction &MF) const override;
@@ -39,7 +39,7 @@ public:
/// Stack Frame Processing Methods
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
- RegScavenger *RS = NULL) const override;
+ RegScavenger *RS = nullptr) const override;
Register getFrameRegister(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.td b/llvm/lib/Target/AVR/AVRRegisterInfo.td
index bb4e86ca0536..c5fda788fe4d 100644
--- a/llvm/lib/Target/AVR/AVRRegisterInfo.td
+++ b/llvm/lib/Target/AVR/AVRRegisterInfo.td
@@ -178,26 +178,6 @@ def DREGSMOVW : RegisterClass<"AVR", [i16], 8,
R29R28, R17R16, R15R14, R13R12, R11R10, R9R8,
R7R6, R5R4, R3R2, R1R0)>;
-// The 16-bit DREGS register class, excluding the Z pointer register.
-//
-// This is used by instructions which cause high pointer register
-// contention which leads to an assertion in the register allocator.
-//
-// There is no technical reason why instructions that use this class
-// cannot use Z; it's simply a workaround a regalloc bug.
-//
-// More information can be found in PR39553.
-def DREGS_WITHOUT_YZ_WORKAROUND
- : RegisterClass<"AVR", [i16], 8,
- (
- // Return value and arguments.
- add R25R24, R19R18, R21R20, R23R22,
- // Scratch registers.
- R27R26,
- // Callee saved registers.
- R17R16, R15R14, R13R12, R11R10, R9R8, R7R6, R5R4, R3R2,
- R1R0)>;
-
// 16-bit register class for immediate instructions.
def DLDREGS : RegisterClass<"AVR", [i16], 8,
(
diff --git a/llvm/lib/Target/AVR/AVRSubtarget.cpp b/llvm/lib/Target/AVR/AVRSubtarget.cpp
index 990e1c57e63f..8a5481423e9f 100644
--- a/llvm/lib/Target/AVR/AVRSubtarget.cpp
+++ b/llvm/lib/Target/AVR/AVRSubtarget.cpp
@@ -40,8 +40,7 @@ AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU,
m_hasTinyEncoding(false), m_hasMemMappedGPR(false),
m_FeatureSetDummy(false),
- InstrInfo(), FrameLowering(),
- TLInfo(TM, initializeSubtargetDependencies(CPU, FS, TM)), TSInfo() {
+ TLInfo(TM, initializeSubtargetDependencies(CPU, FS, TM)) {
// Parse features string.
ParseSubtargetFeatures(CPU, /*TuneCPU*/ CPU, FS);
}
diff --git a/llvm/lib/Target/AVR/AVRSubtarget.h b/llvm/lib/Target/AVR/AVRSubtarget.h
index 90b9cd4da7c1..f8ca191b1868 100644
--- a/llvm/lib/Target/AVR/AVRSubtarget.h
+++ b/llvm/lib/Target/AVR/AVRSubtarget.h
@@ -91,6 +91,9 @@ public:
return ELFArch;
}
+ /// Get the address of the RAMPZ I/O register, which selects the flash bank
+ /// used by ELPM.
+ int getIORegRAMPZ() const { return 0x3b; }
+
private:
/// The ELF e_flags architecture.
unsigned ELFArch;
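For context (general AVR behaviour, not something this patch defines): RAMPZ at I/O address 0x3b, as returned by getIORegRAMPZ above, supplies the upper bits of the 24-bit flash byte address that ELPM reads through the 16-bit Z pointer.

#include <cstdint>
#include <cstdio>

// RAMPZ provides bits 16..23 and the Z pointer bits 0..15 of the address.
static uint32_t elpmByteAddress(uint8_t RampZ, uint16_t Z) {
  return (static_cast<uint32_t>(RampZ) << 16) | Z;
}

int main() {
  std::printf("0x%06x\n", (unsigned)elpmByteAddress(2, 0x1234)); // 0x021234
  return 0;
}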
diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
index 65740f7c2306..22b9ba3ece07 100644
--- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -70,7 +70,6 @@ public:
bool addInstSelector() override;
void addPreSched2() override;
void addPreEmitPass() override;
- void addPreRegAlloc() override;
};
} // namespace
@@ -118,11 +117,6 @@ bool AVRPassConfig::addInstSelector() {
return false;
}
-void AVRPassConfig::addPreRegAlloc() {
- // Create the dynalloc SP save/restore pass to handle variable sized allocas.
- addPass(createAVRDynAllocaSRPass());
-}
-
void AVRPassConfig::addPreSched2() {
addPass(createAVRRelaxMemPass());
addPass(createAVRExpandPseudoPass());
diff --git a/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp b/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
index c7715ca1f51b..fe8e863be1a3 100644
--- a/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
+++ b/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "AVRTargetObjectFile.h"
+#include "AVRTargetMachine.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/DerivedTypes.h"
@@ -22,14 +23,60 @@ void AVRTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
Base::Initialize(Ctx, TM);
ProgmemDataSection =
Ctx.getELFSection(".progmem.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ Progmem1DataSection =
+ Ctx.getELFSection(".progmem1.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ Progmem2DataSection =
+ Ctx.getELFSection(".progmem2.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ Progmem3DataSection =
+ Ctx.getELFSection(".progmem3.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ Progmem4DataSection =
+ Ctx.getELFSection(".progmem4.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+ Progmem5DataSection =
+ Ctx.getELFSection(".progmem5.data", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
}
MCSection *AVRTargetObjectFile::SelectSectionForGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
- // Global values in flash memory are placed in the progmem.data section
+ // Global values in flash memory are placed in the progmem*.data section
// unless they already have a user assigned section.
- if (AVR::isProgramMemoryAddress(GO) && !GO->hasSection() && Kind.isReadOnly())
- return ProgmemDataSection;
+ const auto &AVRTM = static_cast<const AVRTargetMachine &>(TM);
+ if (AVR::isProgramMemoryAddress(GO) && !GO->hasSection() &&
+ Kind.isReadOnly()) {
+ // The AVR subtarget should support LPM to access section '.progmem*.data'.
+ if (!AVRTM.getSubtargetImpl()->hasLPM()) {
+ // TODO: Get the global object's location in source file.
+ getContext().reportError(
+ SMLoc(),
+ "Current AVR subtarget does not support accessing program memory");
+ return Base::SelectSectionForGlobal(GO, Kind, TM);
+ }
+ // The AVR subtarget should support ELPM to access section
+ // '.progmem[1|2|3|4|5].data'.
+ if (!AVRTM.getSubtargetImpl()->hasELPM() &&
+ AVR::getAddressSpace(GO) != AVR::ProgramMemory) {
+ // TODO: Get the global object's location in source file.
+ getContext().reportError(SMLoc(),
+ "Current AVR subtarget does not support "
+ "accessing extended program memory");
+ return ProgmemDataSection;
+ }
+ switch (AVR::getAddressSpace(GO)) {
+ case AVR::ProgramMemory: // address space 1
+ return ProgmemDataSection;
+ case AVR::ProgramMemory1: // address space 2
+ return Progmem1DataSection;
+ case AVR::ProgramMemory2: // address space 3
+ return Progmem2DataSection;
+ case AVR::ProgramMemory3: // address space 4
+ return Progmem3DataSection;
+ case AVR::ProgramMemory4: // address space 5
+ return Progmem4DataSection;
+ case AVR::ProgramMemory5: // address space 6
+ return Progmem5DataSection;
+ default:
+ llvm_unreachable("unexpected program memory index");
+ }
+ }
// Otherwise, we work the same way as ELF.
return Base::SelectSectionForGlobal(GO, Kind, TM);
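The switch above reduces to a simple mapping from the address-space index to one of the sections registered in Initialize. A sketch of that mapping, not part of the patch:

#include <cstdio>
#include <string>

// Address space 1 -> .progmem.data, address spaces 2..6 -> .progmem1.data
// through .progmem5.data; anything else is left to the default ELF logic.
static std::string progmemSectionFor(unsigned AddrSpace) {
  if (AddrSpace == 1)
    return ".progmem.data";
  if (AddrSpace >= 2 && AddrSpace <= 6)
    return ".progmem" + std::to_string(AddrSpace - 1) + ".data";
  return std::string(); // not program memory
}

int main() {
  for (unsigned AS = 0; AS <= 6; ++AS)
    std::printf("addrspace(%u) -> %s\n", AS, progmemSectionFor(AS).c_str());
  return 0;
}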
diff --git a/llvm/lib/Target/AVR/AVRTargetObjectFile.h b/llvm/lib/Target/AVR/AVRTargetObjectFile.h
index 53d8510d9a21..609849b44029 100644
--- a/llvm/lib/Target/AVR/AVRTargetObjectFile.h
+++ b/llvm/lib/Target/AVR/AVRTargetObjectFile.h
@@ -25,6 +25,11 @@ public:
private:
MCSection *ProgmemDataSection;
+ MCSection *Progmem1DataSection;
+ MCSection *Progmem2DataSection;
+ MCSection *Progmem3DataSection;
+ MCSection *Progmem4DataSection;
+ MCSection *Progmem5DataSection;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index 95ecd28200ba..f19e7840eb31 100644
--- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -107,13 +107,13 @@ class AVROperand : public MCParsedAsmOperand {
public:
AVROperand(StringRef Tok, SMLoc const &S)
- : Base(), Kind(k_Token), Tok(Tok), Start(S), End(S) {}
+ : Kind(k_Token), Tok(Tok), Start(S), End(S) {}
AVROperand(unsigned Reg, SMLoc const &S, SMLoc const &E)
- : Base(), Kind(k_Register), RegImm({Reg, nullptr}), Start(S), End(E) {}
+ : Kind(k_Register), RegImm({Reg, nullptr}), Start(S), End(E) {}
AVROperand(MCExpr const *Imm, SMLoc const &S, SMLoc const &E)
- : Base(), Kind(k_Immediate), RegImm({0, Imm}), Start(S), End(E) {}
+ : Kind(k_Immediate), RegImm({0, Imm}), Start(S), End(E) {}
AVROperand(unsigned Reg, MCExpr const *Imm, SMLoc const &S, SMLoc const &E)
- : Base(), Kind(k_Memri), RegImm({Reg, Imm}), Start(S), End(E) {}
+ : Kind(k_Memri), RegImm({Reg, Imm}), Start(S), End(E) {}
struct RegisterImmediate {
unsigned Reg;
@@ -281,7 +281,7 @@ bool AVRAsmParser::invalidOperand(SMLoc const &Loc,
OperandVector const &Operands,
uint64_t const &ErrorInfo) {
SMLoc ErrorLoc = Loc;
- char const *Diag = 0;
+ char const *Diag = nullptr;
if (ErrorInfo != ~0U) {
if (ErrorInfo >= Operands.size()) {
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index a3a4d63932c0..3624ade854c0 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -47,7 +47,7 @@ static void signed_width(unsigned Width, uint64_t Value,
" to " + std::to_string(Max) + ")";
if (Ctx) {
- Ctx->reportFatalError(Fixup.getLoc(), Diagnostic);
+ Ctx->reportError(Fixup.getLoc(), Diagnostic);
} else {
llvm_unreachable(Diagnostic.c_str());
}
@@ -66,7 +66,7 @@ static void unsigned_width(unsigned Width, uint64_t Value,
" (expected an integer in the range 0 to " + std::to_string(Max) + ")";
if (Ctx) {
- Ctx->reportFatalError(Fixup.getLoc(), Diagnostic);
+ Ctx->reportError(Fixup.getLoc(), Diagnostic);
} else {
llvm_unreachable(Diagnostic.c_str());
}
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 50298bf5e943..697deb117bcb 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -101,7 +101,7 @@ struct BPFOperand : public MCParsedAsmOperand {
ImmOp Imm;
};
- BPFOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+ BPFOperand(KindTy K) : Kind(K) {}
public:
BPFOperand(const BPFOperand &o) : MCParsedAsmOperand() {
diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
index ab7e848409d9..46141e69d9d4 100644
--- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -1002,7 +1002,7 @@ bool BPFAbstractMemberAccess::transformGEPChain(CallInst *Call,
VarType = Type::getInt64Ty(BB->getContext()); // 64bit ptr or enum value
GV = new GlobalVariable(*M, VarType, false, GlobalVariable::ExternalLinkage,
- NULL, AccessKey);
+ nullptr, AccessKey);
GV->addAttribute(BPFCoreSharedInfo::AmaAttr);
GV->setMetadata(LLVMContext::MD_preserve_access_index, TypeMeta);
GEPGlobals[AccessKey] = GV;
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index 90723ac04f64..0587cb0e16e3 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -325,7 +325,7 @@ SDValue BPFTargetLowering::LowerFormalArguments(
default: {
errs() << "LowerFormalArguments Unhandled argument type: "
<< RegVT.getEVTString() << '\n';
- llvm_unreachable(0);
+ llvm_unreachable(nullptr);
}
case MVT::i32:
case MVT::i64:
diff --git a/llvm/lib/Target/BPF/BPFMIChecking.cpp b/llvm/lib/Target/BPF/BPFMIChecking.cpp
index eb8c48ac49de..2bc2302cf55c 100644
--- a/llvm/lib/Target/BPF/BPFMIChecking.cpp
+++ b/llvm/lib/Target/BPF/BPFMIChecking.cpp
@@ -41,7 +41,7 @@ private:
// Initialize class variables.
void initialize(MachineFunction &MFParm);
- bool processAtomicInsts(void);
+ bool processAtomicInsts();
public:
@@ -151,7 +151,7 @@ static bool hasLiveDefs(const MachineInstr &MI, const TargetRegisterInfo *TRI) {
return false;
}
-bool BPFMIPreEmitChecking::processAtomicInsts(void) {
+bool BPFMIPreEmitChecking::processAtomicInsts() {
for (MachineBasicBlock &MBB : *MF) {
for (MachineInstr &MI : MBB) {
if (MI.getOpcode() != BPF::XADDW &&
diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
index 354980e4bf3c..7f69c8a63443 100644
--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -56,8 +56,8 @@ private:
bool isInsnFrom32Def(MachineInstr *DefInsn);
bool isPhiFrom32Def(MachineInstr *MovMI);
bool isMovFrom32Def(MachineInstr *MovMI);
- bool eliminateZExtSeq(void);
- bool eliminateZExt(void);
+ bool eliminateZExtSeq();
+ bool eliminateZExt();
std::set<MachineInstr *> PhiInsns;
@@ -172,7 +172,7 @@ bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI)
return true;
}
-bool BPFMIPeephole::eliminateZExtSeq(void) {
+bool BPFMIPeephole::eliminateZExtSeq() {
MachineInstr* ToErase = nullptr;
bool Eliminated = false;
@@ -240,7 +240,7 @@ bool BPFMIPeephole::eliminateZExtSeq(void) {
return Eliminated;
}
-bool BPFMIPeephole::eliminateZExt(void) {
+bool BPFMIPeephole::eliminateZExt() {
MachineInstr* ToErase = nullptr;
bool Eliminated = false;
@@ -312,7 +312,7 @@ private:
// Initialize class variables.
void initialize(MachineFunction &MFParm);
- bool eliminateRedundantMov(void);
+ bool eliminateRedundantMov();
public:
@@ -334,7 +334,7 @@ void BPFMIPreEmitPeephole::initialize(MachineFunction &MFParm) {
LLVM_DEBUG(dbgs() << "*** BPF PreEmit peephole pass ***\n\n");
}
-bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) {
+bool BPFMIPreEmitPeephole::eliminateRedundantMov() {
MachineInstr* ToErase = nullptr;
bool Eliminated = false;
@@ -405,7 +405,7 @@ private:
// Initialize class variables.
void initialize(MachineFunction &MFParm);
- bool eliminateTruncSeq(void);
+ bool eliminateTruncSeq();
public:
@@ -452,7 +452,7 @@ void BPFMIPeepholeTruncElim::initialize(MachineFunction &MFParm) {
// are 32-bit registers, but later on, kernel verifier will rewrite
// it with 64-bit value. Therefore, truncating the value after the
// load will result in incorrect code.
-bool BPFMIPeepholeTruncElim::eliminateTruncSeq(void) {
+bool BPFMIPeepholeTruncElim::eliminateTruncSeq() {
MachineInstr* ToErase = nullptr;
bool Eliminated = false;
diff --git a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
index 7e829ea43e89..b4232875383c 100644
--- a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
+++ b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
@@ -55,7 +55,7 @@ private:
// Initialize class variables.
void initialize(MachineFunction &MFParm);
- bool removeLD(void);
+ bool removeLD();
void processCandidate(MachineRegisterInfo *MRI, MachineBasicBlock &MBB,
MachineInstr &MI, Register &SrcReg, Register &DstReg,
const GlobalValue *GVal, bool IsAma);
diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
index 36237b2fc4fd..6dfb7dc39922 100644
--- a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
+++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
@@ -105,10 +105,10 @@ static bool BPFPreserveDITypeImpl(Function &F) {
BasicBlock *BB = Call->getParent();
IntegerType *VarType = Type::getInt64Ty(BB->getContext());
- std::string GVName = BaseName + std::to_string(Count) + "$" +
- std::to_string(Reloc);
+ std::string GVName =
+ BaseName + std::to_string(Count) + "$" + std::to_string(Reloc);
GlobalVariable *GV = new GlobalVariable(
- *M, VarType, false, GlobalVariable::ExternalLinkage, NULL, GVName);
+ *M, VarType, false, GlobalVariable::ExternalLinkage, nullptr, GVName);
GV->addAttribute(BPFCoreSharedInfo::TypeIdAttr);
GV->setMetadata(LLVMContext::MD_preserve_access_index, MD);
diff --git a/llvm/lib/Target/BPF/BPFSubtarget.cpp b/llvm/lib/Target/BPF/BPFSubtarget.cpp
index 77e3cd393f87..e4d98b85e58b 100644
--- a/llvm/lib/Target/BPF/BPFSubtarget.cpp
+++ b/llvm/lib/Target/BPF/BPFSubtarget.cpp
@@ -59,6 +59,6 @@ void BPFSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
BPFSubtarget::BPFSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
- : BPFGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), InstrInfo(),
+ : BPFGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
FrameLowering(initializeSubtargetDependencies(CPU, FS)),
TLInfo(TM, *this) {}
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index 0c510686a13b..d536aed1d211 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -1366,7 +1366,7 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) {
// Calculate symbol size
const DataLayout &DL = Global.getParent()->getDataLayout();
- uint32_t Size = DL.getTypeAllocSize(Global.getType()->getElementType());
+ uint32_t Size = DL.getTypeAllocSize(Global.getValueType());
DataSecEntries[std::string(SecName)]->addDataSecEntry(VarId,
Asm->getSymbol(&Global), Size);
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
index e0aeec989879..200c72a07ed6 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp
@@ -50,7 +50,7 @@ static void printExpr(const MCExpr *Expr, raw_ostream &O) {
void BPFInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O, const char *Modifier) {
- assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+ assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported");
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
O << getRegisterName(Op.getReg());
diff --git a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp
index 29b99a84a6cd..a62bd111cba9 100644
--- a/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp
+++ b/llvm/lib/Target/CSKY/AsmParser/CSKYAsmParser.cpp
@@ -303,6 +303,14 @@ public:
bool isRegSeq() const { return isRegSeqTemplate<CSKY::R0, CSKY::R31>(); }
+ bool isRegSeqV1() const {
+ return isRegSeqTemplate<CSKY::F0_32, CSKY::F15_32>();
+ }
+
+ bool isRegSeqV2() const {
+ return isRegSeqTemplate<CSKY::F0_32, CSKY::F31_32>();
+ }
+
static bool isLegalRegList(unsigned from, unsigned to) {
if (from == 0 && to == 0)
return true;
diff --git a/llvm/lib/Target/CSKY/CSKY.h b/llvm/lib/Target/CSKY/CSKY.h
index 357b1e96e606..401d6fa1a0a5 100644
--- a/llvm/lib/Target/CSKY/CSKY.h
+++ b/llvm/lib/Target/CSKY/CSKY.h
@@ -21,6 +21,9 @@ class CSKYTargetMachine;
class FunctionPass;
FunctionPass *createCSKYISelDag(CSKYTargetMachine &TM);
+FunctionPass *createCSKYConstantIslandPass();
+
+void initializeCSKYConstantIslandsPass(PassRegistry &);
} // namespace llvm
diff --git a/llvm/lib/Target/CSKY/CSKY.td b/llvm/lib/Target/CSKY/CSKY.td
index e26781ca6aa1..ddb7fe93706e 100644
--- a/llvm/lib/Target/CSKY/CSKY.td
+++ b/llvm/lib/Target/CSKY/CSKY.td
@@ -11,6 +11,40 @@ include "llvm/Target/Target.td"
//===----------------------------------------------------------------------===//
// CSKY subtarget features and instruction predicates.
//===----------------------------------------------------------------------===//
+def ModeHardFloat :
+ SubtargetFeature<"hard-float", "UseHardFloat",
+ "true", "Use hard floating point features">;
+def ModeHardFloatABI :
+ SubtargetFeature<"hard-float-abi", "UseHardFloatABI",
+ "true", "Use hard floating point ABI to pass args">;
+
+def FeatureFPUV2_SF
+ : SubtargetFeature<"fpuv2_sf", "HasFPUv2SingleFloat", "true",
+ "Enable FPUv2 single float instructions">;
+def HasFPUv2_SF : Predicate<"Subtarget->hasFPUv2SingleFloat()">,
+ AssemblerPredicate<(all_of FeatureFPUV2_SF),
+ "Enable FPUv2 single float instructions">;
+
+def FeatureFPUV2_DF
+ : SubtargetFeature<"fpuv2_df", "HasFPUv2DoubleFloat", "true",
+ "Enable FPUv2 double float instructions">;
+def HasFPUv2_DF : Predicate<"Subtarget->hasFPUv2DoubleFloat()">,
+ AssemblerPredicate<(all_of FeatureFPUV2_DF),
+ "Enable FPUv2 double float instructions">;
+
+def FeatureFPUV3_SF
+ : SubtargetFeature<"fpuv3_sf", "HasFPUv3SingleFloat", "true",
+ "Enable FPUv3 single float instructions">;
+def HasFPUv3_SF : Predicate<"Subtarget->hasFPUv3SingleFloat()">,
+ AssemblerPredicate<(all_of FeatureFPUV3_SF),
+ "Enable FPUv3 single float instructions">;
+
+def FeatureFPUV3_DF
+ : SubtargetFeature<"fpuv3_df", "HasFPUv3DoubleFloat", "true",
+ "Enable FPUv3 double float instructions">;
+def HasFPUv3_DF : Predicate<"Subtarget->hasFPUv3DoubleFloat()">,
+ AssemblerPredicate<(all_of FeatureFPUV3_DF),
+ "Enable FPUv3 double float instructions">;
def FeatureBTST16 : SubtargetFeature<"btst16", "HasBTST16", "true",
"Use the 16-bit btsti instruction">;
diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
index 85129f78e726..c8269eeacfdb 100644
--- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
+++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "CSKYAsmPrinter.h"
#include "CSKY.h"
+#include "CSKYConstantPoolValue.h"
#include "CSKYTargetMachine.h"
#include "MCTargetDesc/CSKYInstPrinter.h"
#include "MCTargetDesc/CSKYMCExpr.h"
@@ -38,6 +39,7 @@ CSKYAsmPrinter::CSKYAsmPrinter(llvm::TargetMachine &TM,
: AsmPrinter(TM, std::move(Streamer)), MCInstLowering(OutContext, *this) {}
bool CSKYAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ MCP = MF.getConstantPool();
Subtarget = &MF.getSubtarget<CSKYSubtarget>();
return AsmPrinter::runOnMachineFunction(MF);
}
@@ -56,16 +58,166 @@ void CSKYAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
// instructions) auto-generated.
#include "CSKYGenMCPseudoLowering.inc"
+void CSKYAsmPrinter::expandTLSLA(const MachineInstr *MI) {
+ const CSKYInstrInfo *TII = Subtarget->getInstrInfo();
+
+ DebugLoc DL = MI->getDebugLoc();
+
+ MCSymbol *PCLabel = OutContext.getOrCreateSymbol(
+ Twine(MAI->getPrivateGlobalPrefix()) + "PC" + Twine(getFunctionNumber()) +
+ "_" + Twine(MI->getOperand(3).getImm()));
+
+ OutStreamer->emitLabel(PCLabel);
+
+ auto Instr = BuildMI(*MF, DL, TII->get(CSKY::LRW32))
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(2));
+ MCInst LRWInst;
+ MCInstLowering.Lower(Instr, LRWInst);
+ EmitToStreamer(*OutStreamer, LRWInst);
+
+ Instr = BuildMI(*MF, DL, TII->get(CSKY::GRS32))
+ .add(MI->getOperand(1))
+ .addSym(PCLabel);
+ MCInst GRSInst;
+ MCInstLowering.Lower(Instr, GRSInst);
+ EmitToStreamer(*OutStreamer, GRSInst);
+ return;
+}
+
+void CSKYAsmPrinter::emitCustomConstantPool(const MachineInstr *MI) {
+
+ // This instruction represents a floating constant pool in the function.
+ // The first operand is the ID# for this instruction, the second is the
+ // index into the MachineConstantPool that this is, the third is the size
+ // in bytes of this constant pool entry.
+ // The required alignment is specified on the basic block holding this MI.
+ unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+ unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex();
+
+ // If this is the first entry of the pool, mark it.
+ if (!InConstantPool) {
+ OutStreamer->emitValueToAlignment(4);
+ InConstantPool = true;
+ }
+
+ OutStreamer->emitLabel(GetCPISymbol(LabelId));
+
+ const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+ if (MCPE.isMachineConstantPoolEntry())
+ emitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+ else
+ emitGlobalConstant(MF->getDataLayout(), MCPE.Val.ConstVal);
+ return;
+}
+
+void CSKYAsmPrinter::emitFunctionBodyEnd() {
+ // Make sure to terminate any constant pools that were at the end
+ // of the function.
+ if (!InConstantPool)
+ return;
+ InConstantPool = false;
+}
+
void CSKYAsmPrinter::emitInstruction(const MachineInstr *MI) {
// Do any auto-generated pseudo lowerings.
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
+ // If we just ended a constant pool, mark it as such.
+ if (InConstantPool && MI->getOpcode() != CSKY::CONSTPOOL_ENTRY) {
+ InConstantPool = false;
+ }
+
+ if (MI->getOpcode() == CSKY::PseudoTLSLA32)
+ return expandTLSLA(MI);
+
+ if (MI->getOpcode() == CSKY::CONSTPOOL_ENTRY)
+ return emitCustomConstantPool(MI);
+
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
}
+// Convert a CSKY-specific constant pool modifier into the associated
+// CSKYMCExpr variant kind.
+static CSKYMCExpr::VariantKind
+getModifierVariantKind(CSKYCP::CSKYCPModifier Modifier) {
+ switch (Modifier) {
+ case CSKYCP::NO_MOD:
+ return CSKYMCExpr::VK_CSKY_None;
+ case CSKYCP::ADDR:
+ return CSKYMCExpr::VK_CSKY_ADDR;
+ case CSKYCP::GOT:
+ return CSKYMCExpr::VK_CSKY_GOT;
+ case CSKYCP::GOTOFF:
+ return CSKYMCExpr::VK_CSKY_GOTOFF;
+ case CSKYCP::PLT:
+ return CSKYMCExpr::VK_CSKY_PLT;
+ case CSKYCP::TLSGD:
+ return CSKYMCExpr::VK_CSKY_TLSGD;
+ case CSKYCP::TLSLE:
+ return CSKYMCExpr::VK_CSKY_TLSLE;
+ case CSKYCP::TLSIE:
+ return CSKYMCExpr::VK_CSKY_TLSIE;
+ }
+ llvm_unreachable("Invalid CSKYCPModifier!");
+}
+
+void CSKYAsmPrinter::emitMachineConstantPoolValue(
+ MachineConstantPoolValue *MCPV) {
+ int Size = getDataLayout().getTypeAllocSize(MCPV->getType());
+ CSKYConstantPoolValue *CCPV = static_cast<CSKYConstantPoolValue *>(MCPV);
+ MCSymbol *MCSym;
+
+ if (CCPV->isBlockAddress()) {
+ const BlockAddress *BA =
+ cast<CSKYConstantPoolConstant>(CCPV)->getBlockAddress();
+ MCSym = GetBlockAddressSymbol(BA);
+ } else if (CCPV->isGlobalValue()) {
+ const GlobalValue *GV = cast<CSKYConstantPoolConstant>(CCPV)->getGV();
+ MCSym = getSymbol(GV);
+ } else if (CCPV->isMachineBasicBlock()) {
+ const MachineBasicBlock *MBB = cast<CSKYConstantPoolMBB>(CCPV)->getMBB();
+ MCSym = MBB->getSymbol();
+ } else if (CCPV->isJT()) {
+ signed JTI = cast<CSKYConstantPoolJT>(CCPV)->getJTI();
+ MCSym = GetJTISymbol(JTI);
+ } else {
+ assert(CCPV->isExtSymbol() && "unrecognized constant pool value");
+ StringRef Sym = cast<CSKYConstantPoolSymbol>(CCPV)->getSymbol();
+ MCSym = GetExternalSymbolSymbol(Sym);
+ }
+ // Create an MCSymbol for the reference.
+ const MCExpr *Expr =
+ MCSymbolRefExpr::create(MCSym, MCSymbolRefExpr::VK_None, OutContext);
+
+ if (CCPV->getPCAdjustment()) {
+
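+    // Subtract the PC label associated with this entry (e.g. ".LPC0_1" with
+    // the usual ELF private prefix) so the emitted value is PC-relative.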
+ MCSymbol *PCLabel = OutContext.getOrCreateSymbol(
+ Twine(MAI->getPrivateGlobalPrefix()) + "PC" +
+ Twine(getFunctionNumber()) + "_" + Twine(CCPV->getLabelID()));
+
+ const MCExpr *PCRelExpr = MCSymbolRefExpr::create(PCLabel, OutContext);
+ if (CCPV->mustAddCurrentAddress()) {
+      // We want "(<expr> - .)", but MC doesn't have a concept of the '.'
+      // label, so emit a temporary label at the current location and
+      // reference that instead.
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ OutStreamer->emitLabel(DotSym);
+ const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+ PCRelExpr = MCBinaryExpr::createSub(PCRelExpr, DotExpr, OutContext);
+ }
+ Expr = MCBinaryExpr::createSub(Expr, PCRelExpr, OutContext);
+ }
+
+  // Wrap the expression in the CSKY-specific modifier (GOT, PLT, TLS, ...).
+ Expr = CSKYMCExpr::create(Expr, getModifierVariantKind(CCPV->getModifier()),
+ OutContext);
+
+ OutStreamer->emitValue(Expr, Size);
+}
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYAsmPrinter() {
RegisterAsmPrinter<CSKYAsmPrinter> X(getTheCSKYTarget());
}
diff --git a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
index b30311e0ca64..04a253d349c8 100644
--- a/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
+++ b/llvm/lib/Target/CSKY/CSKYAsmPrinter.h
@@ -20,6 +20,15 @@ class LLVM_LIBRARY_VISIBILITY CSKYAsmPrinter : public AsmPrinter {
const CSKYSubtarget *Subtarget;
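+  /// True while a run of CONSTPOOL_ENTRY pseudos is being emitted; used to
+  /// align the inline constant pool and to detect where it ends.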
+ bool InConstantPool = false;
+
+  /// Keep a pointer to the MachineConstantPool of the current
+  /// MachineFunction.
+ MachineConstantPool *MCP;
+
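+  /// Lower the PseudoTLSLA32 pseudo instruction.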
+ void expandTLSLA(const MachineInstr *MI);
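+  /// Emit the payload of a CONSTPOOL_ENTRY pseudo: its label followed by the
+  /// constant data.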
+ void emitCustomConstantPool(const MachineInstr *MI);
+
public:
explicit CSKYAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer);
@@ -33,9 +42,16 @@ public:
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
+ void emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+
+ void emitFunctionBodyEnd() override;
+
void emitInstruction(const MachineInstr *MI) override;
bool runOnMachineFunction(MachineFunction &MF) override;
+
+  // We emit constant pools ourselves (see emitCustomConstantPool), so
+  // disable the default emission.
+  void emitConstantPool() override {}
};
} // end namespace llvm
diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
new file mode 100644
index 000000000000..3ac335e2ad9d
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
@@ -0,0 +1,1376 @@
+//===- CSKYConstantIslandPass.cpp - Emit PC Relative loads ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Loading constants inline is expensive on CSKY, so it is generally better to
+// place a constant nearby in code space, where it can be loaded with a simple
+// 16/32-bit load instruction such as lrw.
+//
+// The constants are not just numbers; they can also be addresses of functions
+// and labels. This can be particularly helpful in static relocation mode for
+// embedded non-linux targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKY.h"
+#include "CSKYConstantPoolValue.h"
+#include "CSKYMachineFunctionInfo.h"
+#include "CSKYSubtarget.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "CSKY-constant-islands"
+
+STATISTIC(NumCPEs, "Number of constpool entries");
+STATISTIC(NumSplit, "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed, "Number of cond branches fixed");
+STATISTIC(NumUBrFixed, "Number of uncond branches fixed");
+
+namespace {
+
+using Iter = MachineBasicBlock::iterator;
+using ReverseIter = MachineBasicBlock::reverse_iterator;
+
+/// CSKYConstantIslands - Due to limited PC-relative displacements, CSKY
+/// requires constant pool entries to be scattered among the instructions
+/// inside a function. To do this, it completely ignores the normal LLVM
+/// constant pool; instead, it places constants wherever it feels like with
+/// special instructions.
+///
+/// The terminology used in this pass includes:
+/// Islands - Clumps of constants placed in the function.
+/// Water - Potential places where an island could be formed.
+/// CPE - A constant pool entry that has been placed somewhere, which
+/// tracks a list of users.
+
+class CSKYConstantIslands : public MachineFunctionPass {
+ /// BasicBlockInfo - Information about the offset and size of a single
+ /// basic block.
+ struct BasicBlockInfo {
+ /// Offset - Distance from the beginning of the function to the beginning
+ /// of this basic block.
+ ///
+ /// Offsets are computed assuming worst case padding before an aligned
+ /// block. This means that subtracting basic block offsets always gives a
+ /// conservative estimate of the real distance which may be smaller.
+ ///
+ /// Because worst case padding is used, the computed offset of an aligned
+ /// block may not actually be aligned.
+ unsigned Offset = 0;
+
+ /// Size - Size of the basic block in bytes. If the block contains
+ /// inline assembly, this is a worst case estimate.
+ ///
+ /// The size does not include any alignment padding whether from the
+ /// beginning of the block, or from an aligned jump table at the end.
+ unsigned Size = 0;
+
+ BasicBlockInfo() = default;
+
+ unsigned postOffset() const { return Offset + Size; }
+ };
+
+ std::vector<BasicBlockInfo> BBInfo;
+
+ /// WaterList - A sorted list of basic blocks where islands could be placed
+ /// (i.e. blocks that don't fall through to the following block, due
+ /// to a return, unreachable, or unconditional branch).
+ std::vector<MachineBasicBlock *> WaterList;
+
+ /// NewWaterList - The subset of WaterList that was created since the
+ /// previous iteration by inserting unconditional branches.
+ SmallSet<MachineBasicBlock *, 4> NewWaterList;
+
+ using water_iterator = std::vector<MachineBasicBlock *>::iterator;
+
+ /// CPUser - One user of a constant pool, keeping the machine instruction
+ /// pointer, the constant pool being referenced, and the max displacement
+ /// allowed from the instruction to the CP. The HighWaterMark records the
+ /// highest basic block where a new CPEntry can be placed. To ensure this
+ /// pass terminates, the CP entries are initially placed at the end of the
+ /// function and then move monotonically to lower addresses. The
+ /// exception to this rule is when the current CP entry for a particular
+ /// CPUser is out of range, but there is another CP entry for the same
+ /// constant value in range. We want to use the existing in-range CP
+ /// entry, but if it later moves out of range, the search for new water
+ /// should resume where it left off. The HighWaterMark is used to record
+ /// that point.
+ struct CPUser {
+ MachineInstr *MI;
+ MachineInstr *CPEMI;
+ MachineBasicBlock *HighWaterMark;
+
+ private:
+ unsigned MaxDisp;
+
+ public:
+ bool NegOk;
+
+ CPUser(MachineInstr *Mi, MachineInstr *Cpemi, unsigned Maxdisp, bool Neg)
+ : MI(Mi), CPEMI(Cpemi), MaxDisp(Maxdisp), NegOk(Neg) {
+ HighWaterMark = CPEMI->getParent();
+ }
+
+ /// getMaxDisp - Returns the maximum displacement supported by MI.
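+    /// A 16-byte slack is subtracted as a safety margin against alignment
+    /// padding.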
+ unsigned getMaxDisp() const { return MaxDisp - 16; }
+
+ void setMaxDisp(unsigned Val) { MaxDisp = Val; }
+ };
+
+ /// CPUsers - Keep track of all of the machine instructions that use various
+ /// constant pools and their max displacement.
+ std::vector<CPUser> CPUsers;
+
+ /// CPEntry - One per constant pool entry, keeping the machine instruction
+ /// pointer, the constpool index, and the number of CPUser's which
+ /// reference this entry.
+ struct CPEntry {
+ MachineInstr *CPEMI;
+ unsigned CPI;
+ unsigned RefCount;
+
+ CPEntry(MachineInstr *Cpemi, unsigned Cpi, unsigned Rc = 0)
+ : CPEMI(Cpemi), CPI(Cpi), RefCount(Rc) {}
+ };
+
+ /// CPEntries - Keep track of all of the constant pool entry machine
+ /// instructions. For each original constpool index (i.e. those that
+ /// existed upon entry to this pass), it keeps a vector of entries.
+ /// Original elements are cloned as we go along; the clones are
+ /// put in the vector of the original element, but have distinct CPIs.
+ std::vector<std::vector<CPEntry>> CPEntries;
+
+ /// ImmBranch - One per immediate branch, keeping the machine instruction
+ /// pointer, conditional or unconditional, the max displacement,
+ /// and (if isCond is true) the corresponding unconditional branch
+ /// opcode.
+ struct ImmBranch {
+ MachineInstr *MI;
+ unsigned MaxDisp : 31;
+ bool IsCond : 1;
+ int UncondBr;
+
+ ImmBranch(MachineInstr *Mi, unsigned Maxdisp, bool Cond, int Ubr)
+ : MI(Mi), MaxDisp(Maxdisp), IsCond(Cond), UncondBr(Ubr) {}
+ };
+
+ /// ImmBranches - Keep track of all the immediate branch instructions.
+ ///
+ std::vector<ImmBranch> ImmBranches;
+
+ const CSKYSubtarget *STI = nullptr;
+ const CSKYInstrInfo *TII;
+ CSKYMachineFunctionInfo *MFI;
+ MachineFunction *MF = nullptr;
+ MachineConstantPool *MCP = nullptr;
+
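+  /// Next unique id to assign to a cloned constant pool entry.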
+ unsigned PICLabelUId;
+
+ void initPICLabelUId(unsigned UId) { PICLabelUId = UId; }
+
+ unsigned createPICLabelUId() { return PICLabelUId++; }
+
+public:
+ static char ID;
+
+ CSKYConstantIslands() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "CSKY Constant Islands"; }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ void doInitialPlacement(std::vector<MachineInstr *> &CPEMIs);
+ CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+ Align getCPEAlign(const MachineInstr &CPEMI);
+ void initializeFunctionInfo(const std::vector<MachineInstr *> &CPEMIs);
+ unsigned getOffsetOf(MachineInstr *MI) const;
+ unsigned getUserOffset(CPUser &) const;
+ void dumpBBs();
+
+ bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset, unsigned Disp,
+ bool NegativeOK);
+ bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+ const CPUser &U);
+
+ void computeBlockSize(MachineBasicBlock *MBB);
+ MachineBasicBlock *splitBlockBeforeInstr(MachineInstr &MI);
+ void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+ void adjustBBOffsetsAfter(MachineBasicBlock *BB);
+ bool decrementCPEReferenceCount(unsigned CPI, MachineInstr *CPEMI);
+ int findInRangeCPEntry(CPUser &U, unsigned UserOffset);
+ bool findAvailableWater(CPUser &U, unsigned UserOffset,
+ water_iterator &WaterIter);
+ void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
+ MachineBasicBlock *&NewMBB);
+ bool handleConstantPoolUser(unsigned CPUserIndex);
+ void removeDeadCPEMI(MachineInstr *CPEMI);
+ bool removeUnusedCPEntries();
+ bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+ MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+ bool DoDump = false);
+ bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water, CPUser &U,
+ unsigned &Growth);
+ bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+ bool fixupImmediateBr(ImmBranch &Br);
+ bool fixupConditionalBr(ImmBranch &Br);
+ bool fixupUnconditionalBr(ImmBranch &Br);
+};
+} // end anonymous namespace
+
+char CSKYConstantIslands::ID = 0;
+
+bool CSKYConstantIslands::isOffsetInRange(unsigned UserOffset,
+ unsigned TrialOffset,
+ const CPUser &U) {
+ return isOffsetInRange(UserOffset, TrialOffset, U.getMaxDisp(), U.NegOk);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// print block size and offset information - debugging
+LLVM_DUMP_METHOD void CSKYConstantIslands::dumpBBs() {
+ for (unsigned J = 0, E = BBInfo.size(); J != E; ++J) {
+ const BasicBlockInfo &BBI = BBInfo[J];
+    dbgs() << format("%08x %%bb.%u\t", BBI.Offset, J)
+ << format(" size=%#x\n", BBInfo[J].Size);
+ }
+}
+#endif
+
+bool CSKYConstantIslands::runOnMachineFunction(MachineFunction &Mf) {
+ MF = &Mf;
+ MCP = Mf.getConstantPool();
+ STI = &static_cast<const CSKYSubtarget &>(Mf.getSubtarget());
+
+ LLVM_DEBUG(dbgs() << "***** CSKYConstantIslands: "
+ << MCP->getConstants().size() << " CP entries, aligned to "
+ << MCP->getConstantPoolAlign().value() << " bytes *****\n");
+
+ TII = STI->getInstrInfo();
+ MFI = MF->getInfo<CSKYMachineFunctionInfo>();
+
+ // This pass invalidates liveness information when it splits basic blocks.
+ MF->getRegInfo().invalidateLiveness();
+
+ // Renumber all of the machine basic blocks in the function, guaranteeing that
+ // the numbers agree with the position of the block in the function.
+ MF->RenumberBlocks();
+
+ bool MadeChange = false;
+
+ // Perform the initial placement of the constant pool entries. To start with,
+ // we put them all at the end of the function.
+ std::vector<MachineInstr *> CPEMIs;
+ if (!MCP->isEmpty())
+ doInitialPlacement(CPEMIs);
+
+ /// The next UID to take is the first unused one.
+ initPICLabelUId(CPEMIs.size());
+
+ // Do the initial scan of the function, building up information about the
+ // sizes of each block, the location of all the water, and finding all of the
+ // constant pool users.
+ initializeFunctionInfo(CPEMIs);
+ CPEMIs.clear();
+ LLVM_DEBUG(dumpBBs());
+
+ /// Remove dead constant pool entries.
+ MadeChange |= removeUnusedCPEntries();
+
+ // Iteratively place constant pool entries and fix up branches until there
+ // is no change.
+ unsigned NoCPIters = 0, NoBRIters = 0;
+ while (true) {
+ LLVM_DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+ bool CPChange = false;
+ for (unsigned I = 0, E = CPUsers.size(); I != E; ++I)
+ CPChange |= handleConstantPoolUser(I);
+ if (CPChange && ++NoCPIters > 30)
+ report_fatal_error("Constant Island pass failed to converge!");
+ LLVM_DEBUG(dumpBBs());
+
+ // Clear NewWaterList now. If we split a block for branches, it should
+ // appear as "new water" for the next iteration of constant pool placement.
+ NewWaterList.clear();
+
+ LLVM_DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+ bool BRChange = false;
+ for (unsigned I = 0, E = ImmBranches.size(); I != E; ++I)
+ BRChange |= fixupImmediateBr(ImmBranches[I]);
+ if (BRChange && ++NoBRIters > 30)
+ report_fatal_error("Branch Fix Up pass failed to converge!");
+ LLVM_DEBUG(dumpBBs());
+ if (!CPChange && !BRChange)
+ break;
+ MadeChange = true;
+ }
+
+ LLVM_DEBUG(dbgs() << '\n'; dumpBBs());
+
+ BBInfo.clear();
+ WaterList.clear();
+ CPUsers.clear();
+ CPEntries.clear();
+ ImmBranches.clear();
+ return MadeChange;
+}
+
+/// doInitialPlacement - Perform the initial placement of the constant pool
+/// entries. To start with, we put them all at the end of the function.
+void CSKYConstantIslands::doInitialPlacement(
+ std::vector<MachineInstr *> &CPEMIs) {
+ // Create the basic block to hold the CPE's.
+ MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
+ MF->push_back(BB);
+
+  // MachineConstantPool measures alignment in bytes; the insertion points
+  // below are bucketed by log2(bytes).
+ const Align MaxAlign = MCP->getConstantPoolAlign();
+
+ // Mark the basic block as required by the const-pool.
+ BB->setAlignment(Align(2));
+
+ // The function needs to be as aligned as the basic blocks. The linker may
+ // move functions around based on their alignment.
+ MF->ensureAlignment(BB->getAlignment());
+
+ // Order the entries in BB by descending alignment. That ensures correct
+ // alignment of all entries as long as BB is sufficiently aligned. Keep
+ // track of the insertion point for each alignment. We are going to bucket
+ // sort the entries as they are created.
+ SmallVector<MachineBasicBlock::iterator, 8> InsPoint(Log2(MaxAlign) + 1,
+ BB->end());
+
+ // Add all of the constants from the constant pool to the end block, use an
+ // identity mapping of CPI's to CPE's.
+ const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
+
+ const DataLayout &TD = MF->getDataLayout();
+ for (unsigned I = 0, E = CPs.size(); I != E; ++I) {
+ unsigned Size = CPs[I].getSizeInBytes(TD);
+ assert(Size >= 4 && "Too small constant pool entry");
+ Align Alignment = CPs[I].getAlign();
+ // Verify that all constant pool entries are a multiple of their alignment.
+ // If not, we would have to pad them out so that instructions stay aligned.
+    assert(isAligned(Alignment, Size) &&
+           "CP Entry size not a multiple of its alignment!");
+
+ // Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
+ unsigned LogAlign = Log2(Alignment);
+ MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
+
+ MachineInstr *CPEMI =
+ BuildMI(*BB, InsAt, DebugLoc(), TII->get(CSKY::CONSTPOOL_ENTRY))
+ .addImm(I)
+ .addConstantPoolIndex(I)
+ .addImm(Size);
+
+ CPEMIs.push_back(CPEMI);
+
+ // Ensure that future entries with higher alignment get inserted before
+ // CPEMI. This is bucket sort with iterators.
+ for (unsigned A = LogAlign + 1; A <= Log2(MaxAlign); ++A)
+ if (InsPoint[A] == InsAt)
+ InsPoint[A] = CPEMI;
+ // Add a new CPEntry, but no corresponding CPUser yet.
+ CPEntries.emplace_back(1, CPEntry(CPEMI, I));
+ ++NumCPEs;
+ LLVM_DEBUG(dbgs() << "Moved CPI#" << I << " to end of function, size = "
+ << Size << ", align = " << Alignment.value() << '\n');
+ }
+ LLVM_DEBUG(BB->dump());
+}
+
+/// bbHasFallthrough - Return true if the specified basic block can fall through
+/// into the block immediately after it.
+static bool bbHasFallthrough(MachineBasicBlock *MBB) {
+ // Get the next machine basic block in the function.
+ MachineFunction::iterator MBBI = MBB->getIterator();
+ // Can't fall off end of function.
+ if (std::next(MBBI) == MBB->getParent()->end())
+ return false;
+
+ MachineBasicBlock *NextBB = &*std::next(MBBI);
+ for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+ E = MBB->succ_end();
+ I != E; ++I)
+ if (*I == NextBB)
+ return true;
+
+ return false;
+}
+
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+CSKYConstantIslands::CPEntry *
+CSKYConstantIslands::findConstPoolEntry(unsigned CPI,
+ const MachineInstr *CPEMI) {
+ std::vector<CPEntry> &CPEs = CPEntries[CPI];
+ // Number of entries per constpool index should be small, just do a
+ // linear search.
+ for (unsigned I = 0, E = CPEs.size(); I != E; ++I) {
+ if (CPEs[I].CPEMI == CPEMI)
+ return &CPEs[I];
+ }
+ return nullptr;
+}
+
+/// getCPEAlign - Returns the required alignment of the constant pool entry
+/// represented by CPEMI.
+Align CSKYConstantIslands::getCPEAlign(const MachineInstr &CPEMI) {
+ assert(CPEMI.getOpcode() == CSKY::CONSTPOOL_ENTRY);
+
+ unsigned CPI = CPEMI.getOperand(1).getIndex();
+ assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
+ return MCP->getConstants()[CPI].getAlign();
+}
+
+/// initializeFunctionInfo - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+void CSKYConstantIslands::initializeFunctionInfo(
+ const std::vector<MachineInstr *> &CPEMIs) {
+ BBInfo.clear();
+ BBInfo.resize(MF->getNumBlockIDs());
+
+ // First thing, compute the size of all basic blocks, and see if the function
+ // has any inline assembly in it. If so, we have to be conservative about
+ // alignment assumptions, as we don't know for sure the size of any
+ // instructions in the inline assembly.
+ for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
+ computeBlockSize(&*I);
+
+ // Compute block offsets.
+ adjustBBOffsetsAfter(&MF->front());
+
+ // Now go back through the instructions and build up our data structures.
+ for (MachineBasicBlock &MBB : *MF) {
+    // If this block doesn't fall through into the next MBB, then this is
+    // 'water' where a constant pool island could be placed.
+ if (!bbHasFallthrough(&MBB))
+ WaterList.push_back(&MBB);
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
+
+ int Opc = MI.getOpcode();
+ if (MI.isBranch() && !MI.isIndirectBranch()) {
+ bool IsCond = MI.isConditionalBranch();
+ unsigned Bits = 0;
+ unsigned Scale = 1;
+ int UOpc = CSKY::BR32;
+
+ switch (MI.getOpcode()) {
+ case CSKY::BR16:
+ case CSKY::BF16:
+ case CSKY::BT16:
+ Bits = 10;
+ Scale = 2;
+ break;
+ default:
+ Bits = 16;
+ Scale = 2;
+ break;
+ }
+
+ // Record this immediate branch.
+ unsigned MaxOffs = ((1 << (Bits - 1)) - 1) * Scale;
+ ImmBranches.push_back(ImmBranch(&MI, MaxOffs, IsCond, UOpc));
+ }
+
+ if (Opc == CSKY::CONSTPOOL_ENTRY)
+ continue;
+
+ // Scan the instructions for constant pool operands.
+ for (unsigned Op = 0, E = MI.getNumOperands(); Op != E; ++Op)
+ if (MI.getOperand(Op).isCPI()) {
+ // We found one. The addressing mode tells us the max displacement
+ // from the PC that this instruction permits.
+
+          // The number of offset bits and the scale depend on the opcode.
+ unsigned Bits = 0;
+ unsigned Scale = 1;
+ bool NegOk = false;
+
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unknown addressing mode for CP reference!");
+ case CSKY::MOVIH32:
+ case CSKY::ORI32:
+ continue;
+ case CSKY::PseudoTLSLA32:
+ case CSKY::JSRI32:
+ case CSKY::JMPI32:
+ case CSKY::LRW32:
+ case CSKY::LRW32_Gen:
+ Bits = 16;
+ Scale = 4;
+ break;
+ case CSKY::f2FLRW_S:
+ case CSKY::f2FLRW_D:
+ Bits = 8;
+ Scale = 4;
+ break;
+ case CSKY::GRS32:
+ Bits = 17;
+ Scale = 2;
+ NegOk = true;
+ break;
+ }
+ // Remember that this is a user of a CP entry.
+ unsigned CPI = MI.getOperand(Op).getIndex();
+ MachineInstr *CPEMI = CPEMIs[CPI];
+ unsigned MaxOffs = ((1 << Bits) - 1) * Scale;
+ CPUsers.push_back(CPUser(&MI, CPEMI, MaxOffs, NegOk));
+
+ // Increment corresponding CPEntry reference count.
+ CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+ assert(CPE && "Cannot find a corresponding CPEntry!");
+ CPE->RefCount++;
+
+ // Instructions can only use one CP entry, don't bother scanning the
+ // rest of the operands.
+ break;
+ }
+ }
+ }
+}
+
+/// computeBlockSize - Compute the size and some alignment information for MBB.
+/// This function updates BBInfo directly.
+void CSKYConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
+ BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
+ BBI.Size = 0;
+
+ for (const MachineInstr &MI : *MBB)
+ BBI.Size += TII->getInstSizeInBytes(MI);
+}
+
+/// getOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function. This offset changes as stuff is moved
+/// around inside the function.
+unsigned CSKYConstantIslands::getOffsetOf(MachineInstr *MI) const {
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // The offset is composed of two things: the sum of the sizes of all MBB's
+ // before this instruction's block, and the offset from the start of the block
+ // it is in.
+ unsigned Offset = BBInfo[MBB->getNumber()].Offset;
+
+ // Sum instructions before MI in MBB.
+ for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+ assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+ Offset += TII->getInstSizeInBytes(*I);
+ }
+ return Offset;
+}
+
+/// compareMbbNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool compareMbbNumbers(const MachineBasicBlock *LHS,
+ const MachineBasicBlock *RHS) {
+ return LHS->getNumber() < RHS->getNumber();
+}
+
+/// updateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers. Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void CSKYConstantIslands::updateForInsertedWaterBlock(
+ MachineBasicBlock *NewBB) {
+ // Renumber the MBB's to keep them consecutive.
+ NewBB->getParent()->RenumberBlocks(NewBB);
+
+ // Insert an entry into BBInfo to align it properly with the (newly
+ // renumbered) block numbers.
+ BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList. Specifically, we need to add NewBB as having
+  // available water after it.
+ water_iterator IP = llvm::lower_bound(WaterList, NewBB, compareMbbNumbers);
+ WaterList.insert(IP, NewBB);
+}
+
+unsigned CSKYConstantIslands::getUserOffset(CPUser &U) const {
+ unsigned UserOffset = getOffsetOf(U.MI);
+
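+  // Round the offset down to a 4-byte boundary; PC-relative loads compute
+  // their address from a word-aligned base.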
+ UserOffset &= ~3u;
+
+ return UserOffset;
+}
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch. Update data structures and renumber blocks to
+/// account for this change and return the newly created block.
+MachineBasicBlock *
+CSKYConstantIslands::splitBlockBeforeInstr(MachineInstr &MI) {
+ MachineBasicBlock *OrigBB = MI.getParent();
+
+ // Create a new MBB for the code after the OrigBB.
+ MachineBasicBlock *NewBB =
+ MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+ MachineFunction::iterator MBBI = ++OrigBB->getIterator();
+ MF->insert(MBBI, NewBB);
+
+ // Splice the instructions starting with MI over to NewBB.
+ NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+ // Add an unconditional branch from OrigBB to NewBB.
+ // Note the new unconditional branch is not being recorded.
+ // There doesn't seem to be meaningful DebugInfo available; this doesn't
+ // correspond to anything in the source.
+
+ // TODO: Add support for 16bit instr.
+ BuildMI(OrigBB, DebugLoc(), TII->get(CSKY::BR32)).addMBB(NewBB);
+ ++NumSplit;
+
+ // Update the CFG. All succs of OrigBB are now succs of NewBB.
+ NewBB->transferSuccessors(OrigBB);
+
+ // OrigBB branches to NewBB.
+ OrigBB->addSuccessor(NewBB);
+
+ // Update internal data structures to account for the newly inserted MBB.
+ // This is almost the same as updateForInsertedWaterBlock, except that
+ // the Water goes after OrigBB, not NewBB.
+ MF->RenumberBlocks(NewBB);
+
+ // Insert an entry into BBInfo to align it properly with the (newly
+ // renumbered) block numbers.
+ BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList. Specifically, we need to add OrigBB as having
+ // available water after it (but not if it's already there, which happens
+ // when splitting before a conditional branch that is followed by an
+ // unconditional branch - in that case we want to insert NewBB).
+ water_iterator IP = llvm::lower_bound(WaterList, OrigBB, compareMbbNumbers);
+ MachineBasicBlock *WaterBB = *IP;
+ if (WaterBB == OrigBB)
+ WaterList.insert(std::next(IP), NewBB);
+ else
+ WaterList.insert(IP, OrigBB);
+ NewWaterList.insert(OrigBB);
+
+ // Figure out how large the OrigBB is. As the first half of the original
+ // block, it cannot contain a tablejump. The size includes
+ // the new jump we added. (It should be possible to do this without
+ // recounting everything, but it's very confusing, and this is rarely
+ // executed.)
+ computeBlockSize(OrigBB);
+
+  // Figure out how large the NewBB is. As the second half of the original
+ // block, it may contain a tablejump.
+ computeBlockSize(NewBB);
+
+ // All BBOffsets following these blocks must be modified.
+ adjustBBOffsetsAfter(OrigBB);
+
+ return NewBB;
+}
+
+/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool
+/// reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry).
+bool CSKYConstantIslands::isOffsetInRange(unsigned UserOffset,
+ unsigned TrialOffset,
+ unsigned MaxDisp, bool NegativeOK) {
+ if (UserOffset <= TrialOffset) {
+ // User before the Trial.
+ if (TrialOffset - UserOffset <= MaxDisp)
+ return true;
+ } else if (NegativeOK) {
+ if (UserOffset - TrialOffset <= MaxDisp)
+ return true;
+ }
+ return false;
+}
+
+/// isWaterInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+///
+/// Compute how much the function will grow by inserting a CPE after Water.
+bool CSKYConstantIslands::isWaterInRange(unsigned UserOffset,
+ MachineBasicBlock *Water, CPUser &U,
+ unsigned &Growth) {
+ unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset();
+ unsigned NextBlockOffset;
+ Align NextBlockAlignment;
+ MachineFunction::const_iterator NextBlock = ++Water->getIterator();
+ if (NextBlock == MF->end()) {
+ NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
+ NextBlockAlignment = Align(4);
+ } else {
+ NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+ NextBlockAlignment = NextBlock->getAlignment();
+ }
+ unsigned Size = U.CPEMI->getOperand(2).getImm();
+ unsigned CPEEnd = CPEOffset + Size;
+
+ // The CPE may be able to hide in the alignment padding before the next
+ // block. It may also cause more padding to be required if it is more aligned
+  // than the next block.
+ if (CPEEnd > NextBlockOffset) {
+ Growth = CPEEnd - NextBlockOffset;
+ // Compute the padding that would go at the end of the CPE to align the next
+ // block.
+ Growth += offsetToAlignment(CPEEnd, NextBlockAlignment);
+
+ // If the CPE is to be inserted before the instruction, that will raise
+ // the offset of the instruction. Also account for unknown alignment padding
+ // in blocks between CPE and the user.
+ if (CPEOffset < UserOffset)
+ UserOffset += Growth;
+ } else
+ // CPE fits in existing padding.
+ Growth = 0;
+
+ return isOffsetInRange(UserOffset, CPEOffset, U);
+}
+
+/// isCPEntryInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool CSKYConstantIslands::isCPEntryInRange(MachineInstr *MI,
+ unsigned UserOffset,
+ MachineInstr *CPEMI,
+ unsigned MaxDisp, bool NegOk,
+ bool DoDump) {
+ unsigned CPEOffset = getOffsetOf(CPEMI);
+
+ if (DoDump) {
+ LLVM_DEBUG({
+ unsigned Block = MI->getParent()->getNumber();
+ const BasicBlockInfo &BBI = BBInfo[Block];
+ dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+ << " max delta=" << MaxDisp
+ << format(" insn address=%#x", UserOffset) << " in "
+ << printMBBReference(*MI->getParent()) << ": "
+ << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI
+ << format("CPE address=%#x offset=%+d: ", CPEOffset,
+ int(CPEOffset - UserOffset));
+ });
+ }
+
+ return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// bbIsJumpedOver - Return true if the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool bbIsJumpedOver(MachineBasicBlock *MBB) {
+ if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+ return false;
+ MachineBasicBlock *Succ = *MBB->succ_begin();
+ MachineBasicBlock *Pred = *MBB->pred_begin();
+ MachineInstr *PredMI = &Pred->back();
+ if (PredMI->getOpcode() == CSKY::BR32 /*TODO: change to 16bit instr. */)
+ return PredMI->getOperand(0).getMBB() == Succ;
+ return false;
+}
+#endif
+
+void CSKYConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+ unsigned BBNum = BB->getNumber();
+ for (unsigned I = BBNum + 1, E = MF->getNumBlockIDs(); I < E; ++I) {
+    // Each block starts at the post-offset (Offset + Size) of its layout
+    // predecessor.
+ unsigned Offset = BBInfo[I - 1].Offset + BBInfo[I - 1].Size;
+ BBInfo[I].Offset = Offset;
+ }
+}
+
+/// decrementCPEReferenceCount - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount. If the refcount
+/// becomes 0 remove the entry and instruction. Returns true if we removed
+/// the entry, false if we didn't.
+bool CSKYConstantIslands::decrementCPEReferenceCount(unsigned CPI,
+ MachineInstr *CPEMI) {
+ // Find the old entry. Eliminate it if it is no longer used.
+ CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+ assert(CPE && "Unexpected!");
+ if (--CPE->RefCount == 0) {
+ removeDeadCPEMI(CPEMI);
+ CPE->CPEMI = nullptr;
+ --NumCPEs;
+ return true;
+ }
+ return false;
+}
+
+/// findInRangeCPEntry - See if the currently referenced CPE is in range;
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone. Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int CSKYConstantIslands::findInRangeCPEntry(CPUser &U, unsigned UserOffset) {
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+
+ // Check to see if the CPE is already in-range.
+ if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
+ true)) {
+ LLVM_DEBUG(dbgs() << "In range\n");
+ return 1;
+ }
+
+ // No. Look for previously created clones of the CPE that are in range.
+ unsigned CPI = CPEMI->getOperand(1).getIndex();
+ std::vector<CPEntry> &CPEs = CPEntries[CPI];
+ for (unsigned I = 0, E = CPEs.size(); I != E; ++I) {
+ // We already tried this one
+ if (CPEs[I].CPEMI == CPEMI)
+ continue;
+ // Removing CPEs can leave empty entries, skip
+ if (CPEs[I].CPEMI == nullptr)
+ continue;
+ if (isCPEntryInRange(UserMI, UserOffset, CPEs[I].CPEMI, U.getMaxDisp(),
+ U.NegOk)) {
+ LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+ << CPEs[I].CPI << "\n");
+ // Point the CPUser node to the replacement
+ U.CPEMI = CPEs[I].CPEMI;
+ // Change the CPI in the instruction operand to refer to the clone.
+ for (unsigned J = 0, E = UserMI->getNumOperands(); J != E; ++J)
+ if (UserMI->getOperand(J).isCPI()) {
+ UserMI->getOperand(J).setIndex(CPEs[I].CPI);
+ break;
+ }
+ // Adjust the refcount of the clone...
+ CPEs[I].RefCount++;
+ // ...and the original. If we didn't remove the old entry, none of the
+ // addresses changed, so we don't need another pass.
+ return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+ }
+ }
+ return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specified unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+ unsigned Bits, Scale;
+
+ switch (Opc) {
+ case CSKY::BR16:
+ Bits = 10;
+ Scale = 2;
+ break;
+ case CSKY::BR32:
+ Bits = 16;
+ Scale = 2;
+ break;
+ default:
+    llvm_unreachable("Unknown unconditional branch opcode!");
+ }
+
+ unsigned MaxOffs = ((1 << (Bits - 1)) - 1) * Scale;
+ return MaxOffs;
+}
+
+/// findAvailableWater - Look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not. If it returns true, WaterIter
+/// is set to the WaterList entry.
+/// To ensure that this pass terminates, the CPE location for a particular
+/// CPUser is only allowed to move to a lower address, so search backward from
+/// the end of the list and prefer the first water that is in range.
+bool CSKYConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
+ water_iterator &WaterIter) {
+ if (WaterList.empty())
+ return false;
+
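+  // Track the smallest growth seen so far; ~0u means no candidate yet.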
+ unsigned BestGrowth = ~0u;
+ for (water_iterator IP = std::prev(WaterList.end()), B = WaterList.begin();;
+ --IP) {
+ MachineBasicBlock *WaterBB = *IP;
+ // Check if water is in range and is either at a lower address than the
+ // current "high water mark" or a new water block that was created since
+ // the previous iteration by inserting an unconditional branch. In the
+ // latter case, we want to allow resetting the high water mark back to
+ // this new water since we haven't seen it before. Inserting branches
+ // should be relatively uncommon and when it does happen, we want to be
+ // sure to take advantage of it for all the CPEs near that block, so that
+ // we don't insert more branches than necessary.
+ unsigned Growth;
+ if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
+ (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+ NewWaterList.count(WaterBB)) &&
+ Growth < BestGrowth) {
+ // This is the least amount of required padding seen so far.
+ BestGrowth = Growth;
+ WaterIter = IP;
+ LLVM_DEBUG(dbgs() << "Found water after " << printMBBReference(*WaterBB)
+ << " Growth=" << Growth << '\n');
+
+ // Keep looking unless it is perfect.
+ if (BestGrowth == 0)
+ return true;
+ }
+ if (IP == B)
+ break;
+ }
+ return BestGrowth != ~0u;
+}
+
+/// createNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE. The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct. Otherwise the block is split to create a hole with an
+/// unconditional branch around it. In either case NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+void CSKYConstantIslands::createNewWater(unsigned CPUserIndex,
+ unsigned UserOffset,
+ MachineBasicBlock *&NewMBB) {
+ CPUser &U = CPUsers[CPUserIndex];
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+ MachineBasicBlock *UserMBB = UserMI->getParent();
+ const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
+
+ // If the block does not end in an unconditional branch already, and if the
+ // end of the block is within range, make new water there.
+ if (bbHasFallthrough(UserMBB)) {
+ // Size of branch to insert.
+ unsigned Delta = 4;
+ // Compute the offset where the CPE will begin.
+ unsigned CPEOffset = UserBBI.postOffset() + Delta;
+
+ if (isOffsetInRange(UserOffset, CPEOffset, U)) {
+ LLVM_DEBUG(dbgs() << "Split at end of " << printMBBReference(*UserMBB)
+ << format(", expected CPE offset %#x\n", CPEOffset));
+ NewMBB = &*++UserMBB->getIterator();
+ // Add an unconditional branch from UserMBB to fallthrough block. Record
+ // it for branch lengthening; this new branch will not get out of range,
+ // but if the preceding conditional branch is out of range, the targets
+ // will be exchanged, and the altered branch may be out of range, so the
+ // machinery has to know about it.
+
+ // TODO: Add support for 16bit instr.
+ int UncondBr = CSKY::BR32;
+ auto *NewMI = BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr))
+ .addMBB(NewMBB)
+ .getInstr();
+ unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+ ImmBranches.push_back(
+ ImmBranch(&UserMBB->back(), MaxDisp, false, UncondBr));
+ BBInfo[UserMBB->getNumber()].Size += TII->getInstSizeInBytes(*NewMI);
+ adjustBBOffsetsAfter(UserMBB);
+ return;
+ }
+ }
+
+ // What a big block. Find a place within the block to split it.
+
+ // Try to split the block so it's fully aligned. Compute the latest split
+ // point where we can add a 4-byte branch instruction, and then align to
+ // Align which is the largest possible alignment in the function.
+ const Align Align = MF->getAlignment();
+ unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
+ LLVM_DEBUG(dbgs() << format("Split in middle of big block before %#x",
+ BaseInsertOffset));
+
+  // The 4 in the following is for the unconditional branch we'll be inserting.
+  // Alignment of the island is handled inside isOffsetInRange.
+ BaseInsertOffset -= 4;
+
+ LLVM_DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+ << " la=" << Log2(Align) << '\n');
+
+ // This could point off the end of the block if we've already got constant
+ // pool entries following this block; only the last one is in the water list.
+ // Back past any possible branches (allow for a conditional and a maximally
+ // long unconditional).
+ if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+ BaseInsertOffset = UserBBI.postOffset() - 8;
+ LLVM_DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+ }
+ unsigned EndInsertOffset =
+ BaseInsertOffset + 4 + CPEMI->getOperand(2).getImm();
+ MachineBasicBlock::iterator MI = UserMI;
+ ++MI;
+ unsigned CPUIndex = CPUserIndex + 1;
+ unsigned NumCPUsers = CPUsers.size();
+ for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI);
+ Offset < BaseInsertOffset;
+ Offset += TII->getInstSizeInBytes(*MI), MI = std::next(MI)) {
+ assert(MI != UserMBB->end() && "Fell off end of block");
+ if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+ CPUser &U = CPUsers[CPUIndex];
+ if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+        // Shift insertion point by one unit of alignment so it is within reach.
+ BaseInsertOffset -= Align.value();
+ EndInsertOffset -= Align.value();
+ }
+ // This is overly conservative, as we don't account for CPEMIs being
+ // reused within the block, but it doesn't matter much. Also assume CPEs
+ // are added in order with alignment padding. We may eventually be able
+ // to pack the aligned CPEs better.
+ EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+ CPUIndex++;
+ }
+ }
+
+ NewMBB = splitBlockBeforeInstr(*--MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range. If so, pick up the constant pool value and move it some
+/// place in-range. Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool CSKYConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+ CPUser &U = CPUsers[CPUserIndex];
+ MachineInstr *UserMI = U.MI;
+ MachineInstr *CPEMI = U.CPEMI;
+ unsigned CPI = CPEMI->getOperand(1).getIndex();
+ unsigned Size = CPEMI->getOperand(2).getImm();
+ // Compute this only once, it's expensive.
+ unsigned UserOffset = getUserOffset(U);
+
+ // See if the current entry is within range, or there is a clone of it
+ // in range.
+  int Result = findInRangeCPEntry(U, UserOffset);
+  if (Result == 1)
+    return false;
+  if (Result == 2)
+    return true;
+
+ // Look for water where we can place this CPE.
+ MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
+ MachineBasicBlock *NewMBB;
+ water_iterator IP;
+ if (findAvailableWater(U, UserOffset, IP)) {
+ LLVM_DEBUG(dbgs() << "Found water in range\n");
+ MachineBasicBlock *WaterBB = *IP;
+
+ // If the original WaterList entry was "new water" on this iteration,
+ // propagate that to the new island. This is just keeping NewWaterList
+ // updated to match the WaterList, which will be updated below.
+ if (NewWaterList.erase(WaterBB))
+ NewWaterList.insert(NewIsland);
+
+ // The new CPE goes before the following block (NewMBB).
+ NewMBB = &*++WaterBB->getIterator();
+ } else {
+ LLVM_DEBUG(dbgs() << "No water found\n");
+ createNewWater(CPUserIndex, UserOffset, NewMBB);
+
+ // splitBlockBeforeInstr adds to WaterList, which is important when it is
+ // called while handling branches so that the water will be seen on the
+ // next iteration for constant pools, but in this context, we don't want
+ // it. Check for this so it will be removed from the WaterList.
+ // Also remove any entry from NewWaterList.
+ MachineBasicBlock *WaterBB = &*--NewMBB->getIterator();
+ IP = llvm::find(WaterList, WaterBB);
+ if (IP != WaterList.end())
+ NewWaterList.erase(WaterBB);
+
+ // We are adding new water. Update NewWaterList.
+ NewWaterList.insert(NewIsland);
+ }
+
+ // Remove the original WaterList entry; we want subsequent insertions in
+ // this vicinity to go after the one we're about to insert. This
+ // considerably reduces the number of times we have to move the same CPE
+ // more than once and is also important to ensure the algorithm terminates.
+ if (IP != WaterList.end())
+ WaterList.erase(IP);
+
+ // Okay, we know we can put an island before NewMBB now, do it!
+ MF->insert(NewMBB->getIterator(), NewIsland);
+
+ // Update internal data structures to account for the newly inserted MBB.
+ updateForInsertedWaterBlock(NewIsland);
+
+ // Decrement the old entry, and remove it if refcount becomes 0.
+ decrementCPEReferenceCount(CPI, CPEMI);
+
+ // No existing clone of this CPE is within range.
+ // We will be generating a new clone. Get a UID for it.
+ unsigned ID = createPICLabelUId();
+
+ // Now that we have an island to add the CPE to, clone the original CPE and
+ // add it to the island.
+ U.HighWaterMark = NewIsland;
+ U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(CSKY::CONSTPOOL_ENTRY))
+ .addImm(ID)
+ .addConstantPoolIndex(CPI)
+ .addImm(Size);
+ CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+ ++NumCPEs;
+
+ // Mark the basic block as aligned as required by the const-pool entry.
+ NewIsland->setAlignment(getCPEAlign(*U.CPEMI));
+
+ // Increase the size of the island block to account for the new entry.
+ BBInfo[NewIsland->getNumber()].Size += Size;
+ adjustBBOffsetsAfter(&*--NewIsland->getIterator());
+
+ // Finally, change the CPI in the instruction operand to be ID.
+ for (unsigned I = 0, E = UserMI->getNumOperands(); I != E; ++I)
+ if (UserMI->getOperand(I).isCPI()) {
+ UserMI->getOperand(I).setIndex(ID);
+ break;
+ }
+
+ LLVM_DEBUG(
+ dbgs() << " Moved CPE to #" << ID << " CPI=" << CPI
+ << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+
+ return true;
+}
+
+/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void CSKYConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
+ MachineBasicBlock *CPEBB = CPEMI->getParent();
+ unsigned Size = CPEMI->getOperand(2).getImm();
+ CPEMI->eraseFromParent();
+ BBInfo[CPEBB->getNumber()].Size -= Size;
+ // All succeeding offsets have the current size value added in, fix this.
+ if (CPEBB->empty()) {
+ BBInfo[CPEBB->getNumber()].Size = 0;
+
+ // This block no longer needs to be aligned.
+ CPEBB->setAlignment(Align(4));
+ } else {
+ // Entries are sorted by descending alignment, so realign from the front.
+ CPEBB->setAlignment(getCPEAlign(*CPEBB->begin()));
+ }
+
+ adjustBBOffsetsAfter(CPEBB);
+ // An island has only one predecessor BB and one successor BB. Check if
+ // this BB's predecessor jumps directly to this BB's successor. This
+ // shouldn't happen currently.
+ assert(!bbIsJumpedOver(CPEBB) && "How did this happen?");
+ // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// removeUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero.
+bool CSKYConstantIslands::removeUnusedCPEntries() {
+  bool MadeChange = false;
+ for (unsigned I = 0, E = CPEntries.size(); I != E; ++I) {
+ std::vector<CPEntry> &CPEs = CPEntries[I];
+ for (unsigned J = 0, Ee = CPEs.size(); J != Ee; ++J) {
+ if (CPEs[J].RefCount == 0 && CPEs[J].CPEMI) {
+ removeDeadCPEMI(CPEs[J].CPEMI);
+ CPEs[J].CPEMI = nullptr;
+ MadeChange = true;
+ }
+ }
+ }
+ return MadeChange;
+}
+
+/// isBBInRange - Returns true if the distance between the specified MI and the
+/// specified BB can fit in MI's displacement field.
+bool CSKYConstantIslands::isBBInRange(MachineInstr *MI,
+ MachineBasicBlock *DestBB,
+ unsigned MaxDisp) {
+ unsigned BrOffset = getOffsetOf(MI);
+ unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+ LLVM_DEBUG(dbgs() << "Branch of destination " << printMBBReference(*DestBB)
+ << " from " << printMBBReference(*MI->getParent())
+ << " max delta=" << MaxDisp << " from " << getOffsetOf(MI)
+ << " to " << DestOffset << " offset "
+ << int(DestOffset - BrOffset) << "\t" << *MI);
+
+ if (BrOffset <= DestOffset) {
+ // Branch before the Dest.
+ if (DestOffset - BrOffset <= MaxDisp)
+ return true;
+ } else {
+ if (BrOffset - DestOffset <= MaxDisp)
+ return true;
+ }
+ return false;
+}
+
+/// fixupImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool CSKYConstantIslands::fixupImmediateBr(ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ MachineBasicBlock *DestBB = TII->getBranchDestBlock(*MI);
+
+ // Check to see if the DestBB is already in-range.
+ if (isBBInRange(MI, DestBB, Br.MaxDisp))
+ return false;
+
+ if (!Br.IsCond)
+ return fixupUnconditionalBr(Br);
+ return fixupConditionalBr(Br);
+}
+
+/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the prologue, we can use BSR to implement a far jump; otherwise
+/// we report a fatal error because the function size was underestimated.
+bool CSKYConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ MachineBasicBlock *MBB = MI->getParent();
+
+ if (!MFI->isLRSpilled())
+ report_fatal_error("underestimated function size");
+
+ // Use BSR to implement far jump.
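+  // ((1 << 25) - 1) * 2: a 26-bit signed offset in units of 2 bytes.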
+ Br.MaxDisp = ((1 << (26 - 1)) - 1) * 2;
+ MI->setDesc(TII->get(CSKY::BSR32_BR));
+ BBInfo[MBB->getNumber()].Size += 4;
+ adjustBBOffsetsAfter(MBB);
+ ++NumUBrFixed;
+
+ LLVM_DEBUG(dbgs() << " Changed B to long jump " << *MI);
+
+ return true;
+}
+
+/// fixupConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool CSKYConstantIslands::fixupConditionalBr(ImmBranch &Br) {
+ MachineInstr *MI = Br.MI;
+ MachineBasicBlock *DestBB = TII->getBranchDestBlock(*MI);
+
+ SmallVector<MachineOperand, 4> Cond;
+ Cond.push_back(MachineOperand::CreateImm(MI->getOpcode()));
+ Cond.push_back(MI->getOperand(0));
+ TII->reverseBranchCondition(Cond);
+
+ // Add an unconditional branch to the destination and invert the branch
+ // condition to jump over it:
+ // bteqz L1
+ // =>
+ // bnez L2
+ // b L1
+ // L2:
+
+ // If the branch is at the end of its MBB and that has a fall-through block,
+ // direct the updated conditional branch to the fall-through block. Otherwise,
+ // split the MBB before the next instruction.
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *BMI = &MBB->back();
+ bool NeedSplit = (BMI != MI) || !bbHasFallthrough(MBB);
+
+ ++NumCBrFixed;
+ if (BMI != MI) {
+ if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
+ BMI->isUnconditionalBranch()) {
+ // Last MI in the BB is an unconditional branch. Can we simply invert the
+ // condition and swap destinations:
+ // beqz L1
+ // b L2
+ // =>
+ // bnez L2
+ // b L1
+ MachineBasicBlock *NewDest = TII->getBranchDestBlock(*BMI);
+ if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+ LLVM_DEBUG(
+ dbgs() << " Invert Bcc condition and swap its destination with "
+ << *BMI);
+ BMI->getOperand(BMI->getNumExplicitOperands() - 1).setMBB(DestBB);
+ MI->getOperand(MI->getNumExplicitOperands() - 1).setMBB(NewDest);
+
+ MI->setDesc(TII->get(Cond[0].getImm()));
+ return true;
+ }
+ }
+ }
+
+ if (NeedSplit) {
+ splitBlockBeforeInstr(*MI);
+ // No need for the branch to the next block. We're adding an unconditional
+ // branch to the destination.
+ int Delta = TII->getInstSizeInBytes(MBB->back());
+ BBInfo[MBB->getNumber()].Size -= Delta;
+ MBB->back().eraseFromParent();
+ // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
+
+ // The conditional successor will be swapped between the BBs after this, so
+ // update CFG.
+ MBB->addSuccessor(DestBB);
+ std::next(MBB->getIterator())->removeSuccessor(DestBB);
+ }
+ MachineBasicBlock *NextBB = &*++MBB->getIterator();
+
+ LLVM_DEBUG(dbgs() << " Insert B to " << printMBBReference(*DestBB)
+ << " also invert condition and change dest. to "
+ << printMBBReference(*NextBB) << "\n");
+
+ // Insert a new conditional branch and a new unconditional branch.
+ // Also update the ImmBranch as well as adding a new entry for the new branch.
+
+ BuildMI(MBB, DebugLoc(), TII->get(Cond[0].getImm()))
+ .addReg(MI->getOperand(0).getReg())
+ .addMBB(NextBB);
+
+ Br.MI = &MBB->back();
+ BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+ BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+ BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
+ unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+ ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+ // Remove the old conditional branch. It may or may not still be in MBB.
+ BBInfo[MI->getParent()->getNumber()].Size -= TII->getInstSizeInBytes(*MI);
+ MI->eraseFromParent();
+ adjustBBOffsetsAfter(MBB);
+ return true;
+}
+
+/// Returns a pass that places constant pool islands and fixes up out-of-range
+/// branches.
+FunctionPass *llvm::createCSKYConstantIslandPass() {
+ return new CSKYConstantIslands();
+}
+
+INITIALIZE_PASS(CSKYConstantIslands, DEBUG_TYPE,
+ "CSKY constant island placement and branch shortening pass",
+ false, false)
diff --git a/llvm/lib/Target/CSKY/CSKYConstantPoolValue.cpp b/llvm/lib/Target/CSKY/CSKYConstantPoolValue.cpp
new file mode 100644
index 000000000000..d4c4bb847237
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYConstantPoolValue.cpp
@@ -0,0 +1,216 @@
+//===-- CSKYConstantPoolValue.cpp - CSKY constantpool value ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CSKY specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSKYConstantPoolValue.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// CSKYConstantPoolValue
+//===----------------------------------------------------------------------===//
+
+CSKYConstantPoolValue::CSKYConstantPoolValue(Type *Ty, CSKYCP::CSKYCPKind Kind,
+ unsigned PCAdjust,
+ CSKYCP::CSKYCPModifier Modifier,
+ bool AddCurrentAddress,
+ unsigned ID)
+ : MachineConstantPoolValue(Ty), Kind(Kind), PCAdjust(PCAdjust),
+ Modifier(Modifier), AddCurrentAddress(AddCurrentAddress), LabelId(ID) {}
+
+const char *CSKYConstantPoolValue::getModifierText() const {
+ switch (Modifier) {
+ case CSKYCP::ADDR:
+ return "ADDR";
+ case CSKYCP::GOT:
+ return "GOT";
+ case CSKYCP::GOTOFF:
+ return "GOTOFF";
+ case CSKYCP::PLT:
+ return "PLT";
+ case CSKYCP::TLSIE:
+ return "TLSIE";
+ case CSKYCP::TLSLE:
+ return "TLSLE";
+ case CSKYCP::TLSGD:
+ return "TLSGD";
+ case CSKYCP::NO_MOD:
+ return "";
+ }
+ llvm_unreachable("Unknown modifier!");
+}
+
+int CSKYConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
+ Align Alignment) {
+ llvm_unreachable("Shouldn't be calling this directly!");
+}
+
+void CSKYConstantPoolValue::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+ ID.AddInteger(LabelId);
+ ID.AddInteger(PCAdjust);
+ ID.AddInteger(Modifier);
+}
+
+void CSKYConstantPoolValue::print(raw_ostream &O) const {
+ if (Modifier)
+ O << "(" << getModifierText() << ")";
+ if (PCAdjust)
+ O << " + " << PCAdjust;
+}
+
+//===----------------------------------------------------------------------===//
+// CSKYConstantPoolConstant
+//===----------------------------------------------------------------------===//
+
+CSKYConstantPoolConstant::CSKYConstantPoolConstant(
+ const Constant *C, CSKYCP::CSKYCPKind Kind, unsigned PCAdjust,
+ CSKYCP::CSKYCPModifier Modifier, bool AddCurrentAddress, unsigned ID)
+ : CSKYConstantPoolValue(C->getType(), Kind, PCAdjust, Modifier,
+ AddCurrentAddress, ID),
+ CVal(C) {}
+
+CSKYConstantPoolConstant *CSKYConstantPoolConstant::Create(
+ const Constant *C, CSKYCP::CSKYCPKind Kind, unsigned PCAdjust,
+ CSKYCP::CSKYCPModifier Modifier, bool AddCurrentAddress, unsigned ID) {
+ return new CSKYConstantPoolConstant(C, Kind, PCAdjust, Modifier,
+ AddCurrentAddress, ID);
+}
+
+const GlobalValue *CSKYConstantPoolConstant::getGV() const {
+ assert(isa<GlobalValue>(CVal) && "CVal should be GlobalValue");
+ return cast<GlobalValue>(CVal);
+}
+
+const BlockAddress *CSKYConstantPoolConstant::getBlockAddress() const {
+ assert(isa<BlockAddress>(CVal) && "CVal should be BlockAddress");
+ return cast<BlockAddress>(CVal);
+}
+
+int CSKYConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP,
+ Align Alignment) {
+ return getExistingMachineCPValueImpl<CSKYConstantPoolConstant>(CP, Alignment);
+}
+
+void CSKYConstantPoolConstant::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+ ID.AddPointer(CVal);
+
+ CSKYConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void CSKYConstantPoolConstant::print(raw_ostream &O) const {
+ O << CVal->getName();
+ CSKYConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// CSKYConstantPoolSymbol
+//===----------------------------------------------------------------------===//
+
+CSKYConstantPoolSymbol::CSKYConstantPoolSymbol(Type *Ty, const char *S,
+ unsigned PCAdjust,
+ CSKYCP::CSKYCPModifier Modifier,
+ bool AddCurrentAddress)
+ : CSKYConstantPoolValue(Ty, CSKYCP::CPExtSymbol, PCAdjust, Modifier,
+ AddCurrentAddress),
+ S(S) {}
+
+CSKYConstantPoolSymbol *
+CSKYConstantPoolSymbol::Create(Type *Ty, const char *S, unsigned PCAdjust,
+ CSKYCP::CSKYCPModifier Modifier) {
+ return new CSKYConstantPoolSymbol(Ty, S, PCAdjust, Modifier, false);
+}
+
+int CSKYConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP,
+ Align Alignment) {
+
+ return getExistingMachineCPValueImpl<CSKYConstantPoolSymbol>(CP, Alignment);
+}
+
+void CSKYConstantPoolSymbol::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+ ID.AddString(S);
+ CSKYConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void CSKYConstantPoolSymbol::print(raw_ostream &O) const {
+ O << S;
+ CSKYConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// CSKYConstantPoolMBB
+//===----------------------------------------------------------------------===//
+
+CSKYConstantPoolMBB::CSKYConstantPoolMBB(Type *Ty, const MachineBasicBlock *Mbb,
+ unsigned PCAdjust,
+ CSKYCP::CSKYCPModifier Modifier,
+ bool AddCurrentAddress)
+ : CSKYConstantPoolValue(Ty, CSKYCP::CPMachineBasicBlock, PCAdjust, Modifier,
+ AddCurrentAddress),
+ MBB(Mbb) {}
+
+CSKYConstantPoolMBB *CSKYConstantPoolMBB::Create(Type *Ty,
+ const MachineBasicBlock *Mbb,
+ unsigned PCAdjust) {
+ return new CSKYConstantPoolMBB(Ty, Mbb, PCAdjust, CSKYCP::ADDR, false);
+}
+
+int CSKYConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP,
+ Align Alignment) {
+ return getExistingMachineCPValueImpl<CSKYConstantPoolMBB>(CP, Alignment);
+}
+
+void CSKYConstantPoolMBB::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+ ID.AddPointer(MBB);
+ CSKYConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void CSKYConstantPoolMBB::print(raw_ostream &O) const {
+ O << "BB#" << MBB->getNumber();
+ CSKYConstantPoolValue::print(O);
+}
+
+//===----------------------------------------------------------------------===//
+// CSKYConstantPoolJT
+//===----------------------------------------------------------------------===//
+
+CSKYConstantPoolJT::CSKYConstantPoolJT(Type *Ty, int JTIndex, unsigned PCAdj,
+ CSKYCP::CSKYCPModifier Modifier,
+ bool AddCurrentAddress)
+ : CSKYConstantPoolValue(Ty, CSKYCP::CPJT, PCAdj, Modifier,
+ AddCurrentAddress),
+ JTI(JTIndex) {}
+
+CSKYConstantPoolJT *
+CSKYConstantPoolJT::Create(Type *Ty, int JTI, unsigned PCAdj,
+ CSKYCP::CSKYCPModifier Modifier) {
+ return new CSKYConstantPoolJT(Ty, JTI, PCAdj, Modifier, false);
+}
+
+int CSKYConstantPoolJT::getExistingMachineCPValue(MachineConstantPool *CP,
+ Align Alignment) {
+ return getExistingMachineCPValueImpl<CSKYConstantPoolJT>(CP, Alignment);
+}
+
+void CSKYConstantPoolJT::addSelectionDAGCSEId(FoldingSetNodeID &ID) {
+ ID.AddInteger(JTI);
+ CSKYConstantPoolValue::addSelectionDAGCSEId(ID);
+}
+
+void CSKYConstantPoolJT::print(raw_ostream &O) const {
+ O << "JTI#" << JTI;
+ CSKYConstantPoolValue::print(O);
+}
diff --git a/llvm/lib/Target/CSKY/CSKYConstantPoolValue.h b/llvm/lib/Target/CSKY/CSKYConstantPoolValue.h
new file mode 100644
index 000000000000..2eff9404a34c
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYConstantPoolValue.h
@@ -0,0 +1,221 @@
+//===-- CSKYConstantPoolValue.h - CSKY constantpool value -----*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CSKY specific constantpool value classes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_CSKY_CSKYCONSTANTPOOLVALUE_H
+#define LLVM_LIB_TARGET_CSKY_CSKYCONSTANTPOOLVALUE_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstddef>
+
+namespace llvm {
+
+class BlockAddress;
+class Constant;
+class GlobalValue;
+class LLVMContext;
+class MachineBasicBlock;
+
+namespace CSKYCP {
+enum CSKYCPKind {
+ CPValue,
+ CPExtSymbol,
+ CPBlockAddress,
+ CPMachineBasicBlock,
+ CPJT
+};
+
+enum CSKYCPModifier { NO_MOD, ADDR, GOT, GOTOFF, PLT, TLSLE, TLSIE, TLSGD };
+} // namespace CSKYCP
+
+/// CSKYConstantPoolValue - CSKY specific constantpool value. This is used to
+/// represent PC-relative displacement between the address of the load
+/// instruction and the constant being loaded, i.e. (&GV-(LPIC+8)).
+class CSKYConstantPoolValue : public MachineConstantPoolValue {
+protected:
+ CSKYCP::CSKYCPKind Kind; // Kind of constant.
+ unsigned PCAdjust; // Extra adjustment if constantpool is pc-relative.
+ CSKYCP::CSKYCPModifier Modifier; // GV modifier
+ bool AddCurrentAddress;
+
+ unsigned LabelId = 0;
+
+ CSKYConstantPoolValue(Type *Ty, CSKYCP::CSKYCPKind Kind, unsigned PCAdjust,
+ CSKYCP::CSKYCPModifier Modifier, bool AddCurrentAddress,
+ unsigned ID = 0);
+
+public:
+ const char *getModifierText() const;
+ unsigned getPCAdjustment() const { return PCAdjust; }
+ bool mustAddCurrentAddress() const { return AddCurrentAddress; }
+ CSKYCP::CSKYCPModifier getModifier() const { return Modifier; }
+ unsigned getLabelID() const { return LabelId; }
+
+ bool isGlobalValue() const { return Kind == CSKYCP::CPValue; }
+ bool isExtSymbol() const { return Kind == CSKYCP::CPExtSymbol; }
+ bool isBlockAddress() const { return Kind == CSKYCP::CPBlockAddress; }
+ bool isMachineBasicBlock() const {
+ return Kind == CSKYCP::CPMachineBasicBlock;
+ }
+ bool isJT() const { return Kind == CSKYCP::CPJT; }
+
+ int getExistingMachineCPValue(MachineConstantPool *CP,
+ Align Alignment) override;
+
+ void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+
+ void print(raw_ostream &O) const override;
+
+ bool equals(const CSKYConstantPoolValue *A) const {
+ return this->LabelId == A->LabelId && this->PCAdjust == A->PCAdjust &&
+ this->Modifier == A->Modifier;
+ }
+
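+ // Shared helper for the subclasses' getExistingMachineCPValue implementations:
+ // scan the existing machine constant-pool entries for one of the same derived
+ // kind, with sufficient alignment, that compares equal, so identical CSKY
+ // constant-pool values are reused instead of duplicated.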
+ template <typename Derived>
+ int getExistingMachineCPValueImpl(MachineConstantPool *CP, Align Alignment) {
+ const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+ if (Constants[i].isMachineConstantPoolEntry() &&
+ Constants[i].getAlign() >= Alignment) {
+ auto *CPV =
+ static_cast<CSKYConstantPoolValue *>(Constants[i].Val.MachineCPVal);
+ if (Derived *APC = dyn_cast<Derived>(CPV))
+ if (cast<Derived>(this)->equals(APC))
+ return i;
+ }
+ }
+
+ return -1;
+ }
+};
+
+/// CSKY-specific constant pool values for Constants,
+/// Functions, and BlockAddresses.
+class CSKYConstantPoolConstant : public CSKYConstantPoolValue {
+ const Constant *CVal; // Constant being loaded.
+
+ CSKYConstantPoolConstant(const Constant *C, CSKYCP::CSKYCPKind Kind,
+ unsigned PCAdjust, CSKYCP::CSKYCPModifier Modifier,
+ bool AddCurrentAddress, unsigned ID);
+
+public:
+ static CSKYConstantPoolConstant *
+ Create(const Constant *C, CSKYCP::CSKYCPKind Kind, unsigned PCAdjust,
+ CSKYCP::CSKYCPModifier Modifier, bool AddCurrentAddress,
+ unsigned ID = 0);
+ const GlobalValue *getGV() const;
+ const BlockAddress *getBlockAddress() const;
+
+ int getExistingMachineCPValue(MachineConstantPool *CP,
+ Align Alignment) override;
+ void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+ void print(raw_ostream &O) const override;
+
+ bool equals(const CSKYConstantPoolConstant *A) const {
+ return CVal == A->CVal && CSKYConstantPoolValue::equals(A);
+ }
+
+ static bool classof(const CSKYConstantPoolValue *APV) {
+ return APV->isGlobalValue() || APV->isBlockAddress();
+ }
+};
+
+/// CSKYConstantPoolSymbol - CSKY-specific constantpool values for external
+/// symbols.
+class CSKYConstantPoolSymbol : public CSKYConstantPoolValue {
+ const std::string S; // ExtSymbol being loaded.
+
+ CSKYConstantPoolSymbol(Type *Ty, const char *S, unsigned PCAdjust,
+ CSKYCP::CSKYCPModifier Modifier,
+ bool AddCurrentAddress);
+
+public:
+ static CSKYConstantPoolSymbol *Create(Type *Ty, const char *S,
+ unsigned PCAdjust,
+ CSKYCP::CSKYCPModifier Modifier);
+
+ StringRef getSymbol() const { return S; }
+
+ int getExistingMachineCPValue(MachineConstantPool *CP,
+ Align Alignment) override;
+ void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+ void print(raw_ostream &O) const override;
+
+ bool equals(const CSKYConstantPoolSymbol *A) const {
+ return S == A->S && CSKYConstantPoolValue::equals(A);
+ }
+
+ static bool classof(const CSKYConstantPoolValue *ACPV) {
+ return ACPV->isExtSymbol();
+ }
+};
+
+/// CSKYConstantPoolMBB - CSKY-specific constantpool value of a machine basic
+/// block.
+class CSKYConstantPoolMBB : public CSKYConstantPoolValue {
+ const MachineBasicBlock *MBB; // Machine basic block.
+
+ CSKYConstantPoolMBB(Type *Ty, const MachineBasicBlock *Mbb, unsigned PCAdjust,
+ CSKYCP::CSKYCPModifier Modifier, bool AddCurrentAddress);
+
+public:
+ static CSKYConstantPoolMBB *Create(Type *Ty, const MachineBasicBlock *Mbb,
+ unsigned PCAdjust);
+
+ const MachineBasicBlock *getMBB() const { return MBB; }
+
+ int getExistingMachineCPValue(MachineConstantPool *CP,
+ Align Alignment) override;
+ void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+ void print(raw_ostream &O) const override;
+
+ bool equals(const CSKYConstantPoolMBB *A) const {
+ return MBB == A->MBB && CSKYConstantPoolValue::equals(A);
+ }
+
+ static bool classof(const CSKYConstantPoolValue *ACPV) {
+ return ACPV->isMachineBasicBlock();
+ }
+};
+
+/// CSKY-specific constantpool value of a jump table.
+class CSKYConstantPoolJT : public CSKYConstantPoolValue {
+ signed JTI; // Jump table index.
+
+ CSKYConstantPoolJT(Type *Ty, int JTIndex, unsigned PCAdj,
+ CSKYCP::CSKYCPModifier Modifier, bool AddCurrentAddress);
+
+public:
+ static CSKYConstantPoolJT *Create(Type *Ty, int JTI, unsigned PCAdj,
+ CSKYCP::CSKYCPModifier Modifier);
+
+ signed getJTI() { return JTI; }
+
+ int getExistingMachineCPValue(MachineConstantPool *CP,
+ Align Alignment) override;
+ void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
+ void print(raw_ostream &O) const override;
+
+ bool equals(const CSKYConstantPoolJT *A) const {
+ return JTI == A->JTI && CSKYConstantPoolValue::equals(A);
+ }
+
+ static bool classof(const CSKYConstantPoolValue *ACPV) {
+ return ACPV->isJT();
+ }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp
index 3a8ee5713584..3bf001c2cee7 100644
--- a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp
+++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "CSKYFrameLowering.h"
+#include "CSKYMachineFunctionInfo.h"
#include "CSKYSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -46,12 +47,555 @@ bool CSKYFrameLowering::hasBP(const MachineFunction &MF) const {
return MFI.hasVarSizedObjects();
}
+// Determines the size of the frame and maximum call frame size.
+void CSKYFrameLowering::determineFrameLayout(MachineFunction &MF) const {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const CSKYRegisterInfo *RI = STI.getRegisterInfo();
+
+ // Get the number of bytes to allocate from the FrameInfo.
+ uint64_t FrameSize = MFI.getStackSize();
+
+ // Get the alignment.
+ Align StackAlign = getStackAlign();
+ if (RI->hasStackRealignment(MF)) {
+ Align MaxStackAlign = std::max(StackAlign, MFI.getMaxAlign());
+ FrameSize += (MaxStackAlign.value() - StackAlign.value());
+ StackAlign = MaxStackAlign;
+ }
+
+ // Set Max Call Frame Size
+ uint64_t MaxCallSize = alignTo(MFI.getMaxCallFrameSize(), StackAlign);
+ MFI.setMaxCallFrameSize(MaxCallSize);
+
+ // Make sure the frame is aligned.
+ FrameSize = alignTo(FrameSize, StackAlign);
+
+ // Update frame info.
+ MFI.setStackSize(FrameSize);
+}
+
void CSKYFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- // FIXME: Implement this when we have function calls
+ CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const CSKYRegisterInfo *RI = STI.getRegisterInfo();
+ const CSKYInstrInfo *TII = STI.getInstrInfo();
+ MachineBasicBlock::iterator MBBI = MBB.begin();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ Register FPReg = getFPReg(STI);
+ Register SPReg = CSKY::R14;
+ Register BPReg = getBPReg(STI);
+
+ // Debug location must be unknown since the first debug location is used
+ // to determine the end of the prologue.
+ DebugLoc DL;
+
+ if (MF.getFunction().hasFnAttribute("interrupt"))
+ BuildMI(MBB, MBBI, DL, TII->get(CSKY::NIE));
+
+ // Determine the correct frame layout
+ determineFrameLayout(MF);
+
+ // FIXME (note copied from Lanai): This appears to be overallocating. Needs
+ // investigation.
+ // Get the number of bytes to allocate from the FrameInfo.
+ uint64_t StackSize = MFI.getStackSize();
+
+ // Early exit if there is no need to allocate on the stack
+ if (StackSize == 0 && !MFI.adjustsStack())
+ return;
+
+ const auto &CSI = MFI.getCalleeSavedInfo();
+
+ unsigned spillAreaSize = CFI->getCalleeSaveAreaSize();
+
+ uint64_t ActualSize = spillAreaSize + CFI->getVarArgsSaveSize();
+
+ // First part stack allocation.
+ adjustReg(MBB, MBBI, DL, SPReg, SPReg, -(static_cast<int64_t>(ActualSize)),
+ MachineInstr::NoFlags);
+
+ // Emit ".cfi_def_cfa_offset FirstSPAdjustAmount"
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, ActualSize));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // The frame pointer is callee-saved, and code has been generated for us to
+ // save it to the stack. We need to skip over the storing of callee-saved
+ // registers as the frame pointer must be modified after it has been saved
+ // to the stack, not before.
+ // FIXME: assumes exactly one instruction is used to save each callee-saved
+ // register.
+ std::advance(MBBI, CSI.size());
+
+ // Iterate over list of callee-saved registers and emit .cfi_offset
+ // directives.
+ for (const auto &Entry : CSI) {
+ int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx());
+ Register Reg = Entry.getReg();
+
+ unsigned Num = TRI->getRegSizeInBits(Reg, MRI) / 32;
+ for (unsigned i = 0; i < Num; i++) {
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
+ nullptr, RI->getDwarfRegNum(Reg, true) + i, Offset + i * 4));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
+ }
+
+ // Generate new FP.
+ if (hasFP(MF)) {
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), FPReg)
+ .addReg(SPReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // Emit ".cfi_def_cfa_register $fp"
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+ nullptr, RI->getDwarfRegNum(FPReg, true)));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+
+ // Second part stack allocation.
+ adjustReg(MBB, MBBI, DL, SPReg, SPReg,
+ -(static_cast<int64_t>(StackSize - ActualSize)),
+ MachineInstr::NoFlags);
+
+ // Realign Stack
+ const CSKYRegisterInfo *RI = STI.getRegisterInfo();
+ if (RI->hasStackRealignment(MF)) {
+ Align MaxAlignment = MFI.getMaxAlign();
+
+ const CSKYInstrInfo *TII = STI.getInstrInfo();
+ if (STI.hasE2() && isUInt<12>(~(-(int)MaxAlignment.value()))) {
+ BuildMI(MBB, MBBI, DL, TII->get(CSKY::ANDNI32), SPReg)
+ .addReg(SPReg)
+ .addImm(~(-(int)MaxAlignment.value()));
+ } else {
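+ // ANDNI32 is unavailable or the mask does not fit its immediate field, so
+ // clear the low bits of SP by shifting right and then left by log2(alignment).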
+ unsigned ShiftAmount = Log2(MaxAlignment);
+
+ if (STI.hasE2()) {
+ Register VR =
+ MF.getRegInfo().createVirtualRegister(&CSKY::GPRRegClass);
+ BuildMI(MBB, MBBI, DL, TII->get(CSKY::LSRI32), VR)
+ .addReg(SPReg)
+ .addImm(ShiftAmount);
+ BuildMI(MBB, MBBI, DL, TII->get(CSKY::LSLI32), SPReg)
+ .addReg(VR)
+ .addImm(ShiftAmount);
+ } else {
+ Register VR =
+ MF.getRegInfo().createVirtualRegister(&CSKY::mGPRRegClass);
+ BuildMI(MBB, MBBI, DL, TII->get(CSKY::MOV16), VR).addReg(SPReg);
+ BuildMI(MBB, MBBI, DL, TII->get(CSKY::LSRI16), VR)
+ .addReg(VR)
+ .addImm(ShiftAmount);
+ BuildMI(MBB, MBBI, DL, TII->get(CSKY::LSLI16), VR)
+ .addReg(VR)
+ .addImm(ShiftAmount);
+ BuildMI(MBB, MBBI, DL, TII->get(CSKY::MOV16), SPReg).addReg(VR);
+ }
+ }
+ }
+
+ // FP will be used to restore the frame in the epilogue, so we need
+ // another base register BP to record SP after re-alignment. SP will
+ // track the current stack after allocating variable sized objects.
+ if (hasBP(MF)) {
+ // move BP, SP
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BPReg).addReg(SPReg);
+ }
+
+ } else {
+ adjustReg(MBB, MBBI, DL, SPReg, SPReg,
+ -(static_cast<int64_t>(StackSize - ActualSize)),
+ MachineInstr::NoFlags);
+ // Emit ".cfi_def_cfa_offset StackSize"
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ }
}
void CSKYFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- // FIXME: Implement this when we have function calls
+ CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>();
+
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ Register FPReg = getFPReg(STI);
+ Register SPReg = CSKY::R14;
+
+ // Get the insert location for the epilogue. If there were no terminators in
+ // the block, get the last instruction.
+ MachineBasicBlock::iterator MBBI = MBB.end();
+ DebugLoc DL;
+ if (!MBB.empty()) {
+ MBBI = MBB.getFirstTerminator();
+ if (MBBI == MBB.end())
+ MBBI = MBB.getLastNonDebugInstr();
+ DL = MBBI->getDebugLoc();
+
+ // If this is not a terminator, the actual insert location should be after
+ // the last instruction.
+ if (!MBBI->isTerminator())
+ MBBI = std::next(MBBI);
+ }
+
+ const auto &CSI = MFI.getCalleeSavedInfo();
+ uint64_t StackSize = MFI.getStackSize();
+
+ uint64_t ActualSize =
+ CFI->getCalleeSaveAreaSize() + CFI->getVarArgsSaveSize();
+
+ // Skip to before the restores of callee-saved registers
+ // FIXME: assumes exactly one instruction is used to restore each
+ // callee-saved register.
+ auto LastFrameDestroy = MBBI;
+ if (!CSI.empty())
+ LastFrameDestroy = std::prev(MBBI, CSI.size());
+
+ if (hasFP(MF)) {
+ const CSKYInstrInfo *TII = STI.getInstrInfo();
+ BuildMI(MBB, LastFrameDestroy, DL, TII->get(TargetOpcode::COPY), SPReg)
+ .addReg(FPReg)
+ .setMIFlag(MachineInstr::NoFlags);
+ } else {
+ adjustReg(MBB, LastFrameDestroy, DL, SPReg, SPReg, (StackSize - ActualSize),
+ MachineInstr::FrameDestroy);
+ }
+
+ adjustReg(MBB, MBBI, DL, SPReg, SPReg, ActualSize,
+ MachineInstr::FrameDestroy);
+}
+
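+// Estimate the smallest frame-index offset range reachable by the
+// frame-index-using instructions in this function. determineCalleeSaves uses
+// the result to decide whether an emergency spill slot for the register
+// scavenger must be reserved.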
+static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
+ const CSKYSubtarget &STI) {
+ unsigned Limit = (1 << 12) - 1;
+
+ for (auto &MBB : MF) {
+ for (auto &MI : MBB) {
+ if (MI.isDebugInstr())
+ continue;
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (!MI.getOperand(i).isFI())
+ continue;
+
+ if (MI.getOpcode() == CSKY::SPILL_CARRY ||
+ MI.getOpcode() == CSKY::RESTORE_CARRY ||
+ MI.getOpcode() == CSKY::STORE_PAIR ||
+ MI.getOpcode() == CSKY::LOAD_PAIR) {
+ Limit = std::min(Limit, ((1U << 12) - 1) * 4);
+ break;
+ }
+
+ if (MI.getOpcode() == CSKY::ADDI32) {
+ Limit = std::min(Limit, (1U << 12));
+ break;
+ }
+
+ if (MI.getOpcode() == CSKY::ADDI16XZ) {
+ Limit = std::min(Limit, (1U << 3));
+ break;
+ }
+
+ // ADDI16 will not require an extra register;
+ // it can reuse the destination.
+ if (MI.getOpcode() == CSKY::ADDI16)
+ break;
+
+ // Otherwise check the addressing mode.
+ switch (MI.getDesc().TSFlags & CSKYII::AddrModeMask) {
+ default:
+ LLVM_DEBUG(MI.dump());
+ llvm_unreachable(
+ "Unhandled addressing mode in stack size limit calculation");
+ case CSKYII::AddrMode32B:
+ Limit = std::min(Limit, (1U << 12) - 1);
+ break;
+ case CSKYII::AddrMode32H:
+ Limit = std::min(Limit, ((1U << 12) - 1) * 2);
+ break;
+ case CSKYII::AddrMode32WD:
+ Limit = std::min(Limit, ((1U << 12) - 1) * 4);
+ break;
+ case CSKYII::AddrMode16B:
+ Limit = std::min(Limit, (1U << 5) - 1);
+ break;
+ case CSKYII::AddrMode16H:
+ Limit = std::min(Limit, ((1U << 5) - 1) * 2);
+ break;
+ case CSKYII::AddrMode16W:
+ Limit = std::min(Limit, ((1U << 5) - 1) * 4);
+ break;
+ case CSKYII::AddrMode32SDF:
+ Limit = std::min(Limit, ((1U << 8) - 1) * 4);
+ break;
+ }
+ break; // At most one FI per instruction
+ }
+ }
+ }
+
+ return Limit;
+}
+
+void CSKYFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+ CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ if (hasFP(MF))
+ SavedRegs.set(CSKY::R8);
+
+ // Mark BP as used if function has dedicated base pointer.
+ if (hasBP(MF))
+ SavedRegs.set(CSKY::R7);
+
+ // If this is an interrupt handler that contains calls, unconditionally save
+ // all caller-saved registers and all FP registers, regardless of whether
+ // they are used.
+ if (MF.getFunction().hasFnAttribute("interrupt") && MFI.hasCalls()) {
+
+ static const MCPhysReg CSRegs[] = {CSKY::R0, CSKY::R1, CSKY::R2, CSKY::R3,
+ CSKY::R12, CSKY::R13, 0};
+
+ for (unsigned i = 0; CSRegs[i]; ++i)
+ SavedRegs.set(CSRegs[i]);
+
+ if (STI.hasHighRegisters()) {
+
+ static const MCPhysReg CSHRegs[] = {CSKY::R18, CSKY::R19, CSKY::R20,
+ CSKY::R21, CSKY::R22, CSKY::R23,
+ CSKY::R24, CSKY::R25, 0};
+
+ for (unsigned i = 0; CSHRegs[i]; ++i)
+ SavedRegs.set(CSHRegs[i]);
+ }
+
+ static const MCPhysReg CSF32Regs[] = {
+ CSKY::F8_32, CSKY::F9_32, CSKY::F10_32,
+ CSKY::F11_32, CSKY::F12_32, CSKY::F13_32,
+ CSKY::F14_32, CSKY::F15_32, 0};
+ static const MCPhysReg CSF64Regs[] = {
+ CSKY::F8_64, CSKY::F9_64, CSKY::F10_64,
+ CSKY::F11_64, CSKY::F12_64, CSKY::F13_64,
+ CSKY::F14_64, CSKY::F15_64, 0};
+
+ const MCPhysReg *FRegs = nullptr;
+ if (STI.hasFPUv2DoubleFloat() || STI.hasFPUv3DoubleFloat())
+ FRegs = CSF64Regs;
+ else if (STI.hasFPUv2SingleFloat() || STI.hasFPUv3SingleFloat())
+ FRegs = CSF32Regs;
+
+ if (FRegs != nullptr) {
+ const MCPhysReg *Regs = MF.getRegInfo().getCalleeSavedRegs();
+
+ for (unsigned i = 0; Regs[i]; ++i)
+ if (CSKY::FPR32RegClass.contains(Regs[i]) ||
+ CSKY::FPR64RegClass.contains(Regs[i])) {
+ unsigned x = 0;
+ for (; FRegs[x]; ++x)
+ if (FRegs[x] == Regs[i])
+ break;
+ if (FRegs[x] == 0)
+ SavedRegs.set(Regs[i]);
+ }
+ }
+ }
+
+ CFI->setLRIsSpilled(SavedRegs.test(CSKY::R15));
+
+ unsigned CSStackSize = 0;
+ for (unsigned Reg : SavedRegs.set_bits()) {
+ auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
+ CSStackSize += RegSize;
+ }
+
+ CFI->setCalleeSaveAreaSize(CSStackSize);
+
+ uint64_t Limit = estimateRSStackSizeLimit(MF, STI);
+
+ bool BigFrame = (MFI.estimateStackSize(MF) + CSStackSize >= Limit);
+
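+ // Reserve an emergency spill slot for the register scavenger when frame
+ // offsets may not fit the available immediate fields, when the carry (C)
+ // register has been spilled, or when the subtarget lacks the E2 extension.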
+ if (BigFrame || CFI->isCRSpilled() || !STI.hasE2()) {
+ const TargetRegisterClass *RC = &CSKY::GPRRegClass;
+ unsigned size = TRI->getSpillSize(*RC);
+ Align align = TRI->getSpillAlign(*RC);
+
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(size, align, false));
+ }
+}
+
+// Do not reserve stack space within the prologue for outgoing variables when
+// the function contains variable-sized objects; instead, let
+// eliminateCallFramePseudoInstr reserve the stack space for them.
+bool CSKYFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+ return !MF.getFrameInfo().hasVarSizedObjects();
+}
+
+bool CSKYFrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return true;
+
+ MachineFunction *MF = MBB.getParent();
+ const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+ DebugLoc DL;
+ if (MI != MBB.end() && !MI->isDebugInstr())
+ DL = MI->getDebugLoc();
+
+ for (auto &CS : CSI) {
+ // Insert the spill to the stack frame.
+ Register Reg = CS.getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.storeRegToStackSlot(MBB, MI, Reg, true, CS.getFrameIdx(), RC, TRI);
+ }
+
+ return true;
+}
+
+bool CSKYFrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return true;
+
+ MachineFunction *MF = MBB.getParent();
+ const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+ DebugLoc DL;
+ if (MI != MBB.end() && !MI->isDebugInstr())
+ DL = MI->getDebugLoc();
+
+ for (auto &CS : reverse(CSI)) {
+ Register Reg = CS.getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI);
+ assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!");
+ }
+
+ return true;
+}
+
+// Eliminate ADJCALLSTACKDOWN, ADJCALLSTACKUP pseudo instructions.
+MachineBasicBlock::iterator CSKYFrameLowering::eliminateCallFramePseudoInstr(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ Register SPReg = CSKY::R14;
+ DebugLoc DL = MI->getDebugLoc();
+
+ if (!hasReservedCallFrame(MF)) {
+ // If space has not been reserved for a call frame, ADJCALLSTACKDOWN and
+ // ADJCALLSTACKUP must be converted to instructions manipulating the stack
+ // pointer. This is necessary when there is a variable length stack
+ // allocation (e.g. alloca), which means it's not possible to allocate
+ // space for outgoing arguments from within the function prologue.
+ int64_t Amount = MI->getOperand(0).getImm();
+
+ if (Amount != 0) {
+ // Ensure the stack remains aligned after adjustment.
+ Amount = alignSPAdjust(Amount);
+
+ if (MI->getOpcode() == CSKY::ADJCALLSTACKDOWN)
+ Amount = -Amount;
+
+ adjustReg(MBB, MI, DL, SPReg, SPReg, Amount, MachineInstr::NoFlags);
+ }
+ }
+
+ return MBB.erase(MI);
+}
+
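+// Add the (possibly negative) immediate Val to SrcReg and write the result to
+// DestReg, using a single ADDI32/SUBI32 or the 16-bit SP-relative forms when
+// the immediate fits, and otherwise materializing the constant in a scratch
+// register and using a register-register add/sub.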
+void CSKYFrameLowering::adjustReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, Register DestReg,
+ Register SrcReg, int64_t Val,
+ MachineInstr::MIFlag Flag) const {
+ const CSKYInstrInfo *TII = STI.getInstrInfo();
+
+ if (DestReg == SrcReg && Val == 0)
+ return;
+
+ // TODO: Add 16-bit instruction support with immediate num
+ if (STI.hasE2() && isUInt<12>(std::abs(Val) - 1)) {
+ BuildMI(MBB, MBBI, DL, TII->get(Val < 0 ? CSKY::SUBI32 : CSKY::ADDI32),
+ DestReg)
+ .addReg(SrcReg)
+ .addImm(std::abs(Val))
+ .setMIFlag(Flag);
+ } else if (!STI.hasE2() && isShiftedUInt<7, 2>(std::abs(Val))) {
+ BuildMI(MBB, MBBI, DL,
+ TII->get(Val < 0 ? CSKY::SUBI16SPSP : CSKY::ADDI16SPSP), CSKY::R14)
+ .addReg(CSKY::R14, RegState::Kill)
+ .addImm(std::abs(Val))
+ .setMIFlag(Flag);
+ } else {
+
+ unsigned Op = 0;
+
+ if (STI.hasE2()) {
+ Op = Val < 0 ? CSKY::SUBU32 : CSKY::ADDU32;
+ } else {
+ assert(SrcReg == DestReg);
+ Op = Val < 0 ? CSKY::SUBU16XZ : CSKY::ADDU16XZ;
+ }
+
+ Register ScratchReg = TII->movImm(MBB, MBBI, DL, std::abs(Val), Flag);
+
+ BuildMI(MBB, MBBI, DL, TII->get(Op), DestReg)
+ .addReg(SrcReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .setMIFlag(Flag);
+ }
+}
+
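+// Resolve a frame index to a base register and offset: callee-saved spill
+// slots are addressed from SP, fixed objects from FP when one is available,
+// and other objects from SP or the base pointer when the stack is realigned.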
+StackOffset
+CSKYFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
+ const CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+ const auto &CSI = MFI.getCalleeSavedInfo();
+
+ int MinCSFI = 0;
+ int MaxCSFI = -1;
+
+ int Offset = MFI.getObjectOffset(FI) + MFI.getOffsetAdjustment();
+
+ if (CSI.size()) {
+ MinCSFI = CSI[0].getFrameIdx();
+ MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+ }
+
+ if (FI >= MinCSFI && FI <= MaxCSFI) {
+ FrameReg = CSKY::R14;
+ Offset += CFI->getVarArgsSaveSize() + CFI->getCalleeSaveAreaSize();
+ } else if (RI->hasStackRealignment(MF)) {
+ assert(hasFP(MF));
+ if (!MFI.isFixedObjectIndex(FI)) {
+ FrameReg = hasBP(MF) ? getBPReg(STI) : CSKY::R14;
+ Offset += MFI.getStackSize();
+ } else {
+ FrameReg = getFPReg(STI);
+ Offset += CFI->getVarArgsSaveSize() + CFI->getCalleeSaveAreaSize();
+ }
+ } else {
+ if (MFI.isFixedObjectIndex(FI) && hasFP(MF)) {
+ FrameReg = getFPReg(STI);
+ Offset += CFI->getVarArgsSaveSize() + CFI->getCalleeSaveAreaSize();
+ } else {
+ FrameReg = hasBP(MF) ? getBPReg(STI) : CSKY::R14;
+ Offset += MFI.getStackSize();
+ }
+ }
+
+ return StackOffset::getFixed(Offset);
}
diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.h b/llvm/lib/Target/CSKY/CSKYFrameLowering.h
index 49921a1866bc..69bf01cf1801 100644
--- a/llvm/lib/Target/CSKY/CSKYFrameLowering.h
+++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.h
@@ -21,6 +21,11 @@ class CSKYSubtarget;
class CSKYFrameLowering : public TargetFrameLowering {
const CSKYSubtarget &STI;
+ void determineFrameLayout(MachineFunction &MF) const;
+ void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, Register DestReg, Register SrcReg,
+ int64_t Val, MachineInstr::MIFlag Flag) const;
+
public:
explicit CSKYFrameLowering(const CSKYSubtarget &STI)
: TargetFrameLowering(StackGrowsDown,
@@ -31,8 +36,39 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ StackOffset getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const override;
+
+ void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+ RegScavenger *RS) const override;
+
+ bool assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override {
+
+ std::reverse(CSI.begin(), CSI.end());
+
+ return false;
+ }
+
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
+
bool hasFP(const MachineFunction &MF) const override;
bool hasBP(const MachineFunction &MF) const;
+
+ bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+ MachineBasicBlock::iterator
+ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const override;
};
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp b/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp
index 8dc91904b8cc..d58f9095aa0d 100644
--- a/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp
+++ b/llvm/lib/Target/CSKY/CSKYISelDAGToDAG.cpp
@@ -68,6 +68,24 @@ void CSKYDAGToDAGISel::Select(SDNode *N) {
case ISD::SUBCARRY:
IsSelected = selectSubCarry(N);
break;
+ case ISD::GLOBAL_OFFSET_TABLE: {
+ Register GP = Subtarget->getInstrInfo()->getGlobalBaseReg(*MF);
+ ReplaceNode(N, CurDAG->getRegister(GP, N->getValueType(0)).getNode());
+
+ IsSelected = true;
+ break;
+ }
+ case ISD::FrameIndex: {
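+ // Materialize the frame index as "ADDI <reg>, <fi>, 0"; the frame-index
+ // operand is rewritten to a real base register and offset during frame
+ // index elimination.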
+ SDValue Imm = CurDAG->getTargetConstant(0, Dl, MVT::i32);
+ int FI = cast<FrameIndexSDNode>(N)->getIndex();
+ SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32);
+ ReplaceNode(N, CurDAG->getMachineNode(Subtarget->hasE2() ? CSKY::ADDI32
+ : CSKY::ADDI16XZ,
+ Dl, MVT::i32, TFI, Imm));
+
+ IsSelected = true;
+ break;
+ }
}
if (IsSelected)
diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
index a1f7cc685d4c..0b589e3d3e4f 100644
--- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
+++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
@@ -13,6 +13,7 @@
#include "CSKYISelLowering.h"
#include "CSKYCallingConv.h"
+#include "CSKYConstantPoolValue.h"
#include "CSKYMachineFunctionInfo.h"
#include "CSKYRegisterInfo.h"
#include "CSKYSubtarget.h"
@@ -37,6 +38,18 @@ CSKYTargetLowering::CSKYTargetLowering(const TargetMachine &TM,
// Register Class
addRegisterClass(MVT::i32, &CSKY::GPRRegClass);
+ if (STI.useHardFloat()) {
+ if (STI.hasFPUv2SingleFloat())
+ addRegisterClass(MVT::f32, &CSKY::sFPR32RegClass);
+ else if (STI.hasFPUv3SingleFloat())
+ addRegisterClass(MVT::f32, &CSKY::FPR32RegClass);
+
+ if (STI.hasFPUv2DoubleFloat())
+ addRegisterClass(MVT::f64, &CSKY::sFPR64RegClass);
+ else if (STI.hasFPUv3DoubleFloat())
+ addRegisterClass(MVT::f64, &CSKY::FPR64RegClass);
+ }
+
setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
@@ -53,16 +66,29 @@ CSKYTargetLowering::CSKYTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+ setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC, MVT::i32, Expand);
+ setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::MULHS, MVT::i32, Expand);
setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ setOperationAction(ISD::VAARG, MVT::Other, Expand);
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::i32, MVT::i1, Promote);
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i1, Promote);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, MVT::i1, Promote);
+ setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+ setOperationAction(ISD::ExternalSymbol, MVT::i32, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+ setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+ setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+
if (!Subtarget.hasE2()) {
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i8, Expand);
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i16, Expand);
@@ -77,6 +103,44 @@ CSKYTargetLowering::CSKYTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UDIV, MVT::i32, Expand);
}
+ if (!Subtarget.has3r2E3r3()) {
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
+ }
+
+ // Float
+
+ ISD::CondCode FPCCToExtend[] = {
+ ISD::SETONE, ISD::SETUEQ, ISD::SETUGT,
+ ISD::SETUGE, ISD::SETULT, ISD::SETULE,
+ };
+
+ ISD::NodeType FPOpToExpand[] = {ISD::FSIN, ISD::FCOS, ISD::FSINCOS,
+ ISD::FPOW, ISD::FREM, ISD::FCOPYSIGN};
+
+ if (STI.useHardFloat()) {
+
+ MVT AllVTy[] = {MVT::f32, MVT::f64};
+
+ for (auto VT : AllVTy) {
+ setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::BR_CC, VT, Expand);
+
+ for (auto CC : FPCCToExtend)
+ setCondCodeAction(CC, VT, Expand);
+ for (auto Op : FPOpToExpand)
+ setOperationAction(Op, VT, Expand);
+ }
+
+ if (STI.hasFPUv2SingleFloat() || STI.hasFPUv3SingleFloat()) {
+ setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+ }
+ if (STI.hasFPUv2DoubleFloat() || STI.hasFPUv3DoubleFloat()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+ }
+ }
+
// Compute derived properties from the register classes.
computeRegisterProperties(STI.getRegisterInfo());
@@ -92,6 +156,30 @@ CSKYTargetLowering::CSKYTargetLowering(const TargetMachine &TM,
setSchedulingPreference(Sched::Source);
}
+SDValue CSKYTargetLowering::LowerOperation(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("unimplemented op");
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::ExternalSymbol:
+ return LowerExternalSymbol(Op, DAG);
+ case ISD::GlobalTLSAddress:
+ return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::JumpTable:
+ return LowerJumpTable(Op, DAG);
+ case ISD::BlockAddress:
+ return LowerBlockAddress(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+ case ISD::FRAMEADDR:
+ return LowerFRAMEADDR(Op, DAG);
+ case ISD::RETURNADDR:
+ return LowerRETURNADDR(Op, DAG);
+ }
+}
+
EVT CSKYTargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext &Context, EVT VT) const {
if (!VT.isVector())
@@ -145,6 +233,14 @@ static SDValue unpackFromRegLoc(const CSKYSubtarget &Subtarget,
case MVT::i32:
RC = &CSKY::GPRRegClass;
break;
+ case MVT::f32:
+ RC = Subtarget.hasFPUv2SingleFloat() ? &CSKY::sFPR32RegClass
+ : &CSKY::FPR32RegClass;
+ break;
+ case MVT::f64:
+ RC = Subtarget.hasFPUv2DoubleFloat() ? &CSKY::sFPR64RegClass
+ : &CSKY::FPR64RegClass;
+ break;
}
Register VReg = RegInfo.createVirtualRegister(RC);
@@ -181,6 +277,44 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
return Val;
}
+static SDValue unpack64(SelectionDAG &DAG, SDValue Chain, const CCValAssign &VA,
+ const SDLoc &DL) {
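+ // Unpack an f64/i64 argument that the ABI splits into two i32 halves: either
+ // the whole value lives on the stack, or the low half is passed in a GPR and
+ // the high half in the next GPR (or on the stack when the low half landed in
+ // the last argument register R3).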
+ assert(VA.getLocVT() == MVT::i32 &&
+ (VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::i64) &&
+ "Unexpected VA");
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+ if (VA.isMemLoc()) {
+ // f64/i64 is passed on the stack.
+ int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*Immutable=*/true);
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ return DAG.getLoad(VA.getValVT(), DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI));
+ }
+
+ assert(VA.isRegLoc() && "Expected register VA assignment");
+
+ Register LoVReg = RegInfo.createVirtualRegister(&CSKY::GPRRegClass);
+ RegInfo.addLiveIn(VA.getLocReg(), LoVReg);
+ SDValue Lo = DAG.getCopyFromReg(Chain, DL, LoVReg, MVT::i32);
+ SDValue Hi;
+ if (VA.getLocReg() == CSKY::R3) {
+ // Second half of f64/i64 is passed on the stack.
+ int FI = MFI.CreateFixedObject(4, 0, /*Immutable=*/true);
+ SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+ Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN,
+ MachinePointerInfo::getFixedStack(MF, FI));
+ } else {
+ // Second half of f64/i64 is passed in another GPR.
+ Register HiVReg = RegInfo.createVirtualRegister(&CSKY::GPRRegClass);
+ RegInfo.addLiveIn(VA.getLocReg() + 1, HiVReg);
+ Hi = DAG.getCopyFromReg(Chain, DL, HiVReg, MVT::i32);
+ }
+ return DAG.getNode(CSKYISD::BITCAST_FROM_LOHI, DL, VA.getValVT(), Lo, Hi);
+}
+
// Transform physical registers into virtual registers.
SDValue CSKYTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
@@ -210,7 +344,11 @@ SDValue CSKYTargetLowering::LowerFormalArguments(
CCValAssign &VA = ArgLocs[i];
SDValue ArgValue;
- if (VA.isRegLoc())
+ bool IsF64OnCSKY = VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64;
+
+ if (IsF64OnCSKY)
+ ArgValue = unpack64(DAG, Chain, VA, DL);
+ else if (VA.isRegLoc())
ArgValue = unpackFromRegLoc(Subtarget, DAG, Chain, VA, DL);
else
ArgValue = unpackFromMemLoc(DAG, Chain, VA, DL);
@@ -354,6 +492,255 @@ CSKYTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return DAG.getNode(CSKYISD::RET, DL, MVT::Other, RetOps);
}
+// Lower a call to a callseq_start + CALL + callseq_end chain, and add input
+// and output parameter nodes.
+SDValue CSKYTargetLowering::LowerCall(CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc &DL = CLI.DL;
+ SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+ SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+ SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
+ SDValue Chain = CLI.Chain;
+ SDValue Callee = CLI.Callee;
+ bool &IsTailCall = CLI.IsTailCall;
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsVarArg = CLI.IsVarArg;
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ MVT XLenVT = MVT::i32;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Analyze the operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
+
+ ArgCCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, IsVarArg));
+
+ // Check if it's really possible to do a tail call.
+ if (IsTailCall)
+ IsTailCall = false; // TODO: TailCallOptimization;
+
+ if (IsTailCall)
+ ++NumTailCalls;
+ else if (CLI.CB && CLI.CB->isMustTailCall())
+ report_fatal_error("failed to perform tail call elimination on a call "
+ "site marked musttail");
+
+ // Get a count of how many bytes are to be pushed on the stack.
+ unsigned NumBytes = ArgCCInfo.getNextStackOffset();
+
+ // Create local copies for byval args
+ SmallVector<SDValue, 8> ByValArgs;
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+ if (!Flags.isByVal())
+ continue;
+
+ SDValue Arg = OutVals[i];
+ unsigned Size = Flags.getByValSize();
+ Align Alignment = Flags.getNonZeroByValAlign();
+
+ int FI =
+ MF.getFrameInfo().CreateStackObject(Size, Alignment, /*isSS=*/false);
+ SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
+ SDValue SizeNode = DAG.getConstant(Size, DL, XLenVT);
+
+ Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment,
+ /*IsVolatile=*/false,
+ /*AlwaysInline=*/false, IsTailCall,
+ MachinePointerInfo(), MachinePointerInfo());
+ ByValArgs.push_back(FIPtr);
+ }
+
+ if (!IsTailCall)
+ Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL);
+
+ // Copy argument values to their designated locations.
+ SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+ SDValue StackPtr;
+ for (unsigned i = 0, j = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue ArgValue = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+ bool IsF64OnCSKY = VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64;
+
+ if (IsF64OnCSKY && VA.isRegLoc()) {
+ SDValue Split64 =
+ DAG.getNode(CSKYISD::BITCAST_TO_LOHI, DL,
+ DAG.getVTList(MVT::i32, MVT::i32), ArgValue);
+ SDValue Lo = Split64.getValue(0);
+ SDValue Hi = Split64.getValue(1);
+
+ Register RegLo = VA.getLocReg();
+ RegsToPass.push_back(std::make_pair(RegLo, Lo));
+
+ if (RegLo == CSKY::R3) {
+ // Second half of f64/i64 is passed on the stack.
+ // Work out the address of the stack slot.
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, DL, CSKY::R14, PtrVT);
+ // Emit the store.
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, Hi, StackPtr, MachinePointerInfo()));
+ } else {
+ // Second half of f64/i64 is passed in another GPR.
+ assert(RegLo < CSKY::R31 && "Invalid register pair");
+ Register RegHigh = RegLo + 1;
+ RegsToPass.push_back(std::make_pair(RegHigh, Hi));
+ }
+ continue;
+ }
+
+ ArgValue = convertValVTToLocVT(DAG, ArgValue, VA, DL);
+
+ // Use local copy if it is a byval arg.
+ if (Flags.isByVal())
+ ArgValue = ByValArgs[j++];
+
+ if (VA.isRegLoc()) {
+ // Queue up the argument copies and emit them at the end.
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgValue));
+ } else {
+ assert(VA.isMemLoc() && "Argument not register or memory");
+ assert(!IsTailCall && "Tail call not allowed if stack is used "
+ "for passing parameters");
+
+ // Work out the address of the stack slot.
+ if (!StackPtr.getNode())
+ StackPtr = DAG.getCopyFromReg(Chain, DL, CSKY::R14, PtrVT);
+ SDValue Address =
+ DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr,
+ DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
+
+ // Emit the store.
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, ArgValue, Address, MachinePointerInfo()));
+ }
+ }
+
+ // Join the stores, which are independent of one another.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ SDValue Glue;
+
+ // Build a sequence of copy-to-reg nodes, chained and glued together.
+ for (auto &Reg : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, DL, Reg.first, Reg.second, Glue);
+ Glue = Chain.getValue(1);
+ }
+
+ SmallVector<SDValue, 8> Ops;
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ bool IsRegCall = false;
+
+ Ops.push_back(Chain);
+
+ if (GlobalAddressSDNode *S = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = S->getGlobal();
+ bool IsLocal =
+ getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+
+ if (isPositionIndependent() || !Subtarget.has2E3()) {
+ IsRegCall = true;
+ Ops.push_back(getAddr<GlobalAddressSDNode, true>(S, DAG, IsLocal));
+ } else {
+ Ops.push_back(getTargetNode(cast<GlobalAddressSDNode>(Callee), DL, Ty,
+ DAG, CSKYII::MO_None));
+ Ops.push_back(getTargetConstantPoolValue(
+ cast<GlobalAddressSDNode>(Callee), Ty, DAG, CSKYII::MO_None));
+ }
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(
+ *MF.getFunction().getParent(), nullptr);
+
+ if (isPositionIndependent() || !Subtarget.has2E3()) {
+ IsRegCall = true;
+ Ops.push_back(getAddr<ExternalSymbolSDNode, true>(S, DAG, IsLocal));
+ } else {
+ Ops.push_back(getTargetNode(cast<ExternalSymbolSDNode>(Callee), DL, Ty,
+ DAG, CSKYII::MO_None));
+ Ops.push_back(getTargetConstantPoolValue(
+ cast<ExternalSymbolSDNode>(Callee), Ty, DAG, CSKYII::MO_None));
+ }
+ } else {
+ IsRegCall = true;
+ Ops.push_back(Callee);
+ }
+
+ // Add argument registers to the end of the list so that they are
+ // known live into the call.
+ for (auto &Reg : RegsToPass)
+ Ops.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+
+ if (!IsTailCall) {
+ // Add a register mask operand representing the call-preserved registers.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+ }
+
+ // Glue the call to the argument copies, if any.
+ if (Glue.getNode())
+ Ops.push_back(Glue);
+
+ // Emit the call.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ if (IsTailCall) {
+ MF.getFrameInfo().setHasTailCall();
+ return DAG.getNode(IsRegCall ? CSKYISD::TAILReg : CSKYISD::TAIL, DL,
+ NodeTys, Ops);
+ }
+
+ Chain = DAG.getNode(IsRegCall ? CSKYISD::CALLReg : CSKYISD::CALL, DL, NodeTys,
+ Ops);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
+ Glue = Chain.getValue(1);
+
+ // Mark the end of the call, which is glued to the call itself.
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getConstant(NumBytes, DL, PtrVT, true),
+ DAG.getConstant(0, DL, PtrVT, true), Glue, DL);
+ Glue = Chain.getValue(1);
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> CSKYLocs;
+ CCState RetCCInfo(CallConv, IsVarArg, MF, CSKYLocs, *DAG.getContext());
+ RetCCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, IsVarArg));
+
+ // Copy all of the result registers out of their specified physreg.
+ for (auto &VA : CSKYLocs) {
+ // Copy the value out
+ SDValue RetValue =
+ DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), Glue);
+ // Glue the RetValue to the end of the call sequence
+ Chain = RetValue.getValue(1);
+ Glue = RetValue.getValue(2);
+
+ bool IsF64OnCSKY = VA.getLocVT() == MVT::i32 && VA.getValVT() == MVT::f64;
+
+ if (IsF64OnCSKY) {
+ assert(VA.getLocReg() == GPRArgRegs[0] && "Unexpected reg assignment");
+ SDValue RetValue2 =
+ DAG.getCopyFromReg(Chain, DL, GPRArgRegs[1], MVT::i32, Glue);
+ Chain = RetValue2.getValue(1);
+ Glue = RetValue2.getValue(2);
+ RetValue = DAG.getNode(CSKYISD::BITCAST_FROM_LOHI, DL, VA.getValVT(),
+ RetValue, RetValue2);
+ }
+
+ RetValue = convertLocVTToValVT(DAG, RetValue, VA, DL);
+
+ InVals.push_back(RetValue);
+ }
+
+ return Chain;
+}
+
CCAssignFn *CSKYTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
bool IsVarArg) const {
if (IsVarArg || !Subtarget.useHardFloatABI())
@@ -370,6 +757,165 @@ CCAssignFn *CSKYTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
return CC_CSKY_ABIV2_FP;
}
+static CSKYCP::CSKYCPModifier getModifier(unsigned Flags) {
+
+ if (Flags == CSKYII::MO_ADDR32)
+ return CSKYCP::ADDR;
+ else if (Flags == CSKYII::MO_GOT32)
+ return CSKYCP::GOT;
+ else if (Flags == CSKYII::MO_GOTOFF)
+ return CSKYCP::GOTOFF;
+ else if (Flags == CSKYII::MO_PLT32)
+ return CSKYCP::PLT;
+ else if (Flags == CSKYII::MO_None)
+ return CSKYCP::NO_MOD;
+ else
+ assert(0 && "unknown CSKYII Modifier");
+ return CSKYCP::NO_MOD;
+}
+
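+// Wrap a symbol in a CSKY-specific constant-pool value carrying the relocation
+// modifier, so its address can later be loaded from the constant pool.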
+SDValue CSKYTargetLowering::getTargetConstantPoolValue(GlobalAddressSDNode *N,
+ EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flags) const {
+ CSKYConstantPoolValue *CPV = CSKYConstantPoolConstant::Create(
+ N->getGlobal(), CSKYCP::CPValue, 0, getModifier(Flags), false);
+
+ return DAG.getTargetConstantPool(CPV, Ty);
+}
+
+static MachineBasicBlock *
+emitSelectPseudo(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode) {
+
+ const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // To "insert" a SELECT instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between, and a branch opcode to use.
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = ++BB->getIterator();
+
+ // thisMBB:
+ // ...
+ // TrueVal = ...
+ // bt32 c, sinkMBB
+ // fallthrough --> copyMBB
+ MachineBasicBlock *thisMBB = BB;
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *copyMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, copyMBB);
+ F->insert(It, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), BB,
+ std::next(MachineBasicBlock::iterator(MI)), BB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Next, add the true and fallthrough blocks as its successors.
+ BB->addSuccessor(copyMBB);
+ BB->addSuccessor(sinkMBB);
+
+ // bt32 condition, sinkMBB
+ BuildMI(BB, DL, TII.get(Opcode))
+ .addReg(MI.getOperand(1).getReg())
+ .addMBB(sinkMBB);
+
+ // copyMBB:
+ // %FalseValue = ...
+ // # fallthrough to sinkMBB
+ BB = copyMBB;
+
+ // Update machine-CFG edges
+ BB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // %Result = phi [ %TrueValue, thisMBB ], [ %FalseValue, copyMBB ]
+ // ...
+ BB = sinkMBB;
+
+ BuildMI(*BB, BB->begin(), DL, TII.get(CSKY::PHI), MI.getOperand(0).getReg())
+ .addReg(MI.getOperand(2).getReg())
+ .addMBB(thisMBB)
+ .addReg(MI.getOperand(3).getReg())
+ .addMBB(copyMBB);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+
+ return BB;
+}
+
+MachineBasicBlock *
+CSKYTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ switch (MI.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected instr type to insert");
+ case CSKY::ISEL32:
+ return emitSelectPseudo(MI, BB, CSKY::BT32);
+ case CSKY::ISEL16:
+ return emitSelectPseudo(MI, BB, CSKY::BT16);
+ }
+}
+
+SDValue CSKYTargetLowering::getTargetConstantPoolValue(ExternalSymbolSDNode *N,
+ EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flags) const {
+ CSKYConstantPoolValue *CPV =
+ CSKYConstantPoolSymbol::Create(Type::getInt32Ty(*DAG.getContext()),
+ N->getSymbol(), 0, getModifier(Flags));
+
+ return DAG.getTargetConstantPool(CPV, Ty);
+}
+
+SDValue CSKYTargetLowering::getTargetConstantPoolValue(JumpTableSDNode *N,
+ EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flags) const {
+ CSKYConstantPoolValue *CPV =
+ CSKYConstantPoolJT::Create(Type::getInt32Ty(*DAG.getContext()),
+ N->getIndex(), 0, getModifier(Flags));
+ return DAG.getTargetConstantPool(CPV, Ty);
+}
+
+SDValue CSKYTargetLowering::getTargetConstantPoolValue(BlockAddressSDNode *N,
+ EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flags) const {
+ CSKYConstantPoolValue *CPV = CSKYConstantPoolConstant::Create(
+ N->getBlockAddress(), CSKYCP::CPBlockAddress, 0, getModifier(Flags),
+ false);
+ return DAG.getTargetConstantPool(CPV, Ty);
+}
+
+SDValue CSKYTargetLowering::getTargetNode(GlobalAddressSDNode *N, SDLoc DL,
+ EVT Ty, SelectionDAG &DAG,
+ unsigned Flags) const {
+ return DAG.getTargetGlobalAddress(N->getGlobal(), DL, Ty, 0, Flags);
+}
+
+SDValue CSKYTargetLowering::getTargetNode(ExternalSymbolSDNode *N, SDLoc DL,
+ EVT Ty, SelectionDAG &DAG,
+ unsigned Flags) const {
+ return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flags);
+}
+
+SDValue CSKYTargetLowering::getTargetNode(JumpTableSDNode *N, SDLoc DL, EVT Ty,
+ SelectionDAG &DAG,
+ unsigned Flags) const {
+ return DAG.getTargetJumpTable(N->getIndex(), Ty, Flags);
+}
+
+SDValue CSKYTargetLowering::getTargetNode(BlockAddressSDNode *N, SDLoc DL,
+ EVT Ty, SelectionDAG &DAG,
+ unsigned Flags) const {
+ return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, N->getOffset(),
+ Flags);
+}
+
const char *CSKYTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
default:
@@ -380,7 +926,243 @@ const char *CSKYTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "CSKYISD::NIR";
case CSKYISD::RET:
return "CSKYISD::RET";
+ case CSKYISD::CALL:
+ return "CSKYISD::CALL";
+ case CSKYISD::CALLReg:
+ return "CSKYISD::CALLReg";
+ case CSKYISD::TAIL:
+ return "CSKYISD::TAIL";
+ case CSKYISD::TAILReg:
+ return "CSKYISD::TAILReg";
+ case CSKYISD::LOAD_ADDR:
+ return "CSKYISD::LOAD_ADDR";
case CSKYISD::BITCAST_TO_LOHI:
return "CSKYISD::BITCAST_TO_LOHI";
+ case CSKYISD::BITCAST_FROM_LOHI:
+ return "CSKYISD::BITCAST_FROM_LOHI";
}
}
+
+SDValue CSKYTargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT Ty = Op.getValueType();
+ GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
+ int64_t Offset = N->getOffset();
+
+ const GlobalValue *GV = N->getGlobal();
+ bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
+ SDValue Addr = getAddr<GlobalAddressSDNode, false>(N, DAG, IsLocal);
+
+ // In order to maximise the opportunity for common subexpression elimination,
+ // emit a separate ADD node for the global address offset instead of folding
+ // it in the global address node. Later peephole optimisations may choose to
+ // fold it back in when profitable.
+ if (Offset != 0)
+ return DAG.getNode(ISD::ADD, DL, Ty, Addr,
+ DAG.getConstant(Offset, DL, MVT::i32));
+ return Addr;
+}
+
+SDValue CSKYTargetLowering::LowerExternalSymbol(SDValue Op,
+ SelectionDAG &DAG) const {
+ ExternalSymbolSDNode *N = cast<ExternalSymbolSDNode>(Op);
+
+ return getAddr(N, DAG, false);
+}
+
+SDValue CSKYTargetLowering::LowerJumpTable(SDValue Op,
+ SelectionDAG &DAG) const {
+ JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
+
+ return getAddr<JumpTableSDNode, false>(N, DAG);
+}
+
+SDValue CSKYTargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
+
+ return getAddr(N, DAG);
+}
+
+SDValue CSKYTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ CSKYMachineFunctionInfo *FuncInfo = MF.getInfo<CSKYMachineFunctionInfo>();
+
+ SDLoc DL(Op);
+ SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
+ getPointerTy(MF.getDataLayout()));
+
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), DL, FI, Op.getOperand(1),
+ MachinePointerInfo(SV));
+}
+
+SDValue CSKYTargetLowering::LowerFRAMEADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ const CSKYRegisterInfo &RI = *Subtarget.getRegisterInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setFrameAddressIsTaken(true);
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ Register FrameReg = RI.getFrameRegister(MF);
+ SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+ while (Depth--)
+ FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+ MachinePointerInfo());
+ return FrameAddr;
+}
+
+SDValue CSKYTargetLowering::LowerRETURNADDR(SDValue Op,
+ SelectionDAG &DAG) const {
+ const CSKYRegisterInfo &RI = *Subtarget.getRegisterInfo();
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MFI.setReturnAddressIsTaken(true);
+
+ if (verifyReturnAddressArgumentIsConstant(Op, DAG))
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ SDLoc dl(Op);
+ unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ if (Depth) {
+ SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+ SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
+ return DAG.getLoad(VT, dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
+ MachinePointerInfo());
+ }
+ // Return the value of the return address register, marking it an implicit
+ // live-in.
+ unsigned Reg = MF.addLiveIn(RI.getRARegister(), getRegClassFor(MVT::i32));
+ return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
+}
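For illustration, LowerFRAMEADDR and LowerRETURNADDR above service the frameaddress/returnaddress intrinsics that Clang and GCC emit for the corresponding builtins; a depth of 0 reads the frame or link register directly, while larger depths walk the saved frame chain as the loops above show (a minimal sketch, assuming a GCC/Clang-compatible compiler):

    // Depth 0 reads the current frame pointer / return address directly;
    // depth 1 forces the frame-chain walk in the lowering code above.
    void *currentFrame() { return __builtin_frame_address(0); }
    void *callerReturnAddress() { return __builtin_return_address(1); }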
+
+Register CSKYTargetLowering::getExceptionPointerRegister(
+ const Constant *PersonalityFn) const {
+ return CSKY::R0;
+}
+
+Register CSKYTargetLowering::getExceptionSelectorRegister(
+ const Constant *PersonalityFn) const {
+ return CSKY::R1;
+}
+
+SDValue CSKYTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT Ty = Op.getValueType();
+ GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
+ int64_t Offset = N->getOffset();
+ MVT XLenVT = MVT::i32;
+
+ TLSModel::Model Model = getTargetMachine().getTLSModel(N->getGlobal());
+ SDValue Addr;
+ switch (Model) {
+ case TLSModel::LocalExec:
+ Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/false);
+ break;
+ case TLSModel::InitialExec:
+ Addr = getStaticTLSAddr(N, DAG, /*UseGOT=*/true);
+ break;
+ case TLSModel::LocalDynamic:
+ case TLSModel::GeneralDynamic:
+ Addr = getDynamicTLSAddr(N, DAG);
+ break;
+ }
+
+  // In order to maximise the opportunity for common subexpression elimination,
+  // emit a separate ADD node for the global address offset instead of folding
+  // it into the global address node. Later peephole optimisations may choose
+  // to fold it back in when profitable.
+ if (Offset != 0)
+ return DAG.getNode(ISD::ADD, DL, Ty, Addr,
+ DAG.getConstant(Offset, DL, XLenVT));
+ return Addr;
+}
+
+SDValue CSKYTargetLowering::getStaticTLSAddr(GlobalAddressSDNode *N,
+ SelectionDAG &DAG,
+ bool UseGOT) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>();
+
+ unsigned CSKYPCLabelIndex = CFI->createPICLabelUId();
+
+ SDLoc DL(N);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+
+ CSKYCP::CSKYCPModifier Flag = UseGOT ? CSKYCP::TLSIE : CSKYCP::TLSLE;
+  bool AddCurrentAddr = UseGOT;
+  unsigned char PCAdjust = UseGOT ? 4 : 0;
+
+  CSKYConstantPoolValue *CPV = CSKYConstantPoolConstant::Create(
+      N->getGlobal(), CSKYCP::CPValue, PCAdjust, Flag, AddCurrentAddr,
+      CSKYPCLabelIndex);
+ SDValue CAddr = DAG.getTargetConstantPool(CPV, Ty);
+
+ SDValue Load;
+ if (UseGOT) {
+ SDValue PICLabel = DAG.getTargetConstant(CSKYPCLabelIndex, DL, MVT::i32);
+ auto *LRWGRS = DAG.getMachineNode(CSKY::PseudoTLSLA32, DL, {Ty, Ty},
+ {CAddr, PICLabel});
+ auto LRWADDGRS =
+ DAG.getNode(ISD::ADD, DL, Ty, SDValue(LRWGRS, 0), SDValue(LRWGRS, 1));
+ Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), LRWADDGRS,
+ MachinePointerInfo(N->getGlobal()));
+ } else {
+ Load = SDValue(DAG.getMachineNode(CSKY::LRW32, DL, Ty, CAddr), 0);
+ }
+
+ // Add the thread pointer.
+ SDValue TPReg = DAG.getRegister(CSKY::R31, MVT::i32);
+ return DAG.getNode(ISD::ADD, DL, Ty, Load, TPReg);
+}
+
+SDValue CSKYTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N,
+ SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>();
+
+ unsigned CSKYPCLabelIndex = CFI->createPICLabelUId();
+
+ SDLoc DL(N);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ IntegerType *CallTy = Type::getIntNTy(*DAG.getContext(), Ty.getSizeInBits());
+
+ CSKYConstantPoolValue *CPV =
+ CSKYConstantPoolConstant::Create(N->getGlobal(), CSKYCP::CPValue, 4,
+ CSKYCP::TLSGD, true, CSKYPCLabelIndex);
+ SDValue Addr = DAG.getTargetConstantPool(CPV, Ty);
+ SDValue PICLabel = DAG.getTargetConstant(CSKYPCLabelIndex, DL, MVT::i32);
+
+ auto *LRWGRS =
+ DAG.getMachineNode(CSKY::PseudoTLSLA32, DL, {Ty, Ty}, {Addr, PICLabel});
+
+ auto Load =
+ DAG.getNode(ISD::ADD, DL, Ty, SDValue(LRWGRS, 0), SDValue(LRWGRS, 1));
+
+ // Prepare argument list to generate call.
+ ArgListTy Args;
+ ArgListEntry Entry;
+ Entry.Node = Load;
+ Entry.Ty = CallTy;
+ Args.push_back(Entry);
+
+ // Setup call to __tls_get_addr.
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL)
+ .setChain(DAG.getEntryNode())
+ .setLibCallee(CallingConv::C, CallTy,
+ DAG.getExternalSymbol("__tls_get_addr", Ty),
+ std::move(Args));
+ SDValue V = LowerCallTo(CLI).first;
+
+ return V;
+}
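For illustration, getDynamicTLSAddr above is reached for thread-local globals under the general- or local-dynamic TLS models, where the address must be resolved at run time via __tls_get_addr; source as small as the following is enough to exercise that path when compiled as position-independent code (a minimal sketch, no CSKY specifics assumed):

    // Each access to `counter` becomes a GlobalAddressSDNode whose TLS model
    // LowerGlobalTLSAddress maps to getDynamicTLSAddr, and hence to a call
    // to __tls_get_addr.
    thread_local int counter = 0;
    int nextId() { return ++counter; }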
diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.h b/llvm/lib/Target/CSKY/CSKYISelLowering.h
index 7557c11f50a8..e1744d5ce220 100644
--- a/llvm/lib/Target/CSKY/CSKYISelLowering.h
+++ b/llvm/lib/Target/CSKY/CSKYISelLowering.h
@@ -27,7 +27,15 @@ enum NodeType : unsigned {
NIE,
NIR,
RET,
- BITCAST_TO_LOHI
+ CALL,
+ CALLReg,
+ TAIL,
+ TAILReg,
+ LOAD_ADDR,
+ // i32, i32 <-- f64
+ BITCAST_TO_LOHI,
+  // f64 <-- i32, i32
+ BITCAST_FROM_LOHI,
};
}
@@ -38,6 +46,8 @@ public:
explicit CSKYTargetLowering(const TargetMachine &TM,
const CSKYSubtarget &STI);
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -58,8 +68,96 @@ private:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
const char *getTargetNodeName(unsigned Opcode) const override;
+ /// If a physical register, this returns the register that receives the
+ /// exception address on entry to an EH pad.
+ Register
+ getExceptionPointerRegister(const Constant *PersonalityFn) const override;
+
+ /// If a physical register, this returns the register that receives the
+ /// exception typeid on entry to a landing pad.
+ Register
+ getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+
+ bool isSelectSupported(SelectSupportKind Kind) const override {
+ // CSKY does not support scalar condition selects on vectors.
+ return (Kind != ScalarCondVectorVal);
+ }
+
+ MachineBasicBlock *
+ EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *BB) const override;
+
+ SDValue getTargetNode(GlobalAddressSDNode *N, SDLoc DL, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) const;
+
+ SDValue getTargetNode(ExternalSymbolSDNode *N, SDLoc DL, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) const;
+
+ SDValue getTargetNode(JumpTableSDNode *N, SDLoc DL, EVT Ty, SelectionDAG &DAG,
+ unsigned Flags) const;
+
+ SDValue getTargetNode(BlockAddressSDNode *N, SDLoc DL, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) const;
+
+ SDValue getTargetConstantPoolValue(GlobalAddressSDNode *N, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) const;
+
+ SDValue getTargetConstantPoolValue(ExternalSymbolSDNode *N, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) const;
+
+ SDValue getTargetConstantPoolValue(JumpTableSDNode *N, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) const;
+
+ SDValue getTargetConstantPoolValue(BlockAddressSDNode *N, EVT Ty,
+ SelectionDAG &DAG, unsigned Flags) const;
+
+ template <class NodeTy, bool IsCall = false>
+ SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const {
+ SDLoc DL(N);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+
+ unsigned Flag = CSKYII::MO_None;
+ bool IsPIC = isPositionIndependent();
+
+ if (IsPIC)
+ Flag = IsLocal ? CSKYII::MO_GOTOFF
+ : IsCall ? CSKYII::MO_PLT32
+ : CSKYII::MO_GOT32;
+
+ SDValue TCPV = getTargetConstantPoolValue(N, Ty, DAG, Flag);
+ SDValue TV = getTargetNode(N, DL, Ty, DAG, Flag);
+ SDValue Addr = DAG.getNode(CSKYISD::LOAD_ADDR, DL, Ty, {TV, TCPV});
+
+ if (!IsPIC)
+ return Addr;
+
+ SDValue Result =
+ DAG.getNode(ISD::ADD, DL, Ty, {DAG.getGLOBAL_OFFSET_TABLE(Ty), Addr});
+ if (IsLocal)
+ return Result;
+
+ return DAG.getLoad(Ty, DL, DAG.getEntryNode(), Result,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ }
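The flag selection inside getAddr above is the crux of the template: non-PIC code needs no modifier, PIC-local symbols use a GOT-relative offset, PIC calls go through the PLT, and everything else is loaded from a GOT slot. A stand-alone sketch of that decision, with placeholder names in place of the CSKYII::MO_* flags:

    // Mirrors the nested ternary in getAddr above; purely illustrative.
    enum class Mod { None, GotOff, Plt32, Got32 };

    Mod pickModifier(bool IsPIC, bool IsLocal, bool IsCall) {
      if (!IsPIC)
        return Mod::None;
      if (IsLocal)
        return Mod::GotOff;
      return IsCall ? Mod::Plt32 : Mod::Got32;
    }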
+
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
+ bool UseGOT) const;
+ SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const;
+
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg) const;
};
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td b/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td
new file mode 100644
index 000000000000..446670a4d0a9
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormatsF1.td
@@ -0,0 +1,274 @@
+//===- CSKYInstrFormatsF1.td - CSKY Float1.0 Instr Format --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// CSKY Instruction Format Float1.0 Definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class CSKYFP1Inst<dag outs, dag ins, string asmstr, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x3d, outs, ins, asmstr, pattern>, Requires<[HasFPUv2_SF]> {
+}
+
+class F_XYZ_BASE<bits<5> datatype, bits<6> sop, dag outs, dag ins, string opcodestr, list<dag> pattern>
+ : CSKYFP1Inst<outs, ins, opcodestr, pattern> {
+ bits<4> vrx;
+ bits<4> vry;
+ bits<4> vrz;
+ let Inst{25 - 21} = {0, vry};
+ let Inst{20 - 16} = {0, vrx};
+ let Inst{15 - 11} = datatype;
+ let Inst{10 - 5} = sop;
+ let Inst{4 - 0} = {0, vrz};
+}
+
+class F_XZ_GF<bits<5> datatype, bits<6> sop, dag outs, dag ins, string opcodestr, list<dag> pattern>
+ : CSKYFP1Inst<outs, ins, opcodestr, pattern> {
+ bits<4> vrx;
+ bits<5> rz;
+ let Inst{25 - 21} = 0;
+ let Inst{20 - 16} = {0, vrx};
+ let Inst{15 - 11} = datatype;
+ let Inst{10 - 5} = sop;
+ let Inst{4 - 0} = {rz};
+}
+
+class F_XZ_FG<bits<5> datatype, bits<6> sop, dag outs, dag ins, string opcodestr, list<dag> pattern>
+ : CSKYFP1Inst<outs, ins, opcodestr, pattern> {
+ bits<5> rx;
+ bits<4> vrz;
+ let Inst{25 - 21} = 0;
+ let Inst{20 - 16} = {rx};
+ let Inst{15 - 11} = datatype;
+ let Inst{10 - 5} = sop;
+ let Inst{4 - 0} = {0, vrz};
+}
+
+class F_XZ_TRANS_FROM<bits<6> sop, string op, RegisterOperand regtype1, RegisterOperand regtype2>
+ : F_XZ_GF<3, sop, (outs regtype1:$rz), (ins regtype2:$vrx), !strconcat(op, "\t$rz, $vrx"),
+ []>;
+
+class F_XZ_TRANS_TO<bits<6> sop, string op, RegisterOperand regtype1, RegisterOperand regtype2>
+ : F_XZ_FG<3, sop, (outs regtype1:$vrz), (ins regtype2:$rx), !strconcat(op, "\t$vrz, $rx"),
+ []>;
+
+let vry = 0 in {
+class F_XZ<bits<5> datatype, bits<6> sop, string op, string op_su, PatFrag opnode, RegisterOperand regtype>
+ : F_XYZ_BASE<datatype, sop, (outs regtype:$vrz), (ins regtype:$vrx), !strconcat(op#op_su, "\t$vrz, $vrx"),
+ [(set regtype:$vrz, (opnode regtype:$vrx))]>;
+
+class F_MOV<bits<5> datatype, bits<6> sop, string op, string op_su, RegisterOperand regtype>
+ : F_XYZ_BASE<datatype, sop, (outs regtype:$vrz), (ins regtype:$vrx), !strconcat(op#op_su, "\t$vrz, $vrx"),
+ []>;
+
+class F_XZ_TRANS<bits<6> sop, string op, RegisterOperand regtype1, RegisterOperand regtype2>
+ : F_XYZ_BASE<3, sop, (outs regtype1:$vrz), (ins regtype2:$vrx), !strconcat(op, "\t$vrz, $vrx"),
+ []>;
+
+class F_XZ_TRANS_DS<bits<6> sop, string op, PatFrag opnode>
+ : F_XYZ_BASE<3, sop, (outs sFPR32Op:$vrz), (ins sFPR64Op:$vrx), !strconcat(op, "\t$vrz, $vrx"),
+ [(set sFPR32Op:$vrz, (opnode sFPR64Op:$vrx))]>;
+
+class F_XZ_TRANS_SD<bits<6> sop, string op, PatFrag opnode>
+ : F_XYZ_BASE<3, sop, (outs sFPR64Op:$vrz), (ins sFPR32Op:$vrx), !strconcat(op, "\t$vrz, $vrx"),
+ [(set sFPR64Op:$vrz, (opnode sFPR32Op:$vrx))]>;
+}
+
+multiclass FT_MOV<bits<6> sop, string op> {
+ def _S : F_MOV<0, sop, op, "s", sFPR32Op>;
+ let Predicates = [HasFPUv2_DF] in
+ def _D : F_MOV<1, sop, op, "d", sFPR64Op>;
+}
+
+multiclass FT_XZ<bits<6> sop, string op, PatFrag opnode> {
+ def _S : F_XZ<0, sop, op, "s", opnode, sFPR32Op>;
+ let Predicates = [HasFPUv2_DF] in
+ def _D : F_XZ<1, sop, op, "d", opnode, sFPR64Op>;
+}
+
+let vrz = 0, isCompare = 1 in {
+class F_CMPXY<bits<5> datatype, bits<6> sop, string op, string op_su, RegisterOperand regtype>
+ : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry), !strconcat(op#op_su, "\t$vrx, $vry"),
+ []>;
+
+let vry = 0 in {
+class F_CMPZX<bits<5> datatype, bits<6> sop, string op, string op_su, RegisterOperand regtype>
+ : F_XYZ_BASE<datatype, sop, (outs CARRY:$ca), (ins regtype:$vrx), !strconcat(op#op_su, "\t$vrx"),
+ []>;
+}
+}
+
+class F_XYZ<bits<5> datatype, bits<6> sop, string op, string op_su, PatFrag opnode, RegisterOperand regtype>
+ : F_XYZ_BASE<datatype, sop, (outs regtype:$vrz), (ins regtype:$vrx, regtype:$vry),
+ !strconcat(op#op_su, "\t$vrz, $vrx, $vry"),
+ [(set regtype:$vrz, (opnode regtype:$vrx, regtype:$vry))]>;
+
+multiclass FT_XYZ<bits<6> sop, string op, PatFrag opnode> {
+ def _S : F_XYZ<0, sop, op, "s", opnode, sFPR32Op>;
+ let Predicates = [HasFPUv2_DF] in
+ def _D : F_XYZ<1, sop, op, "d", opnode, sFPR64Op>;
+}
+
+let Constraints = "$vrt = $vrz" in {
+class F_ACCUM_XYZ<bits<5> datatype, bits<6> sop, string op, string op_su, PatFrag opnode, RegisterOperand regtype>
+ : F_XYZ_BASE<datatype, sop, (outs regtype:$vrz), (ins regtype:$vrt, regtype:$vrx, regtype:$vry),
+ !strconcat(op#op_su, "\t$vrz, $vrx, $vry"),
+ [(set regtype:$vrz, (opnode regtype:$vrt, regtype:$vrx, regtype:$vry))]>;
+}
+
+multiclass FT_ACCUM_XYZ<bits<6> sop, string op, PatFrag opnode> {
+ def _S : F_ACCUM_XYZ<0, sop, op, "s", opnode, sFPR32Op>;
+ let Predicates = [HasFPUv2_DF] in
+ def _D : F_ACCUM_XYZ<1, sop, op, "d", opnode, sFPR64Op>;
+}
+
+multiclass FT_CMPXY<bits<6> sop, string op> {
+ def _S : F_CMPXY<0, sop, op, "s", sFPR32Op>;
+ let Predicates = [HasFPUv2_DF] in
+ def _D : F_CMPXY<1, sop, op, "d", sFPR64Op>;
+}
+
+
+multiclass FT_CMPZX<bits<6> sop, string op> {
+ def _S : F_CMPZX<0, sop, op, "s", sFPR32Op>;
+ let Predicates = [HasFPUv2_DF] in
+ def _D : F_CMPZX<1, sop, op, "d", sFPR64Op>;
+}
+
+class F_I8_XY_MEM<bits<7> sop, bits<1> sop_su, dag outs, dag ins, string opcodestr, list<dag> pattern>
+ : CSKY32Inst<AddrMode32SDF, 0x3d, outs, ins, opcodestr, pattern> {
+ bits<5> rx;
+ bits<4> vrz;
+ bits<8> imm8;
+ let Inst{25} = 0;
+ let Inst{24 - 21} = imm8{7 - 4}; //imm4h
+ let Inst{20 - 16} = rx; //rx
+ let Inst{15 - 9} = sop;
+ let Inst{8} = sop_su;
+ let Inst{7 - 4} = imm8{3 - 0}; // imm4l
+ let Inst{3 - 0} = vrz;
+}
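A side note on the encoding class above: the 8-bit offset is split into a high nibble at Inst{24-21} and a low nibble at Inst{7-4}. A hypothetical helper showing the same split in C++, derived only from the `let Inst{...}` assignments shown here (not checked against the ISA manual):

    #include <cstdint>

    // Places imm8 into the imm4h/imm4l fields used by F_I8_XY_MEM above.
    uint32_t placeImm8(uint32_t imm8) {
      uint32_t imm4h = (imm8 >> 4) & 0xF; // Inst{24-21}
      uint32_t imm4l = imm8 & 0xF;        // Inst{7-4}
      return (imm4h << 21) | (imm4l << 4);
    }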
+
+class F_I4_XY_MEM<bits<7> sop, bits<1> sop_su, dag outs, dag ins, string opcodestr, list<dag> pattern>
+ : CSKY32Inst<AddrMode32SDF, 0x3d, outs, ins, opcodestr, pattern> {
+ bits<10> regs;
+ bits<5> rx;
+
+ let Inst{25} = 0;
+ let Inst{24 - 21} = regs{3-0}; //imm4
+ let Inst{20 - 16} = rx; //rx
+ let Inst{15 - 9} = sop;
+ let Inst{8} = sop_su;
+ let Inst{7 - 4} = 0;
+ let Inst{3 - 0} = regs{8-5};
+}
+
+class F_I8_Z_MEM<bits<7> sop, bits<1> sop_su, dag outs, dag ins, string opcodestr, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x3d, outs, ins, opcodestr, pattern> {
+ bits<4> vrz;
+ bits<8> imm8;
+ let Inst{25} = 0;
+ let Inst{24 - 21} = imm8{7 - 4}; //imm4h
+ let Inst{20 - 16} = 0; //rx
+ let Inst{15 - 9} = sop;
+ let Inst{8} = sop_su;
+ let Inst{7 - 4} = imm8{3 - 0}; // imm4l
+ let Inst{3 - 0} = vrz;
+}
+
+class F_XYZ_MEM<bits<7> sop, bits<1> sop_su, dag outs, dag ins, string opcodestr, list<dag> pattern>
+ : CSKY32Inst<AddrModeNone, 0x3d, outs, ins, opcodestr, pattern> {
+ bits<5> rx;
+ bits<5> ry;
+ bits<4> vrz;
+ bits<2> imm;
+
+ let Inst{25 - 21} = ry; // ry;
+ let Inst{20 - 16} = rx; // rx;
+ let Inst{15 - 9} = sop;
+ let Inst{8} = sop_su;
+ let Inst{7} = 0;
+ let Inst{6,5} = imm; // shift;
+ let Inst{4} = 0;
+ let Inst{3 - 0} = vrz;
+}
+
+class F_XYAI_LD<bits<7> sop, bits<1> sop_su, string op, string op_su,
+ RegisterOperand regtype, Operand operand>
+ : F_I8_XY_MEM<sop, sop_su, (outs regtype:$vrz), (ins GPR:$rx, operand:$imm8),
+ !strconcat(op#op_su, "\t$vrz, ($rx, ${imm8})"), []>;
+
+class F_XYAR_LD<bits<7> sop, bits<1> sop_su, string op, string op_su,
+ RegisterOperand regtype>
+ : F_XYZ_MEM<sop, sop_su, (outs regtype:$vrz), (ins GPR:$rx, GPR:$ry, uimm2:$imm),
+ op#op_su#"\t$vrz, ($rx, $ry << ${imm})", []>;
+
+class F_XYAI_ST<bits<7> sop, bits<1> sop_su, string op, string op_su,
+ RegisterOperand regtype, Operand operand>
+ : F_I8_XY_MEM<sop, sop_su, (outs), (ins regtype:$vrz, GPR:$rx, operand:$imm8),
+ !strconcat(op#op_su, "\t$vrz, ($rx, ${imm8})"), []>;
+
+class F_XYAR_ST<bits<7> sop, bits<1> sop_su, string op, string op_su,
+ RegisterOperand regtype>
+ : F_XYZ_MEM<sop, sop_su, (outs), (ins regtype:$vrz, GPR:$rx, GPR:$ry, uimm2:$imm),
+ op#op_su#"\t$vrz, ($rx, $ry << ${imm})", []>;
+
+def Mem8SL2 : Operand<iPTR>, ComplexPattern<iPTR, 2, "SelectAddrRegImm8", []> {
+ let MIOperandInfo = (ops GPR, i32imm);
+ let PrintMethod = "printAddrModeRegImmOperand";
+ let EncoderMethod = "getAddrModeFloatImm8_sl2OpValue";
+}
+
+def FRRS : Operand<iPTR>, ComplexPattern<iPTR, 3, "SelectAddrRegReg", []> {
+ let MIOperandInfo = (ops GPR, GPR, i32imm);
+ let PrintMethod = "printAddrModeRegRegSLOperand";
+ let EncoderMethod = "getAddrModeFloatRegRegSLOpValue";
+}
+
+multiclass FT_XYAI_LD<bits<7> sop, string op> {
+ def _S : F_XYAI_LD<sop, 0, op, "s", sFPR32Op, uimm8_2>;
+ let Predicates = [HasFPUv2_DF] in
+ def _D : F_XYAI_LD<sop, 1, op, "d", sFPR64Op, uimm8_2>;
+}
+
+multiclass FT_XYAR_LD<bits<7> sop, string op> {
+ def _S : F_XYAR_LD<sop, 0, op, "s", sFPR32Op>;
+ let Predicates = [HasFPUv2_DF] in
+ def _D : F_XYAR_LD<sop, 1, op, "d", sFPR64Op>;
+}
+
+multiclass FT_XYAI_ST<bits<7> sop, string op> {
+ def _S : F_XYAI_ST<sop, 0, op, "s", sFPR32Op, uimm8_2>;
+ let Predicates = [HasFPUv2_DF] in
+ def _D : F_XYAI_ST<sop, 1, op, "d", sFPR64Op, uimm8_2>;
+}
+
+multiclass FT_XYAR_ST<bits<7> sop, string op> {
+ def _S : F_XYAR_ST<sop, 0, op, "s", sFPR32Op>;
+ let Predicates = [HasFPUv2_DF] in
+ def _D : F_XYAR_ST<sop, 1, op, "d", sFPR64Op>;
+}
+
+multiclass FT_XYAR_STM<bits<7> sop, string op> {
+ def _S : F_I4_XY_MEM<sop, 0, (outs),
+ (ins GPR:$rx, regseq_f1:$regs, variable_ops),
+ !strconcat(op#"s", "\t$regs, (${rx})"), []>;
+ let Predicates = [HasFPUv2_DF] in
+ def _D : F_I4_XY_MEM<sop, 1, (outs),
+ (ins GPR:$rx, regseq_d1:$regs, variable_ops),
+ !strconcat(op#"d", "\t$regs, (${rx})"), []>;
+}
+
+multiclass FT_XYAR_LDM<bits<7> sop, string op> {
+ def _S : F_I4_XY_MEM<sop, 0, (outs),
+ (ins GPR:$rx, regseq_f1:$regs, variable_ops),
+ !strconcat(op#"s", "\t$regs, (${rx})"), []>;
+ let Predicates = [HasFPUv2_DF] in
+ def _D : F_I4_XY_MEM<sop, 1, (outs),
+ (ins GPR:$rx, regseq_d1:$regs, variable_ops),
+ !strconcat(op#"d", "\t$regs, (${rx})"), []>;
+}
diff --git a/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td b/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td
new file mode 100644
index 000000000000..641ad623f140
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYInstrFormatsF2.td
@@ -0,0 +1,208 @@
+//===- CSKYInstrFormatsF2.td - CSKY Float2.0 Instr Format --*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// CSKY Instruction Format Float2.0 Definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class CSKYInstF2<AddrMode am, dag outs, dag ins, string opcodestr,
+ list<dag> pattern>
+ : CSKY32Inst<am, 0x3d, outs, ins, opcodestr, pattern> {
+ let Predicates = [HasFPUv3_SF];
+ let DecoderNamespace = "FPUV3";
+}
+
+class F2_XYZ<bits<5> datatype, bits<6> sop, string opcodestr, dag outs, dag ins,
+ list<dag> pattern>
+ : CSKYInstF2<AddrModeNone, outs, ins, opcodestr, pattern> {
+ bits<5> vry;
+ bits<5> vrx;
+ bits<5> vrz;
+
+ let Inst{25-21} = vry;
+ let Inst{20-16} = vrx;
+ let Inst{15-11} = datatype;
+ let Inst{10-5} = sop;
+ let Inst{4-0} = vrz;
+}
+
+multiclass F2_XYZ_T<bits<6> sop, string op, PatFrag opnode> {
+ def _S : F2_XYZ<0b00000, sop, op#".32"#"\t$vrz, $vrx, $vry",
+ (outs FPR32Op:$vrz), (ins FPR32Op:$vrx, FPR32Op:$vry),
+ [(set FPR32Op:$vrz, (opnode FPR32Op:$vrx, FPR32Op:$vry))]>;
+ let Predicates = [HasFPUv3_DF] in
+ def _D : F2_XYZ<0b00001, sop, op#".64"#"\t$vrz, $vrx, $vry",
+ (outs FPR64Op:$vrz), (ins FPR64Op:$vrx, FPR64Op:$vry),
+ [(set FPR64Op:$vrz, (opnode FPR64Op:$vrx, FPR64Op:$vry))]>;
+}
+
+let Constraints = "$vrZ = $vrz" in
+multiclass F2_XYZZ_T<bits<6> sop, string op, PatFrag opnode> {
+ def _S : F2_XYZ<0b00000, sop, op#".32"#"\t$vrz, $vrx, $vry",
+ (outs FPR32Op:$vrz), (ins FPR32Op:$vrZ, FPR32Op:$vrx, FPR32Op:$vry),
+ [(set FPR32Op:$vrz, (opnode FPR32Op:$vrx, FPR32Op:$vry, FPR32Op:$vrZ))]>;
+ let Predicates = [HasFPUv3_DF] in
+ def _D : F2_XYZ<0b00001, sop, op#".64"#"\t$vrz, $vrx, $vry",
+ (outs FPR64Op:$vrz), (ins FPR64Op:$vrZ, FPR64Op:$vrx, FPR64Op:$vry),
+ [(set FPR64Op:$vrz, (opnode FPR64Op:$vrx, FPR64Op:$vry, FPR64Op:$vrZ))]>;
+}
+
+let vry = 0 in {
+class F2_XZ<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op, SDNode opnode>
+ : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrz, $vrx"),
+ (outs regtype:$vrz), (ins regtype:$vrx),
+ [(set regtype:$vrz, (opnode regtype:$vrx))]>;
+
+class F2_XZ_SET<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op>
+ : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrz, $vrx"),
+ (outs regtype:$vrz), (ins regtype:$vrx),
+ []>;
+
+class F2_XZ_P<bits<5> datatype, bits<6> sop, string op, list<dag> pattern = [],
+ dag outs, dag ins>
+ : F2_XYZ<datatype, sop, op#"\t$vrz, $vrx", outs, ins, pattern>;
+}
+
+multiclass F2_XZ_RM<bits<5> datatype, bits<4> sop, string op, dag outs, dag ins> {
+ def _RN : F2_XZ_P<datatype, {sop, 0b00}, op#".rn", [], outs, ins>;
+ def _RZ : F2_XZ_P<datatype, {sop, 0b01}, op#".rz", [], outs, ins>;
+ def _RPI : F2_XZ_P<datatype, {sop, 0b10}, op#".rpi", [], outs, ins>;
+ def _RNI : F2_XZ_P<datatype, {sop, 0b11}, op#".rni", [], outs, ins>;
+}
+
+multiclass F2_XZ_T<bits<6> sop, string op, SDNode opnode> {
+ def _S : F2_XZ<0b00000, FPR32Op, sop, op#".32", opnode>;
+ let Predicates = [HasFPUv3_DF] in
+ def _D : F2_XZ<0b00001, FPR64Op, sop, op#".64", opnode>;
+}
+
+multiclass F2_XZ_SET_T<bits<6> sop, string op, string suffix = ""> {
+ def _S : F2_XZ_SET<0b00000, FPR32Op, sop, op#".32"#suffix>;
+ let Predicates = [HasFPUv3_DF] in
+ def _D : F2_XZ_SET<0b00001, FPR64Op, sop, op#".64"#suffix>;
+}
+
+
+let vrz = 0, isCompare = 1 in
+class F2_CXY<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op>
+ : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx, $vry"),
+ (outs CARRY:$ca), (ins regtype:$vrx, regtype:$vry),
+ []>;
+
+multiclass F2_CXY_T<bits<6> sop, string op> {
+ def _S : F2_CXY<0b00000, FPR32Op, sop, op#".32">;
+ let Predicates = [HasFPUv3_DF] in
+ def _D : F2_CXY<0b00001, FPR64Op, sop, op#".64">;
+}
+
+
+let vrz = 0, vry = 0, isCompare = 1 in
+class F2_CX<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op>
+ : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrx"),
+ (outs CARRY:$ca), (ins regtype:$vrx),
+ []>;
+
+multiclass F2_CX_T<bits<6> sop, string op> {
+ def _S : F2_CX<0b00000, FPR32Op, sop, op#".32">;
+ let Predicates = [HasFPUv3_DF] in
+ def _D : F2_CX<0b00001, FPR64Op, sop, op#".64">;
+}
+
+
+class F2_LDST<bits<2> datatype, bits<1> sop, string op, dag outs, dag ins>
+ : CSKYInstF2<AddrMode32SDF, outs, ins,
+ !strconcat(op, "\t$vrz, ($rx, ${imm8})"), []> {
+ bits<10> imm8;
+ bits<5> rx;
+ bits<5> vrz;
+
+ let Inst{25} = vrz{4};
+ let Inst{24-21} = imm8{7-4};
+ let Inst{20-16} = rx;
+ let Inst{15-11} = 0b00100;
+ let Inst{10} = sop;
+ let Inst{9-8} = datatype;
+ let Inst{7-4} = imm8{3-0};
+ let Inst{3-0} = vrz{3-0};
+}
+
+class F2_LDST_S<bits<1> sop, string op, dag outs, dag ins>
+ : F2_LDST<0b00, sop, op#".32", outs, ins>;
+class F2_LDST_D<bits<1> sop, string op, dag outs, dag ins>
+ : F2_LDST<0b01, sop, op#".64", outs, ins>;
+
+class F2_LDSTM<bits<2> datatype, bits<1> sop, bits<3> sop2, string op, dag outs, dag ins>
+ : CSKYInstF2<AddrMode32SDF, outs, ins,
+ !strconcat(op, "\t$regs, (${rx})"), []> {
+ bits<10> regs;
+ bits<5> rx;
+
+ let Inst{25-21} = regs{4-0};
+ let Inst{20-16} = rx;
+ let Inst{15-11} = 0b00110;
+ let Inst{10} = sop;
+ let Inst{9-8} = datatype;
+ let Inst{7-5} = sop2;
+ let Inst{4-0} = regs{9-5};
+}
+
+class F2_LDSTM_S<bits<1> sop, bits<3> sop2, string op, dag outs, dag ins>
+ : F2_LDSTM<0b00, sop, sop2, op#".32", outs, ins>;
+class F2_LDSTM_D<bits<1> sop, bits<3> sop2, string op, dag outs, dag ins>
+ : F2_LDSTM<0b01, sop, sop2, op#".64", outs, ins>;
+
+
+class F2_LDSTR<bits<2> datatype, bits<1> sop, string op, dag outs, dag ins>
+ : CSKYInstF2<AddrModeNone, outs, ins,
+ op#"\t$rz, ($rx, $ry << ${imm})", []> {
+ bits<5> rx;
+ bits<5> ry;
+ bits<5> rz;
+ bits<2> imm;
+
+ let Inst{25-21} = ry;
+ let Inst{20-16} = rx;
+ let Inst{15-11} = 0b00101;
+ let Inst{10} = sop;
+ let Inst{9-8} = datatype;
+ let Inst{7} = 0;
+ let Inst{6-5} = imm;
+ let Inst{4-0} = rz;
+}
+
+class F2_LDSTR_S<bits<1> sop, string op, dag outs, dag ins>
+ : F2_LDSTR<0b00, sop, op#".32", outs, ins>;
+class F2_LDSTR_D<bits<1> sop, string op, dag outs, dag ins>
+ : F2_LDSTR<0b01, sop, op#".64", outs, ins>;
+
+class F2_CXYZ<bits<5> datatype, RegisterOperand regtype, bits<6> sop, string op>
+ : F2_XYZ<datatype, sop, !strconcat(op, "\t$vrz, $vrx, $vry"),
+ (outs regtype:$vrz), (ins CARRY:$ca, regtype:$vrx, regtype:$vry),
+ []>;
+multiclass F2_CXYZ_T<bits<6> sop, string op> {
+ def _S : F2_CXYZ<0b00000, FPR32Op, sop, op#".32">;
+ let Predicates = [HasFPUv3_DF] in
+ def _D : F2_CXYZ<0b00001, FPR64Op, sop, op#".64">;
+}
+
+class F2_LRW<bits<2> datatype, bits<1> sop, string op, dag outs, dag ins>
+ : CSKYInstF2<AddrModeNone, outs, ins,
+ !strconcat(op, "\t$vrz, ${imm8}"), []> {
+ bits<10> imm8;
+ bits<5> rx;
+ bits<5> vrz;
+
+ let Inst{25} = vrz{4};
+ let Inst{24-21} = imm8{7-4};
+ let Inst{20-16} = 0;
+ let Inst{15-11} = 0b00111;
+ let Inst{10} = sop;
+ let Inst{9-8} = datatype;
+ let Inst{7-4} = imm8{3-0};
+ let Inst{3-0} = vrz{3-0};
+}
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
index 6fcb136cd99b..c57ccb9d6eea 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "CSKYInstrInfo.h"
+#include "CSKYConstantPoolValue.h"
#include "CSKYMachineFunctionInfo.h"
#include "CSKYTargetMachine.h"
#include "llvm/MC/MCContext.h"
@@ -24,6 +25,199 @@ using namespace llvm;
CSKYInstrInfo::CSKYInstrInfo(CSKYSubtarget &STI)
: CSKYGenInstrInfo(CSKY::ADJCALLSTACKDOWN, CSKY::ADJCALLSTACKUP), STI(STI) {
+ v2sf = STI.hasFPUv2SingleFloat();
+ v2df = STI.hasFPUv2DoubleFloat();
+ v3sf = STI.hasFPUv3SingleFloat();
+ v3df = STI.hasFPUv3DoubleFloat();
+}
+
+static void parseCondBranch(MachineInstr &LastInst, MachineBasicBlock *&Target,
+ SmallVectorImpl<MachineOperand> &Cond) {
+ // Block ends with fall-through condbranch.
+ assert(LastInst.getDesc().isConditionalBranch() &&
+ "Unknown conditional branch");
+ Target = LastInst.getOperand(1).getMBB();
+ Cond.push_back(MachineOperand::CreateImm(LastInst.getOpcode()));
+ Cond.push_back(LastInst.getOperand(0));
+}
+
+bool CSKYInstrInfo::analyzeBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ TBB = FBB = nullptr;
+ Cond.clear();
+
+ // If the block has no terminators, it just falls into the block after it.
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end() || !isUnpredicatedTerminator(*I))
+ return false;
+
+ // Count the number of terminators and find the first unconditional or
+ // indirect branch.
+ MachineBasicBlock::iterator FirstUncondOrIndirectBr = MBB.end();
+ int NumTerminators = 0;
+ for (auto J = I.getReverse(); J != MBB.rend() && isUnpredicatedTerminator(*J);
+ J++) {
+ NumTerminators++;
+ if (J->getDesc().isUnconditionalBranch() ||
+ J->getDesc().isIndirectBranch()) {
+ FirstUncondOrIndirectBr = J.getReverse();
+ }
+ }
+
+ // If AllowModify is true, we can erase any terminators after
+  // FirstUncondOrIndirectBr.
+ if (AllowModify && FirstUncondOrIndirectBr != MBB.end()) {
+ while (std::next(FirstUncondOrIndirectBr) != MBB.end()) {
+ std::next(FirstUncondOrIndirectBr)->eraseFromParent();
+ NumTerminators--;
+ }
+ I = FirstUncondOrIndirectBr;
+ }
+
+ // We can't handle blocks that end in an indirect branch.
+ if (I->getDesc().isIndirectBranch())
+ return true;
+
+ // We can't handle blocks with more than 2 terminators.
+ if (NumTerminators > 2)
+ return true;
+
+ // Handle a single unconditional branch.
+ if (NumTerminators == 1 && I->getDesc().isUnconditionalBranch()) {
+ TBB = getBranchDestBlock(*I);
+ return false;
+ }
+
+ // Handle a single conditional branch.
+ if (NumTerminators == 1 && I->getDesc().isConditionalBranch()) {
+ parseCondBranch(*I, TBB, Cond);
+ return false;
+ }
+
+ // Handle a conditional branch followed by an unconditional branch.
+ if (NumTerminators == 2 && std::prev(I)->getDesc().isConditionalBranch() &&
+ I->getDesc().isUnconditionalBranch()) {
+ parseCondBranch(*std::prev(I), TBB, Cond);
+ FBB = getBranchDestBlock(*I);
+ return false;
+ }
+
+ // Otherwise, we can't handle this.
+ return true;
+}
+
+unsigned CSKYInstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const {
+ if (BytesRemoved)
+ *BytesRemoved = 0;
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return 0;
+
+ if (!I->getDesc().isUnconditionalBranch() &&
+ !I->getDesc().isConditionalBranch())
+ return 0;
+
+ // Remove the branch.
+ if (BytesRemoved)
+ *BytesRemoved += getInstSizeInBytes(*I);
+ I->eraseFromParent();
+
+ I = MBB.end();
+
+ if (I == MBB.begin())
+ return 1;
+ --I;
+ if (!I->getDesc().isConditionalBranch())
+ return 1;
+
+ // Remove the branch.
+ if (BytesRemoved)
+ *BytesRemoved += getInstSizeInBytes(*I);
+ I->eraseFromParent();
+ return 2;
+}
+
+MachineBasicBlock *
+CSKYInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
+ assert(MI.getDesc().isBranch() && "Unexpected opcode!");
+ // The branch target is always the last operand.
+ int NumOp = MI.getNumExplicitOperands();
+ assert(MI.getOperand(NumOp - 1).isMBB() && "Expected MBB!");
+ return MI.getOperand(NumOp - 1).getMBB();
+}
+
+unsigned CSKYInstrInfo::insertBranch(
+ MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const {
+ if (BytesAdded)
+ *BytesAdded = 0;
+
+ // Shouldn't be a fall through.
+ assert(TBB && "insertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 2 || Cond.size() == 0) &&
+ "CSKY branch conditions have two components!");
+
+ // Unconditional branch.
+ if (Cond.empty()) {
+ MachineInstr &MI = *BuildMI(&MBB, DL, get(CSKY::BR32)).addMBB(TBB);
+ if (BytesAdded)
+ *BytesAdded += getInstSizeInBytes(MI);
+ return 1;
+ }
+
+ // Either a one or two-way conditional branch.
+ unsigned Opc = Cond[0].getImm();
+ MachineInstr &CondMI = *BuildMI(&MBB, DL, get(Opc)).add(Cond[1]).addMBB(TBB);
+ if (BytesAdded)
+ *BytesAdded += getInstSizeInBytes(CondMI);
+
+ // One-way conditional branch.
+ if (!FBB)
+ return 1;
+
+ // Two-way conditional branch.
+ MachineInstr &MI = *BuildMI(&MBB, DL, get(CSKY::BR32)).addMBB(FBB);
+ if (BytesAdded)
+ *BytesAdded += getInstSizeInBytes(MI);
+ return 2;
+}
+
+static unsigned getOppositeBranchOpc(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("Unknown conditional branch!");
+ case CSKY::BT32:
+ return CSKY::BF32;
+ case CSKY::BT16:
+ return CSKY::BF16;
+ case CSKY::BF32:
+ return CSKY::BT32;
+ case CSKY::BF16:
+ return CSKY::BT16;
+ case CSKY::BHZ32:
+ return CSKY::BLSZ32;
+ case CSKY::BHSZ32:
+ return CSKY::BLZ32;
+ case CSKY::BLZ32:
+ return CSKY::BHSZ32;
+ case CSKY::BLSZ32:
+ return CSKY::BHZ32;
+ case CSKY::BNEZ32:
+ return CSKY::BEZ32;
+ case CSKY::BEZ32:
+ return CSKY::BNEZ32;
+ }
+}
+
+bool CSKYInstrInfo::reverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ assert((Cond.size() == 2) && "Invalid branch condition!");
+ Cond[0].setImm(getOppositeBranchOpc(Cond[0].getImm()));
+ return false;
}
Register CSKYInstrInfo::movImm(MachineBasicBlock &MBB,
@@ -147,6 +341,10 @@ unsigned CSKYInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
case CSKY::LD32H:
case CSKY::LD32HS:
case CSKY::LD32W:
+ case CSKY::FLD_S:
+ case CSKY::FLD_D:
+ case CSKY::f2FLD_S:
+ case CSKY::f2FLD_D:
case CSKY::RESTORE_CARRY:
break;
}
@@ -171,6 +369,10 @@ unsigned CSKYInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
case CSKY::ST32B:
case CSKY::ST32H:
case CSKY::ST32W:
+ case CSKY::FST_S:
+ case CSKY::FST_D:
+ case CSKY::f2FST_S:
+ case CSKY::f2FST_D:
case CSKY::SPILL_CARRY:
break;
}
@@ -204,7 +406,15 @@ void CSKYInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
} else if (CSKY::CARRYRegClass.hasSubClassEq(RC)) {
Opcode = CSKY::SPILL_CARRY;
CFI->setSpillsCR();
- } else {
+ } else if (v2sf && CSKY::sFPR32RegClass.hasSubClassEq(RC))
+ Opcode = CSKY::FST_S;
+ else if (v2df && CSKY::sFPR64RegClass.hasSubClassEq(RC))
+ Opcode = CSKY::FST_D;
+ else if (v3sf && CSKY::FPR32RegClass.hasSubClassEq(RC))
+ Opcode = CSKY::f2FST_S;
+ else if (v3df && CSKY::FPR64RegClass.hasSubClassEq(RC))
+ Opcode = CSKY::f2FST_D;
+ else {
llvm_unreachable("Unknown RegisterClass");
}
@@ -239,7 +449,15 @@ void CSKYInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
} else if (CSKY::CARRYRegClass.hasSubClassEq(RC)) {
Opcode = CSKY::RESTORE_CARRY;
CFI->setSpillsCR();
- } else {
+ } else if (v2sf && CSKY::sFPR32RegClass.hasSubClassEq(RC))
+ Opcode = CSKY::FLD_S;
+ else if (v2df && CSKY::sFPR64RegClass.hasSubClassEq(RC))
+ Opcode = CSKY::FLD_D;
+ else if (v3sf && CSKY::FPR32RegClass.hasSubClassEq(RC))
+ Opcode = CSKY::f2FLD_S;
+ else if (v3df && CSKY::FPR64RegClass.hasSubClassEq(RC))
+ Opcode = CSKY::f2FLD_D;
+ else {
llvm_unreachable("Unknown RegisterClass");
}
@@ -302,6 +520,38 @@ void CSKYInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Opcode = 0;
if (CSKY::GPRRegClass.contains(DestReg, SrcReg))
Opcode = CSKY::MOV32;
+ else if (v2sf && CSKY::sFPR32RegClass.contains(DestReg, SrcReg))
+ Opcode = CSKY::FMOV_S;
+ else if (v3sf && CSKY::FPR32RegClass.contains(DestReg, SrcReg))
+ Opcode = CSKY::f2FMOV_S;
+ else if (v2df && CSKY::sFPR64RegClass.contains(DestReg, SrcReg))
+ Opcode = CSKY::FMOV_D;
+ else if (v3df && CSKY::FPR64RegClass.contains(DestReg, SrcReg))
+ Opcode = CSKY::f2FMOV_D;
+ else if (v2sf && CSKY::sFPR32RegClass.contains(SrcReg) &&
+ CSKY::GPRRegClass.contains(DestReg))
+ Opcode = CSKY::FMFVRL;
+ else if (v3sf && CSKY::FPR32RegClass.contains(SrcReg) &&
+ CSKY::GPRRegClass.contains(DestReg))
+ Opcode = CSKY::f2FMFVRL;
+ else if (v2df && CSKY::sFPR64RegClass.contains(SrcReg) &&
+ CSKY::GPRRegClass.contains(DestReg))
+ Opcode = CSKY::FMFVRL_D;
+ else if (v3df && CSKY::FPR64RegClass.contains(SrcReg) &&
+ CSKY::GPRRegClass.contains(DestReg))
+ Opcode = CSKY::f2FMFVRL_D;
+ else if (v2sf && CSKY::GPRRegClass.contains(SrcReg) &&
+ CSKY::sFPR32RegClass.contains(DestReg))
+ Opcode = CSKY::FMTVRL;
+ else if (v3sf && CSKY::GPRRegClass.contains(SrcReg) &&
+ CSKY::FPR32RegClass.contains(DestReg))
+ Opcode = CSKY::f2FMTVRL;
+ else if (v2df && CSKY::GPRRegClass.contains(SrcReg) &&
+ CSKY::sFPR64RegClass.contains(DestReg))
+ Opcode = CSKY::FMTVRL_D;
+ else if (v3df && CSKY::GPRRegClass.contains(SrcReg) &&
+ CSKY::FPR64RegClass.contains(DestReg))
+ Opcode = CSKY::f2FMTVRL_D;
else {
LLVM_DEBUG(dbgs() << "src = " << SrcReg << ", dst = " << DestReg);
LLVM_DEBUG(I->dump());
@@ -311,3 +561,58 @@ void CSKYInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, get(Opcode), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
}
+
+Register CSKYInstrInfo::getGlobalBaseReg(MachineFunction &MF) const {
+ CSKYMachineFunctionInfo *CFI = MF.getInfo<CSKYMachineFunctionInfo>();
+ MachineConstantPool *MCP = MF.getConstantPool();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ Register GlobalBaseReg = CFI->getGlobalBaseReg();
+ if (GlobalBaseReg != 0)
+ return GlobalBaseReg;
+
+  // Insert instructions into the first MBB of the function to materialize
+  // the GOT address in the GlobalBaseReg.
+ MachineBasicBlock &FirstMBB = MF.front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ DebugLoc DL;
+
+ CSKYConstantPoolValue *CPV = CSKYConstantPoolSymbol::Create(
+ Type::getInt32Ty(MF.getFunction().getContext()), "_GLOBAL_OFFSET_TABLE_",
+ 0, CSKYCP::ADDR);
+
+ unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
+
+ MachineMemOperand *MO =
+ MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
+ MachineMemOperand::MOLoad, 4, Align(4));
+ BuildMI(FirstMBB, MBBI, DL, get(CSKY::LRW32), CSKY::R28)
+ .addConstantPoolIndex(CPI)
+ .addMemOperand(MO);
+
+ GlobalBaseReg = MRI.createVirtualRegister(&CSKY::GPRRegClass);
+ BuildMI(FirstMBB, MBBI, DL, get(TargetOpcode::COPY), GlobalBaseReg)
+ .addReg(CSKY::R28);
+
+ CFI->setGlobalBaseReg(GlobalBaseReg);
+ return GlobalBaseReg;
+}
+
+unsigned CSKYInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
+ switch (MI.getOpcode()) {
+ default:
+ return MI.getDesc().getSize();
+ case CSKY::CONSTPOOL_ENTRY:
+ return MI.getOperand(2).getImm();
+ case CSKY::SPILL_CARRY:
+ case CSKY::RESTORE_CARRY:
+ case CSKY::PseudoTLSLA32:
+ return 8;
+ case TargetOpcode::INLINEASM_BR:
+ case TargetOpcode::INLINEASM: {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const char *AsmStr = MI.getOperand(0).getSymbolName();
+ return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+ }
+ }
+}
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.h b/llvm/lib/Target/CSKY/CSKYInstrInfo.h
index 450641d96b74..1a1bbbf9154f 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.h
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.h
@@ -24,6 +24,11 @@ namespace llvm {
class CSKYSubtarget;
class CSKYInstrInfo : public CSKYGenInstrInfo {
+ bool v2sf;
+ bool v2df;
+ bool v3sf;
+ bool v3df;
+
protected:
const CSKYSubtarget &STI;
@@ -50,6 +55,28 @@ public:
const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify = false) const override;
+
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+ MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override;
+
+ unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
+
+ Register getGlobalBaseReg(MachineFunction &MF) const;
+
// Materializes the given integer Val into DstReg.
Register movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, int64_t Val,
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.td b/llvm/lib/Target/CSKY/CSKYInstrInfo.td
index 30d9206eec68..a782efe7f4f4 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.td
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.td
@@ -15,22 +15,42 @@
// CSKY specific DAG Nodes.
//===----------------------------------------------------------------------===//
+// Target-independent type requirements, but with target-specific formats.
def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>,
SDTCisVT<1, i32>]>;
+def SDT_CSKYCall : SDTypeProfile<0, 2, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
+def SDT_CSKYCallReg : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_CSKY_LOADADDR : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCisVT<1, iPTR>, SDTCisVT<2, iPTR>]>;
+
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_CallSeqStart,
[SDNPHasChain, SDNPOutGlue]>;
-
def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-// Target-dependent nodes.
def CSKY_RET : SDNode<"CSKYISD::RET", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def CSKY_CALL : SDNode<"CSKYISD::CALL", SDT_CSKYCall,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
+
+def CSKY_CALLReg : SDNode<"CSKYISD::CALLReg", SDT_CSKYCallReg,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
+
+def CSKY_TAIL : SDNode<"CSKYISD::TAIL", SDT_CSKYCall,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
+
+def CSKY_TAILReg : SDNode<"CSKYISD::TAILReg", SDT_CSKYCallReg,
+ [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>;
+
+def CSKY_LOAD_ADDR : SDNode<"CSKYISD::LOAD_ADDR", SDT_CSKY_LOADADDR>;
+
//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
@@ -57,6 +77,24 @@ def to_tframeindex : SDNodeXForm<frameindex, [{
return CurDAG->getTargetFrameIndex(FI->getIndex(), TLI->getPointerTy(CurDAG->getDataLayout()));
}]>;
+def to_tconstpool : SDNodeXForm<constpool, [{
+ auto CP = cast<ConstantPoolSDNode>(N);
+ return CurDAG->getTargetConstantPool(CP->getConstVal(), TLI->getPointerTy(CurDAG->getDataLayout()),
+ CP->getAlign(), CP->getOffset(), CSKYII::MO_None);
+}]>;
+
+def to_tconstpool_hi16 : SDNodeXForm<constpool, [{
+ auto CP = cast<ConstantPoolSDNode>(N);
+ return CurDAG->getTargetConstantPool(CP->getConstVal(), TLI->getPointerTy(CurDAG->getDataLayout()),
+ CP->getAlign(), CP->getOffset(), CSKYII::MO_ADDR_HI16);
+}]>;
+
+def to_tconstpool_lo16 : SDNodeXForm<constpool, [{
+ auto CP = cast<ConstantPoolSDNode>(N);
+ return CurDAG->getTargetConstantPool(CP->getConstVal(), TLI->getPointerTy(CurDAG->getDataLayout()),
+ CP->getAlign(), CP->getOffset(), CSKYII::MO_ADDR_LO16);
+}]>;
+
class oimm<int num> : Operand<i32>,
ImmLeaf<i32, "return isUInt<"#num#">(Imm - 1);"> {
let EncoderMethod = "getOImmOpValue";
@@ -1055,6 +1093,178 @@ let Predicates = [iHas2E3] in {
def : Pat<(sext_inreg GPR:$src, i1), (SEXT32 GPR:$src, 0, 0)>;
def : Pat<(and GPR:$src, 255), (ZEXT32 GPR:$src, 7, 0)>;
def : Pat<(and GPR:$src, 65535), (ZEXT32 GPR:$src, 15, 0)>;
+
+ // Call Patterns
+ def : Pat<(CSKY_CALL tglobaladdr, tconstpool:$src2), (JSRI32 tconstpool:$src2)>;
+ def : Pat<(CSKY_CALL texternalsym, tconstpool:$src2), (JSRI32 tconstpool:$src2)>;
+ def : Pat<(CSKY_TAIL tglobaladdr, tconstpool:$src2), (JMPI32 tconstpool:$src2)>;
+ def : Pat<(CSKY_TAIL texternalsym, tconstpool:$src2), (JMPI32 tconstpool:$src2)>;
+
+ def : Pat<(CSKY_CALLReg GPR:$src), (JSR32 GPR:$src)>;
+ def : Pat<(CSKY_TAILReg GPR:$src), (JMP32 GPR:$src)>;
+}
+
+// Symbol address Patterns
+def : Pat<(CSKY_LOAD_ADDR tglobaladdr, tconstpool:$src2), (LRW32 tconstpool:$src2)>;
+def : Pat<(CSKY_LOAD_ADDR tblockaddress, tconstpool:$src2), (LRW32 tconstpool:$src2)>;
+def : Pat<(CSKY_LOAD_ADDR tjumptable:$src1, tconstpool:$src2), (LRW32_Gen tjumptable:$src1, tconstpool:$src2)>;
+def : Pat<(CSKY_LOAD_ADDR texternalsym, tconstpool:$src2), (LRW32 tconstpool:$src2)>;
+
+let Predicates = [iHas2E3] in
+ def : Pat<(i32 constpool:$src), (GRS32 (to_tconstpool tconstpool:$src))>;
+
+let Predicates = [iHasE2] in
+ def : Pat<(i32 constpool:$src),
+ (ORI32 (MOVIH32 (to_tconstpool_hi16 tconstpool:$src)),
+ (to_tconstpool_lo16 tconstpool:$src))>;
+
+def : Pat<(i32 (load constpool:$src)), (LRW32 (to_tconstpool tconstpool:$src))>;
+
+// Branch Patterns.
+let Predicates = [iHasE2] in {
+ def : Pat<(brcond CARRY:$ca, bb:$imm16),
+ (BT32 CARRY:$ca, bb:$imm16)>;
+
+ def : Pat<(brcond (i32 (setne GPR:$rs1, uimm16:$rs2)), bb:$imm16),
+ (BT32 (CMPNEI32 GPR:$rs1, uimm16:$rs2), bb:$imm16)>;
+ def : Pat<(brcond (i32 (seteq GPR:$rs1, uimm16:$rs2)), bb:$imm16),
+ (BF32 (CMPNEI32 GPR:$rs1, uimm16:$rs2), bb:$imm16)>;
+ def : Pat<(brcond (i32 (setuge GPR:$rs1, oimm16:$rs2)), bb:$imm16),
+ (BT32 (CMPHSI32 GPR:$rs1, oimm16:$rs2), bb:$imm16)>;
+ def : Pat<(brcond (i32 (setult GPR:$rs1, oimm16:$rs2)), bb:$imm16),
+ (BF32 (CMPHSI32 GPR:$rs1, oimm16:$rs2), bb:$imm16)>;
+ def : Pat<(brcond (i32 (setlt GPR:$rs1, oimm16:$rs2)), bb:$imm16),
+ (BT32 (CMPLTI32 GPR:$rs1, oimm16:$rs2), bb:$imm16)>;
+ def : Pat<(brcond (i32 (setge GPR:$rs1, oimm16:$rs2)), bb:$imm16),
+ (BF32 (CMPLTI32 GPR:$rs1, oimm16:$rs2), bb:$imm16)>;
+
+}
+
+let Predicates = [iHas2E3] in {
+
+def : Pat<(brcond (i32 (setne GPR:$rs1, GPR:$rs2)), bb:$imm16),
+ (BT32 (CMPNE32 GPR:$rs1, GPR:$rs2), bb:$imm16)>;
+def : Pat<(brcond (i32 (seteq GPR:$rs1, GPR:$rs2)), bb:$imm16),
+ (BF32 (CMPNE32 GPR:$rs1, GPR:$rs2), bb:$imm16)>;
+def : Pat<(brcond (i32 (setuge GPR:$rs1, GPR:$rs2)), bb:$imm16),
+ (BT32 (CMPHS32 GPR:$rs1, GPR:$rs2), bb:$imm16)>;
+def : Pat<(brcond (i32 (setule GPR:$rs1, GPR:$rs2)), bb:$imm16),
+ (BT32 (CMPHS32 GPR:$rs2, GPR:$rs1), bb:$imm16)>;
+def : Pat<(brcond (i32 (setult GPR:$rs1, GPR:$rs2)), bb:$imm16),
+ (BF32 (CMPHS32 GPR:$rs1, GPR:$rs2), bb:$imm16)>;
+def : Pat<(brcond (i32 (setugt GPR:$rs1, GPR:$rs2)), bb:$imm16),
+ (BF32 (CMPHS32 GPR:$rs2, GPR:$rs1), bb:$imm16)>;
+def : Pat<(brcond (i32 (setlt GPR:$rs1, GPR:$rs2)), bb:$imm16),
+ (BT32 (CMPLT32 GPR:$rs1, GPR:$rs2), bb:$imm16)>;
+def : Pat<(brcond (i32 (setgt GPR:$rs1, GPR:$rs2)), bb:$imm16),
+ (BT32 (CMPLT32 GPR:$rs2, GPR:$rs1), bb:$imm16)>;
+def : Pat<(brcond (i32 (setge GPR:$rs1, GPR:$rs2)), bb:$imm16),
+ (BF32 (CMPLT32 GPR:$rs1, GPR:$rs2), bb:$imm16)>;
+def : Pat<(brcond (i32 (setle GPR:$rs1, GPR:$rs2)), bb:$imm16),
+ (BF32 (CMPLT32 GPR:$rs2, GPR:$rs1), bb:$imm16)>;
+
+def : Pat<(brcond (i32 (seteq GPR:$rs1, (i32 0))), bb:$imm16),
+ (BEZ32 GPR:$rs1, bb:$imm16)>;
+def : Pat<(brcond (i32 (setne GPR:$rs1, (i32 0))), bb:$imm16),
+ (BNEZ32 GPR:$rs1, bb:$imm16)>;
+def : Pat<(brcond (i32 (setlt GPR:$rs1, (i32 0))), bb:$imm16),
+ (BLZ32 GPR:$rs1, bb:$imm16)>;
+def : Pat<(brcond (i32 (setge GPR:$rs1, (i32 0))), bb:$imm16),
+ (BHSZ32 GPR:$rs1, bb:$imm16)>;
+def : Pat<(brcond (i32 (setgt GPR:$rs1, (i32 0))), bb:$imm16),
+ (BHZ32 GPR:$rs1, bb:$imm16)>;
+def : Pat<(brcond (i32 (setle GPR:$rs1, (i32 0))), bb:$imm16),
+ (BLSZ32 GPR:$rs1, bb:$imm16)>;
+}
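For illustration, the brcond patterns above (including the compare-with-zero forms) are what ordinary conditional control flow selects through; the following maps onto the compare-with-zero forms such as setle or, after inversion, setgt against zero (a minimal sketch):

    // `x <= 0` as a branch condition uses the compare-with-zero patterns above.
    int clampPositive(int x) {
      if (x <= 0)
        return 0;
      return x;
    }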
+
+// Compare Patterns.
+let Predicates = [iHas2E3] in {
+ def : Pat<(setne GPR:$rs1, GPR:$rs2),
+ (CMPNE32 GPR:$rs1, GPR:$rs2)>;
+ def : Pat<(i32 (seteq GPR:$rs1, GPR:$rs2)),
+ (MVCV32 (CMPNE32 GPR:$rs1, GPR:$rs2))>;
+ def : Pat<(setuge GPR:$rs1, GPR:$rs2),
+ (CMPHS32 GPR:$rs1, GPR:$rs2)>;
+ def : Pat<(setule GPR:$rs1, GPR:$rs2),
+ (CMPHS32 GPR:$rs2, GPR:$rs1)>;
+ def : Pat<(i32 (setult GPR:$rs1, GPR:$rs2)),
+ (MVCV32 (CMPHS32 GPR:$rs1, GPR:$rs2))>;
+ def : Pat<(i32 (setugt GPR:$rs1, GPR:$rs2)),
+ (MVCV32 (CMPHS32 GPR:$rs2, GPR:$rs1))>;
+ def : Pat<(setlt GPR:$rs1, GPR:$rs2),
+ (CMPLT32 GPR:$rs1, GPR:$rs2)>;
+ def : Pat<(setgt GPR:$rs1, GPR:$rs2),
+ (CMPLT32 GPR:$rs2, GPR:$rs1)>;
+ def : Pat<(i32 (setge GPR:$rs1, GPR:$rs2)),
+ (MVCV32 (CMPLT32 GPR:$rs1, GPR:$rs2))>;
+ def : Pat<(i32 (setle GPR:$rs1, GPR:$rs2)),
+ (MVCV32 (CMPLT32 GPR:$rs2, GPR:$rs1))>;
+}
+
+let Predicates = [iHasE2] in {
+ def : Pat<(setne GPR:$rs1, uimm16:$rs2),
+ (CMPNEI32 GPR:$rs1, uimm16:$rs2)>;
+ let Predicates = [iHas2E3] in
+ def : Pat<(i32 (seteq GPR:$rs1, uimm16:$rs2)),
+ (MVCV32 (CMPNEI32 GPR:$rs1, uimm16:$rs2))>;
+ def : Pat<(setuge GPR:$rs1, oimm16:$rs2),
+ (CMPHSI32 GPR:$rs1, oimm16:$rs2)>;
+ let Predicates = [iHas2E3] in
+ def : Pat<(i32 (setult GPR:$rs1, oimm16:$rs2)),
+ (MVCV32 (CMPHSI32 GPR:$rs1, oimm16:$rs2))>;
+ def : Pat<(setlt GPR:$rs1, oimm16:$rs2),
+ (CMPLTI32 GPR:$rs1, oimm16:$rs2)>;
+ let Predicates = [iHas2E3] in
+ def : Pat<(i32 (setge GPR:$rs1, oimm16:$rs2)),
+ (MVCV32 (CMPLTI32 GPR:$rs1, oimm16:$rs2))>;
+}
+
+// Select Patterns.
+let Predicates = [iHasE2] in {
+def : Pat<(select CARRY:$ca, GPR:$rx, GPR:$false),
+ (MOVT32 CARRY:$ca, GPR:$rx, GPR:$false)>;
+def : Pat<(select (and CARRY:$ca, 1), GPR:$rx, GPR:$false),
+ (MOVT32 CARRY:$ca, GPR:$rx, GPR:$false)>;
+
+def : Pat<(select (i32 (setne GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false),
+ (MOVT32 (CMPNE32 GPR:$rs1, GPR:$rs2), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (setne GPR:$rs1, uimm16:$rs2)), GPR:$rx, GPR:$false),
+ (MOVT32 (CMPNEI32 GPR:$rs1, uimm16:$rs2), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (seteq GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false),
+ (MOVF32 (CMPNE32 GPR:$rs1, GPR:$rs2), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (seteq GPR:$rs1, uimm16:$rs2)), GPR:$rx, GPR:$false),
+ (MOVF32 (CMPNEI32 GPR:$rs1, uimm16:$rs2), GPR:$rx, GPR:$false)>;
+
+def : Pat<(select (i32 (setuge GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false),
+ (MOVT32 (CMPHS32 GPR:$rs1, GPR:$rs2), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (setuge GPR:$rs1, oimm16:$rs2)), GPR:$rx, GPR:$false),
+ (MOVT32 (CMPHSI32 GPR:$rs1, oimm16:$rs2), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (setule GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false),
+ (MOVT32 (CMPHS32 GPR:$rs2, GPR:$rs1), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (setult GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false),
+ (MOVF32 (CMPHS32 GPR:$rs1, GPR:$rs2), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (setult GPR:$rs1, oimm16:$rs2)), GPR:$rx, GPR:$false),
+ (MOVF32 (CMPHSI32 GPR:$rs1, oimm16:$rs2), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (setugt GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false),
+ (MOVF32 (CMPHS32 GPR:$rs2, GPR:$rs1), GPR:$rx, GPR:$false)>;
+
+def : Pat<(select (i32 (setlt GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false),
+ (MOVT32 (CMPLT32 GPR:$rs1, GPR:$rs2), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (setlt GPR:$rs1, oimm16:$rs2)), GPR:$rx, GPR:$false),
+ (MOVT32 (CMPLTI32 GPR:$rs1, oimm16:$rs2), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (setgt GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false),
+ (MOVT32 (CMPLT32 GPR:$rs2, GPR:$rs1), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (setge GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false),
+ (MOVF32 (CMPLT32 GPR:$rs1, GPR:$rs2), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (setge GPR:$rs1, oimm16:$rs2)), GPR:$rx, GPR:$false),
+ (MOVF32 (CMPLTI32 GPR:$rs1, oimm16:$rs2), GPR:$rx, GPR:$false)>;
+def : Pat<(select (i32 (setle GPR:$rs1, GPR:$rs2)), GPR:$rx, GPR:$false),
+ (MOVF32 (CMPLT32 GPR:$rs2, GPR:$rs1), GPR:$rx, GPR:$false)>;
+
+def : Pat<(select CARRY:$ca, GPR:$rx, GPR:$false),
+ (ISEL32 CARRY:$ca, GPR:$rx, GPR:$false)>;
+def : Pat<(select (and CARRY:$ca, 1), GPR:$rx, GPR:$false),
+ (ISEL32 CARRY:$ca, GPR:$rx, GPR:$false)>;
}
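For illustration, the select patterns above are what integer ternaries lower through once SelectionDAG has produced a select on a setcc or CARRY value; a plain conditional like the following is the shape they cover (a minimal sketch):

    // (a < b) ? x : y becomes (select (setlt a, b), x, y) in the DAG, which
    // the MOVT32/MOVF32 (or ISEL32) patterns above then match.
    int pickOnLess(int a, int b, int x, int y) {
      return (a < b) ? x : y;
    }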
// Constant materialize patterns.
@@ -1150,3 +1360,5 @@ def CONSTPOOL_ENTRY : CSKYPseudo<(outs),
(ins i32imm:$instid, i32imm:$cpidx, i32imm:$size), "", []>;
include "CSKYInstrInfo16Instr.td"
+include "CSKYInstrInfoF1.td"
+include "CSKYInstrInfoF2.td"
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfoF1.td b/llvm/lib/Target/CSKY/CSKYInstrInfoF1.td
new file mode 100644
index 000000000000..30cef024f35a
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfoF1.td
@@ -0,0 +1,420 @@
+//===- CSKYInstrInfoF1.td - CSKY Instruction Float1.0 ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the CSKY instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+def regseq_f1 : Operand<iPTR> {
+ let EncoderMethod = "getRegisterSeqOpValue";
+ let ParserMatchClass = RegSeqAsmOperand<"V1">;
+ let PrintMethod = "printRegisterSeq";
+ let DecoderMethod = "DecodeRegSeqOperandF1";
+ let MIOperandInfo = (ops sFPR32, uimm5);
+}
+
+def regseq_d1 : Operand<iPTR> {
+ let EncoderMethod = "getRegisterSeqOpValue";
+ let ParserMatchClass = RegSeqAsmOperand<"V1">;
+ let PrintMethod = "printRegisterSeq";
+ let DecoderMethod = "DecodeRegSeqOperandD1";
+ let MIOperandInfo = (ops sFPR64, uimm5);
+}
+
+def sFPR32Op : RegisterOperand<sFPR32, "printFPR">;
+def sFPR64Op : RegisterOperand<sFPR64, "printFPR">;
+def sFPR64_V_OP : RegisterOperand<sFPR64_V, "printFPR">;
+
+include "CSKYInstrFormatsF1.td"
+
+//===----------------------------------------------------------------------===//
+// CSKY specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+def SDT_BITCAST_TO_LOHI : SDTypeProfile<2, 1, [SDTCisSameAs<0, 1>]>;
+def CSKY_BITCAST_TO_LOHI : SDNode<"CSKYISD::BITCAST_TO_LOHI", SDT_BITCAST_TO_LOHI>;
+def SDT_BITCAST_FROM_LOHI : SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>]>;
+def CSKY_BITCAST_FROM_LOHI : SDNode<"CSKYISD::BITCAST_FROM_LOHI", SDT_BITCAST_FROM_LOHI>;
+//===----------------------------------------------------------------------===//
+// Operand and SDNode transformation definitions.
+//===----------------------------------------------------------------------===//
+
+def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
+
+def fpimm32_hi16 : SDNodeXForm<fpimm, [{
+ return CurDAG->getTargetConstant(
+ (N->getValueAPF().bitcastToAPInt().getZExtValue() >> 16) & 0xFFFF,
+ SDLoc(N), MVT::i32);
+}]>;
+
+def fpimm32_lo16 : SDNodeXForm<fpimm, [{
+ return CurDAG->getTargetConstant(
+ N->getValueAPF().bitcastToAPInt().getZExtValue() & 0xFFFF,
+ SDLoc(N), MVT::i32);
+}]>;
+
+class fpimm_xform<int width, int shift = 0> : SDNodeXForm<fpimm,
+ "return CurDAG->getTargetConstant(N->getValueAPF().bitcastToAPInt().lshr("#shift#").getLoBits("#width#"), SDLoc(N), MVT::i32);">;
+
+class fpimm_xform_i16<int width, int shift = 0> : SDNodeXForm<fpimm,
+ "return CurDAG->getTargetConstant(N->getValueAPF().bitcastToAPInt().lshr("#shift#").getLoBits("#width#"), SDLoc(N), MVT::i16);">;
+
+class fpimm_t<int width, int shift = 0> : PatLeaf<(fpimm),
+ "return isShiftedUInt<"#width#", "#shift#">(N->getValueAPF().bitcastToAPInt().getZExtValue());">;
+
+def fpimm8 : fpimm_t<8>;
+def fpimm8_8 : fpimm_t<8, 8>;
+def fpimm8_16 : fpimm_t<8, 16>;
+def fpimm8_24 : fpimm_t<8, 24>;
+def fpimm16 : fpimm_t<16>;
+def fpimm16_8 : fpimm_t<16, 8>;
+def fpimm16_16 : fpimm_t<16, 16>;
+def fpimm24 : fpimm_t<24>;
+def fpimm24_8 : fpimm_t<24, 8>;
+def fpimm32 : fpimm_t<32>;
+
+def fpimm8_sr0_XFORM : fpimm_xform<8>;
+def fpimm8_sr8_XFORM : fpimm_xform<8, 8>;
+def fpimm8_sr16_XFORM : fpimm_xform<8, 16>;
+def fpimm8_sr24_XFORM : fpimm_xform<8, 24>;
+
+def fpimm8_sr0_i16_XFORM : fpimm_xform_i16<8>;
+def fpimm8_sr8_i16_XFORM : fpimm_xform_i16<8, 8>;
+
+def fconstpool_symbol : Operand<iPTR> {
+ let ParserMatchClass = Constpool;
+ let EncoderMethod =
+ "getConstpoolSymbolOpValue<CSKY::fixup_csky_pcrel_uimm8_scale4>";
+ let DecoderMethod = "decodeUImmOperand<8, 2>";
+ let PrintMethod = "printConstpool";
+ let OperandType = "OPERAND_PCREL";
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Arithmetic
+
+def FABSM : F_XZ<0x2, 0b000110, "fabsm", "", UnOpFrag<(fabs node:$Src)>, sFPR64_V_OP>;
+def FNEGM : F_XZ<0x2, 0b000111, "fnegm", "", UnOpFrag<(fneg node:$Src)>, sFPR64_V_OP>;
+def FADDM : F_XYZ<0x2, 0b000000, "faddm", "", BinOpFrag<(fadd node:$LHS, node:$RHS)>, sFPR64_V_OP>;
+def FSUBM : F_XYZ<0x2, 0b000001, "fsubm", "", BinOpFrag<(fsub node:$LHS, node:$RHS)>, sFPR64_V_OP>;
+def FMULM : F_XYZ<0x2, 0b010000, "fmulm", "", BinOpFrag<(fmul node:$LHS, node:$RHS)>, sFPR64_V_OP>;
+def FNMULM : F_XYZ<0x2, 0b010001, "fnmulm", "", BinOpFrag<(fneg (fmul node:$LHS, node:$RHS))>, sFPR64_V_OP>;
+def FMACM : F_ACCUM_XYZ<0x2, 0b010100, "fmacm", "", TriOpFrag<(fadd node:$LHS, (fmul node:$MHS, node:$RHS))>, sFPR64_V_OP>;
+def FMSCM : F_ACCUM_XYZ<0x2, 0b010101, "fmscm", "", TriOpFrag<(fsub (fmul node:$MHS, node:$RHS), node:$LHS)>, sFPR64_V_OP>;
+def FNMACM : F_ACCUM_XYZ<0x2, 0b010110, "fnmacm", "", TriOpFrag<(fsub node:$LHS, (fmul node:$MHS, node:$RHS))>, sFPR64_V_OP>;
+def FNMSCM : F_ACCUM_XYZ<0x2, 0b010111, "fnmscm", "", TriOpFrag<(fneg (fadd node:$LHS, (fmul node:$MHS, node:$RHS)))>, sFPR64_V_OP>;
+
+def FMOVM : F_MOV<0x2, 0b000100, "fmovm", "", sFPR64_V_OP>;
+
+defm FABS : FT_XZ<0b000110, "fabs", UnOpFrag<(fabs node:$Src)>>;
+defm FNEG : FT_XZ<0b000111, "fneg", UnOpFrag<(fneg node:$Src)>>;
+defm FSQRT : FT_XZ<0b011010, "fsqrt", UnOpFrag<(fsqrt node:$Src)>>;
+
+defm FADD : FT_XYZ<0b000000, "fadd", BinOpFrag<(fadd node:$LHS, node:$RHS)>>;
+defm FSUB : FT_XYZ<0b000001, "fsub", BinOpFrag<(fsub node:$LHS, node:$RHS)>>;
+defm FDIV : FT_XYZ<0b011000, "fdiv", BinOpFrag<(fdiv node:$LHS, node:$RHS)>>;
+defm FMUL : FT_XYZ<0b010000, "fmul", BinOpFrag<(fmul node:$LHS, node:$RHS)>>;
+defm FNMUL : FT_XYZ<0b010001, "fnmul", BinOpFrag<(fneg (fmul node:$LHS, node:$RHS))>>;
+defm FMAC : FT_ACCUM_XYZ<0b010100, "fmac", TriOpFrag<(fadd node:$LHS, (fmul node:$MHS, node:$RHS))>>;
+defm FMSC : FT_ACCUM_XYZ<0b010101, "fmsc", TriOpFrag<(fsub (fmul node:$MHS, node:$RHS), node:$LHS)>>;
+defm FNMAC : FT_ACCUM_XYZ<0b010110, "fnmac", TriOpFrag<(fsub node:$LHS, (fmul node:$MHS, node:$RHS))>>;
+defm FNMSC : FT_ACCUM_XYZ<0b010111, "fnmsc", TriOpFrag<(fneg (fadd node:$LHS, (fmul node:$MHS, node:$RHS)))>>;
+
+defm FCMPHS : FT_CMPXY<0b001100, "fcmphs">;
+defm FCMPLT : FT_CMPXY<0b001101, "fcmplt">;
+defm FCMPNE : FT_CMPXY<0b001110, "fcmpne">;
+defm FCMPUO : FT_CMPXY<0b001111, "fcmpuo">;
+defm FCMPZHS : FT_CMPZX<0b001000, "fcmpzhs">;
+defm FCMPZLS : FT_CMPZX<0b001001, "fcmpzls">;
+defm FCMPZNE : FT_CMPZX<0b001010, "fcmpzne">;
+defm FCMPZUO : FT_CMPZX<0b001011, "fcmpzuo">;
+
+defm FRECIP : FT_MOV<0b011001, "frecip">;
+
+// fmov, fmtvr, fmfvr
+defm FMOV : FT_MOV<0b000100, "fmov">;
+def FMFVRL : F_XZ_GF<3, 0b011001, (outs GPR:$rz), (ins sFPR32Op:$vrx),
+ "fmfvrl\t$rz, $vrx", [(set GPR:$rz, (bitconvert sFPR32Op:$vrx))]>;
+def FMTVRL : F_XZ_FG<3, 0b011011, (outs sFPR32Op:$vrz), (ins GPR:$rx),
+ "fmtvrl\t$vrz, $rx", [(set sFPR32Op:$vrz, (bitconvert GPR:$rx))]>;
+
+let Predicates = [HasFPUv2_DF] in {
+ let isCodeGenOnly = 1 in
+ def FMFVRL_D : F_XZ_GF<3, 0b011001, (outs GPR:$rz), (ins sFPR64Op:$vrx),
+ "fmfvrl\t$rz, $vrx", []>;
+ def FMFVRH_D : F_XZ_GF<3, 0b011000, (outs GPR:$rz), (ins sFPR64Op:$vrx),
+ "fmfvrh\t$rz, $vrx", []>;
+ let isCodeGenOnly = 1 in
+ def FMTVRL_D : F_XZ_FG<3, 0b011011, (outs sFPR64Op:$vrz), (ins GPR:$rx),
+ "fmtvrl\t$vrz, $rx", []>;
+let Constraints = "$vrZ = $vrz" in
+ def FMTVRH_D : F_XZ_FG<3, 0b011010, (outs sFPR64Op:$vrz), (ins sFPR64Op:$vrZ, GPR:$rx),
+ "fmtvrh\t$vrz, $rx", []>;
+}
+
+// fcvt
+
+def FSITOS : F_XZ_TRANS<0b010000, "fsitos", sFPR32Op, sFPR32Op>;
+def : Pat<(f32 (sint_to_fp GPR:$a)),
+ (FSITOS (COPY_TO_REGCLASS GPR:$a, sFPR32))>,
+ Requires<[HasFPUv2_SF]>;
+
+def FUITOS : F_XZ_TRANS<0b010001, "fuitos", sFPR32Op, sFPR32Op>;
+def : Pat<(f32 (uint_to_fp GPR:$a)),
+ (FUITOS (COPY_TO_REGCLASS GPR:$a, sFPR32))>,
+ Requires<[HasFPUv2_SF]>;
+
+def FSITOD : F_XZ_TRANS<0b010100, "fsitod", sFPR64Op, sFPR64Op>;
+def : Pat<(f64 (sint_to_fp GPR:$a)),
+ (FSITOD (COPY_TO_REGCLASS GPR:$a, sFPR64))>,
+ Requires<[HasFPUv2_DF]>;
+
+def FUITOD : F_XZ_TRANS<0b010101, "fuitod", sFPR64Op, sFPR64Op>;
+def : Pat<(f64 (uint_to_fp GPR:$a)),
+ (FUITOD (COPY_TO_REGCLASS GPR:$a, sFPR64))>,
+ Requires<[HasFPUv2_DF]>;
+
+let Predicates = [HasFPUv2_DF] in {
+def FDTOS : F_XZ_TRANS_DS<0b010110,"fdtos", UnOpFrag<(fpround node:$Src)>>;
+def FSTOD : F_XZ_TRANS_SD<0b010111,"fstod", UnOpFrag<(fpextend node:$Src)>>;
+}
+
+def rpiFSTOSI : F_XZ_TRANS<0b000010, "fstosi.rpi", sFPR32Op, sFPR32Op>;
+def rpiFSTOUI : F_XZ_TRANS<0b000110, "fstoui.rpi", sFPR32Op, sFPR32Op>;
+def rzFSTOSI : F_XZ_TRANS<0b000001, "fstosi.rz", sFPR32Op, sFPR32Op>;
+def rzFSTOUI : F_XZ_TRANS<0b000101, "fstoui.rz", sFPR32Op, sFPR32Op>;
+def rnFSTOSI : F_XZ_TRANS<0b000000, "fstosi.rn", sFPR32Op, sFPR32Op>;
+def rnFSTOUI : F_XZ_TRANS<0b000100, "fstoui.rn", sFPR32Op, sFPR32Op>;
+def rniFSTOSI : F_XZ_TRANS<0b000011, "fstosi.rni", sFPR32Op, sFPR32Op>;
+def rniFSTOUI : F_XZ_TRANS<0b000111, "fstoui.rni", sFPR32Op, sFPR32Op>;
+
+let Predicates = [HasFPUv2_DF] in {
+def rpiFDTOSI : F_XZ_TRANS<0b001010, "fdtosi.rpi", sFPR64Op, sFPR64Op>;
+def rpiFDTOUI : F_XZ_TRANS<0b001110, "fdtoui.rpi", sFPR64Op, sFPR64Op>;
+def rzFDTOSI : F_XZ_TRANS<0b001001, "fdtosi.rz", sFPR64Op, sFPR64Op>;
+def rzFDTOUI : F_XZ_TRANS<0b001101, "fdtoui.rz", sFPR64Op, sFPR64Op>;
+def rnFDTOSI : F_XZ_TRANS<0b001000, "fdtosi.rn", sFPR64Op, sFPR64Op>;
+def rnFDTOUI : F_XZ_TRANS<0b001100, "fdtoui.rn", sFPR64Op, sFPR64Op>;
+def rniFDTOSI : F_XZ_TRANS<0b001011, "fdtosi.rni", sFPR64Op, sFPR64Op>;
+def rniFDTOUI : F_XZ_TRANS<0b001111, "fdtoui.rni", sFPR64Op, sFPR64Op>;
+}
+
+multiclass FPToIntegerPats<SDNode round, string SUFFIX> {
+ def : Pat<(i32 (fp_to_sint (round sFPR32Op:$Rn))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FSTOSI) sFPR32Op:$Rn), GPR)>,
+ Requires<[HasFPUv2_SF]>;
+ def : Pat<(i32 (fp_to_uint (round sFPR32Op:$Rn))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FSTOUI) sFPR32Op:$Rn), GPR)>,
+ Requires<[HasFPUv2_SF]>;
+ def : Pat<(i32 (fp_to_sint (round sFPR64Op:$Rn))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FDTOSI) sFPR64Op:$Rn), GPR)>,
+ Requires<[HasFPUv2_DF]>;
+ def : Pat<(i32 (fp_to_uint (round sFPR64Op:$Rn))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FDTOUI) sFPR64Op:$Rn), GPR)>,
+ Requires<[HasFPUv2_DF]>;
+}
+
+defm: FPToIntegerPats<fceil, "rpi">;
+defm: FPToIntegerPats<fround, "rn">;
+defm: FPToIntegerPats<ffloor, "rni">;
+
+multiclass FPToIntegerTowardszeroPats<string SUFFIX> {
+ def : Pat<(i32 (fp_to_sint sFPR32Op:$Rn)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FSTOSI) sFPR32Op:$Rn), GPR)>,
+ Requires<[HasFPUv2_SF]>;
+ def : Pat<(i32 (fp_to_uint sFPR32Op:$Rn)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FSTOUI) sFPR32Op:$Rn), GPR)>,
+ Requires<[HasFPUv2_SF]>;
+ def : Pat<(i32 (fp_to_sint sFPR64Op:$Rn)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FDTOSI) sFPR64Op:$Rn), GPR)>,
+ Requires<[HasFPUv2_DF]>;
+ def : Pat<(i32 (fp_to_uint sFPR64Op:$Rn)),
+ (COPY_TO_REGCLASS (!cast<Instruction>(SUFFIX # FDTOUI) sFPR64Op:$Rn), GPR)>,
+ Requires<[HasFPUv2_DF]>;
+}
+
+defm: FPToIntegerTowardszeroPats<"rz">;
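+
+// Judging from the mappings above, the suffix encodes the rounding mode:
+// .rn rounds to nearest (fround), .rpi toward +infinity (fceil), .rni toward
+// -infinity (ffloor), and .rz toward zero, which is the default behaviour for
+// plain fp_to_sint/fp_to_uint.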
+
+
+// fld, fst
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ defm FLD : FT_XYAI_LD<0b0010000, "fld">;
+ defm FLDR : FT_XYAR_LD<0b0010100, "fldr">;
+ defm FLDM : FT_XYAR_LDM<0b0011000, "fldm">;
+
+ let Predicates = [HasFPUv2_DF] in
+ def FLDRM : F_XYAR_LD<0b0010101, 0, "fldrm", "", sFPR64Op>;
+ let Predicates = [HasFPUv2_DF] in
+ def FLDMM : F_I4_XY_MEM<0b0011001, 0,
+ (outs), (ins GPR:$rx, regseq_d1:$regs, variable_ops), "fldmm\t$regs, (${rx})", []>;
+ let Predicates = [HasFPUv2_DF] in
+ def FLDM : F_XYAI_LD<0b0010001, 0, "fldm", "", sFPR64Op, uimm8_3>;
+}
+
+
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+ defm FST : FT_XYAI_ST<0b0010010, "fst">;
+ defm FSTR : FT_XYAR_ST<0b0010110, "fstr">;
+ defm FSTM : FT_XYAR_STM<0b0011010, "fstm">;
+
+ let Predicates = [HasFPUv2_DF] in
+ def FSTRM : F_XYAR_ST<0b0010111, 0, "fstrm", "", sFPR64Op>;
+ let Predicates = [HasFPUv2_DF] in
+ def FSTMM : F_I4_XY_MEM<0b0011011, 0,
+ (outs), (ins GPR:$rx, regseq_d1:$regs, variable_ops), "fstmm\t$regs, (${rx})", []>;
+ let Predicates = [HasFPUv2_DF] in
+ def FSTM : F_XYAI_ST<0b0010011, 0, "fstm", "", sFPR64Op, uimm8_3>;
+}
+
+defm : LdPat<load, uimm8_2, FLD_S, f32>, Requires<[HasFPUv2_SF]>;
+defm : LdPat<load, uimm8_2, FLD_D, f64>, Requires<[HasFPUv2_DF]>;
+defm : LdrPat<load, FLDR_S, f32>, Requires<[HasFPUv2_SF]>;
+defm : LdrPat<load, FLDR_D, f64>, Requires<[HasFPUv2_DF]>;
+
+defm : StPat<store, f32, uimm8_2, FST_S>, Requires<[HasFPUv2_SF]>;
+defm : StPat<store, f64, uimm8_2, FST_D>, Requires<[HasFPUv2_DF]>;
+defm : StrPat<store, f32, FSTR_S>, Requires<[HasFPUv2_SF]>;
+defm : StrPat<store, f64, FSTR_D>, Requires<[HasFPUv2_DF]>;
+
+
+def : Pat<(f32 fpimm16:$imm), (COPY_TO_REGCLASS (MOVI32 (fpimm32_lo16 fpimm16:$imm)), sFPR32)>,
+ Requires<[HasFPUv2_SF]>;
+def : Pat<(f32 fpimm16_16:$imm), (f32 (COPY_TO_REGCLASS (MOVIH32 (fpimm32_hi16 fpimm16_16:$imm)), sFPR32))>,
+ Requires<[HasFPUv2_SF]>;
+def : Pat<(f32 fpimm:$imm), (COPY_TO_REGCLASS (ORI32 (MOVIH32 (fpimm32_hi16 fpimm:$imm)), (fpimm32_lo16 fpimm:$imm)), sFPR32)>,
+ Requires<[HasFPUv2_SF]>;
+
+def : Pat<(f64 (CSKY_BITCAST_FROM_LOHI GPR:$rs1, GPR:$rs2)), (FMTVRH_D (FMTVRL_D GPR:$rs1), GPR:$rs2)>,
+ Requires<[HasFPUv2_DF]>;
+
+multiclass BRCond_Bin<CondCode CC, string Instr, Instruction Br, Instruction MV> {
+ let Predicates = [HasFPUv2_SF] in
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, sFPR32Op:$rs2, CC)), bb:$imm16),
+ (Br (!cast<Instruction>(Instr#_S) sFPR32Op:$rs1, sFPR32Op:$rs2), bb:$imm16)>;
+ let Predicates = [HasFPUv2_DF] in
+ def : Pat<(brcond (i32 (setcc sFPR64Op:$rs1, sFPR64Op:$rs2, CC)), bb:$imm16),
+ (Br (!cast<Instruction>(Instr#_D) sFPR64Op:$rs1, sFPR64Op:$rs2), bb:$imm16)>;
+
+ let Predicates = [HasFPUv2_SF] in
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, sFPR32Op:$rs2, CC)),
+ (MV (!cast<Instruction>(Instr#_S) sFPR32Op:$rs1, sFPR32Op:$rs2))>;
+ let Predicates = [HasFPUv2_DF] in
+ def : Pat<(i32 (setcc sFPR64Op:$rs1, sFPR64Op:$rs2, CC)),
+ (MV (!cast<Instruction>(Instr#_D) sFPR64Op:$rs1, sFPR64Op:$rs2))>;
+}
+
+multiclass BRCond_Bin_SWAP<CondCode CC, string Instr, Instruction Br, Instruction MV> {
+ let Predicates = [HasFPUv2_SF] in
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, sFPR32Op:$rs2, CC)), bb:$imm16),
+ (Br (!cast<Instruction>(Instr#_S) sFPR32Op:$rs2, sFPR32Op:$rs1), bb:$imm16)>;
+ let Predicates = [HasFPUv2_DF] in
+ def : Pat<(brcond (i32 (setcc sFPR64Op:$rs1, sFPR64Op:$rs2, CC)), bb:$imm16),
+ (Br (!cast<Instruction>(Instr#_D) sFPR64Op:$rs2, sFPR64Op:$rs1), bb:$imm16)>;
+
+ let Predicates = [HasFPUv2_SF] in
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, sFPR32Op:$rs2, CC)),
+ (MV (!cast<Instruction>(Instr#_S) sFPR32Op:$rs2, sFPR32Op:$rs1))>;
+ let Predicates = [HasFPUv2_DF] in
+ def : Pat<(i32 (setcc sFPR64Op:$rs1, sFPR64Op:$rs2, CC)),
+ (MV (!cast<Instruction>(Instr#_D) sFPR64Op:$rs2, sFPR64Op:$rs1))>;
+}
+
+// !(ordered && cmp) == (unordered || !cmp): conditions that have no compare
+// instruction of their own are lowered through their complement or by
+// swapping operands (see the example after these mappings).
+
+defm : BRCond_Bin<SETUNE, "FCMPNE", BT32, MVC32>;
+defm : BRCond_Bin<SETOEQ, "FCMPNE", BF32, MVCV32>;
+defm : BRCond_Bin<SETOGE, "FCMPHS", BT32, MVC32>;
+defm : BRCond_Bin<SETOLT, "FCMPLT", BT32, MVC32>;
+defm : BRCond_Bin<SETUO, "FCMPUO", BT32, MVC32>;
+defm : BRCond_Bin<SETO, "FCMPUO", BF32, MVCV32>;
+defm : BRCond_Bin_SWAP<SETOGT, "FCMPLT", BT32, MVC32>;
+defm : BRCond_Bin_SWAP<SETOLE, "FCMPHS", BT32, MVC32>;
+
+defm : BRCond_Bin<SETNE, "FCMPNE", BT32, MVC32>;
+defm : BRCond_Bin<SETEQ, "FCMPNE", BF32, MVCV32>;
+defm : BRCond_Bin<SETGE, "FCMPHS", BT32, MVC32>;
+defm : BRCond_Bin<SETLT, "FCMPLT", BT32, MVC32>;
+defm : BRCond_Bin_SWAP<SETGT, "FCMPLT", BT32, MVC32>;
+defm : BRCond_Bin_SWAP<SETLE, "FCMPHS", BT32, MVC32>;
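+
+// For example, there is no direct "equal" compare: SETOEQ is the complement of
+// fcmpne, so it branches with BF32 and reads the flag with MVCV32, while
+// SETUNE uses the same fcmpne with BT32/MVC32. SETOGT/SETOLE likewise have no
+// instruction of their own and reuse fcmplt/fcmphs with swapped operands.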
+
+// -----------
+
+let Predicates = [HasFPUv2_SF] in {
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETOGE)), bb:$imm16),
+ (BT32 (FCMPZHS_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETOGE)),
+ (MVC32 (FCMPZHS_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETOLT)), bb:$imm16),
+ (BF32 (FCMPZHS_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETOLT)),
+ (MVCV32 (FCMPZHS_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETOLE)), bb:$imm16),
+ (BT32 (FCMPZLS_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETOLE)),
+ (MVC32 (FCMPZLS_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETOGT)), bb:$imm16),
+ (BF32 (FCMPZLS_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETOGT)),
+ (MVCV32 (FCMPZLS_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETUNE)), bb:$imm16),
+ (BT32 (FCMPZNE_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETUNE)),
+ (MVC32 (FCMPZNE_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETOEQ)), bb:$imm16),
+ (BF32 (FCMPZNE_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETOEQ)),
+ (MVCV32 (FCMPZNE_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm, SETUO)), bb:$imm16),
+ (BT32 (FCMPZUO_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm, SETUO)),
+ (MVC32 (FCMPZUO_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm, SETO)), bb:$imm16),
+ (BF32 (FCMPZUO_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm, SETO)),
+ (MVCV32 (FCMPZUO_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETGE)), bb:$imm16),
+ (BT32 (FCMPZHS_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETGE)),
+ (MVC32 (FCMPZHS_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETLT)), bb:$imm16),
+ (BF32 (FCMPZHS_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETLT)),
+ (MVCV32 (FCMPZHS_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETLE)), bb:$imm16),
+ (BT32 (FCMPZLS_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETLE)),
+ (MVC32 (FCMPZLS_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETGT)), bb:$imm16),
+ (BF32 (FCMPZLS_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETGT)),
+ (MVCV32 (FCMPZLS_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETNE)), bb:$imm16),
+ (BT32 (FCMPZNE_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETNE)),
+ (MVC32 (FCMPZNE_S sFPR32Op:$rs1))>;
+ def : Pat<(brcond (i32 (setcc sFPR32Op:$rs1, fpimm0, SETEQ)), bb:$imm16),
+ (BF32 (FCMPZNE_S sFPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc sFPR32Op:$rs1, fpimm0, SETEQ)),
+ (MVCV32 (FCMPZNE_S sFPR32Op:$rs1))>;
+}
+
+let usesCustomInserter = 1 in {
+ let Predicates = [HasFPUv2_SF] in
+ def FSELS : CSKYPseudo<(outs sFPR32Op:$dst), (ins CARRY:$cond, sFPR32Op:$src1, sFPR32Op:$src2),
+ "!fsels\t$dst, $src1, src2", [(set sFPR32Op:$dst, (select CARRY:$cond, sFPR32Op:$src1, sFPR32Op:$src2))]>;
+
+ let Predicates = [HasFPUv2_DF] in
+ def FSELD : CSKYPseudo<(outs sFPR64Op:$dst), (ins CARRY:$cond, sFPR64Op:$src1, sFPR64Op:$src2),
+ "!fseld\t$dst, $src1, src2", [(set sFPR64Op:$dst, (select CARRY:$cond, sFPR64Op:$src1, sFPR64Op:$src2))]>;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfoF2.td b/llvm/lib/Target/CSKY/CSKYInstrInfoF2.td
new file mode 100644
index 000000000000..8a00e7d9af3a
--- /dev/null
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfoF2.td
@@ -0,0 +1,462 @@
+//===- CSKYInstrInfoF2.td - CSKY Instruction Float2.0 ------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the CSKY instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+def regseq_f2 : Operand<i32> {
+ let EncoderMethod = "getRegisterSeqOpValue";
+ let ParserMatchClass = RegSeqAsmOperand<"V2">;
+ let PrintMethod = "printRegisterSeq";
+ let DecoderMethod = "DecodeRegSeqOperandF2";
+ let MIOperandInfo = (ops FPR32, uimm5);
+}
+
+def regseq_d2 : Operand<i32> {
+ let EncoderMethod = "getRegisterSeqOpValue";
+ let ParserMatchClass = RegSeqAsmOperand<"V2">;
+ let PrintMethod = "printRegisterSeq";
+ let DecoderMethod = "DecodeRegSeqOperandD2";
+ let MIOperandInfo = (ops FPR64, uimm5);
+}
+
+def FPR32Op : RegisterOperand<FPR32, "printFPR">;
+def FPR64Op : RegisterOperand<FPR64, "printFPR">;
+
+include "CSKYInstrFormatsF2.td"
+
+// Predicates
+def IsOrAdd: PatFrag<(ops node:$A, node:$B), (or node:$A, node:$B), [{
+ return isOrEquivalentToAdd(N);
+}]>;
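+
+// IsOrAdd matches `or` nodes that are provably equivalent to `add` (no bit of
+// one operand can carry into the other), presumably so that address
+// computations built with `or` can reuse the add-based addressing patterns.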
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+defm f2FADD : F2_XYZ_T<0b000000, "fadd", BinOpFrag<(fadd node:$LHS, node:$RHS)>>;
+defm f2FSUB : F2_XYZ_T<0b000001, "fsub", BinOpFrag<(fsub node:$LHS, node:$RHS)>>;
+defm f2FDIV : F2_XYZ_T<0b011000, "fdiv", BinOpFrag<(fdiv node:$LHS, node:$RHS)>>;
+defm f2FMUL : F2_XYZ_T<0b010000, "fmul", BinOpFrag<(fmul node:$LHS, node:$RHS)>>;
+
+defm f2FMAXNM : F2_XYZ_T<0b101000, "fmaxnm", BinOpFrag<(fmaxnum node:$LHS, node:$RHS)>>;
+defm f2FMINNM : F2_XYZ_T<0b101001, "fminnm", BinOpFrag<(fminnum node:$LHS, node:$RHS)>>;
+
+defm f2FABS : F2_XZ_T<0b000110, "fabs", fabs>;
+defm f2FNEG : F2_XZ_T<0b000111, "fneg", fneg>;
+defm f2FSQRT : F2_XZ_T<0b011010, "fsqrt", fsqrt>;
+defm f2FMOV : F2_XZ_SET_T<0b000100, "fmov">;
+def f2FMOVX : F2_XZ_SET<0b00001, FPR32Op, 0b000101, "fmovx.32">;
+
+defm f2RECIP : F2_XZ_SET_T<0b011001, "frecip">;
+
+// fld/fst
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ def f2FLD_S : F2_LDST_S<0b0, "fld", (outs FPR32Op:$vrz), (ins GPR:$rx, uimm8_2:$imm8)>;
+ let Predicates = [HasFPUv3_DF] in
+ def f2FLD_D : F2_LDST_D<0b0, "fld", (outs FPR64Op:$vrz), (ins GPR:$rx, uimm8_2:$imm8)>;
+}
+let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+ def f2FST_S : F2_LDST_S<0b1, "fst", (outs), (ins FPR32Op:$vrz, GPR:$rx, uimm8_2:$imm8)>;
+ let Predicates = [HasFPUv3_DF] in
+ def f2FST_D : F2_LDST_D<0b1, "fst", (outs), (ins FPR64Op:$vrz, GPR:$rx, uimm8_2:$imm8)>;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+ def f2FSTM_S : F2_LDSTM_S<0b1, 0, "fstm", (outs), (ins GPR:$rx, regseq_f2:$regs, variable_ops)>;
+ let Predicates = [HasFPUv3_DF] in
+ def f2FSTM_D : F2_LDSTM_D<0b1, 0, "fstm", (outs), (ins GPR:$rx, regseq_d2:$regs, variable_ops)>;
+
+ def f2FSTMU_S : F2_LDSTM_S<0b1, 0b100, "fstmu", (outs), (ins GPR:$rx, regseq_f2:$regs, variable_ops)>;
+ let Predicates = [HasFPUv3_DF] in
+ def f2FSTMU_D : F2_LDSTM_D<0b1, 0b100, "fstmu", (outs), (ins GPR:$rx, regseq_d2:$regs, variable_ops)>;
+}
+
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ def f2FLDM_S : F2_LDSTM_S<0b0, 0, "fldm", (outs), (ins GPR:$rx, regseq_f2:$regs, variable_ops)>;
+ let Predicates = [HasFPUv3_DF] in
+ def f2FLDM_D : F2_LDSTM_D<0b0, 0, "fldm", (outs), (ins GPR:$rx, regseq_d2:$regs, variable_ops)>;
+
+ def f2FLDMU_S : F2_LDSTM_S<0b0, 0b100, "fldmu", (outs), (ins GPR:$rx, regseq_f2:$regs, variable_ops)>;
+ let Predicates = [HasFPUv3_DF] in
+ def f2FLDMU_D : F2_LDSTM_D<0b0, 0b100, "fldmu", (outs), (ins GPR:$rx, regseq_d2:$regs, variable_ops)>;
+}
+
+multiclass FLSR {
+ let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+ def FLDR_S : F2_LDSTR_S<0b0, "fldr", (outs FPR32Op:$rz), (ins GPR:$rx, GPR:$ry, uimm2:$imm)>;
+ let Predicates = [HasFPUv3_DF] in
+ def FLDR_D : F2_LDSTR_D<0b0, "fldr", (outs FPR64Op:$rz), (ins GPR:$rx, GPR:$ry, uimm2:$imm)>;
+ }
+ let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+ def FSTR_S : F2_LDSTR_S<0b1, "fstr", (outs), (ins FPR32Op:$rz, GPR:$rx, GPR:$ry, uimm2:$imm)>;
+ let Predicates = [HasFPUv3_DF] in
+ def FSTR_D : F2_LDSTR_D<0b1, "fstr", (outs), (ins FPR64Op:$rz, GPR:$rx, GPR:$ry, uimm2:$imm)>;
+ }
+}
+
+defm f2: FLSR;
+
+def f2FLRW_S : F2_LRW<0b00, 0b0, "flrw.32", (outs FPR32Op:$vrz), (ins fconstpool_symbol:$imm8)>;
+def f2FLRW_D : F2_LRW<0b01, 0b0, "flrw.64", (outs FPR64Op:$vrz), (ins fconstpool_symbol:$imm8)>;
+
+def : Pat<(f32 (load constpool:$src)), (f2FLRW_S (to_tconstpool tconstpool:$src))>, Requires<[HasFPUv3_SF]>;
+def : Pat<(f64 (load constpool:$src)), (f2FLRW_D (to_tconstpool tconstpool:$src))>, Requires<[HasFPUv3_DF]>;
+
+defm : LdPat<load, uimm8_2, f2FLD_S, f32>, Requires<[HasFPUv3_SF]>;
+defm : LdPat<load, uimm8_2, f2FLD_D, f64>, Requires<[HasFPUv3_DF]>;
+defm : LdrPat<load, f2FLDR_S, f32>, Requires<[HasFPUv3_SF]>;
+defm : LdrPat<load, f2FLDR_D, f64>, Requires<[HasFPUv3_DF]>;
+
+defm : StPat<store, f32, uimm8_2, f2FST_S>, Requires<[HasFPUv3_SF]>;
+defm : StPat<store, f64, uimm8_2, f2FST_D>, Requires<[HasFPUv3_DF]>;
+defm : StrPat<store, f32, f2FSTR_S>, Requires<[HasFPUv3_SF]>;
+defm : StrPat<store, f64, f2FSTR_D>, Requires<[HasFPUv3_DF]>;
+
+// fmfvr
+let vry = 0 in
+def f2FMFVRL : F2_XYZ<0b00011, 0b011001, "fmfvr.32.1\t$vrz, $vrx",
+ (outs GPR:$vrz), (ins FPR32Op:$vrx),
+ [(set GPR:$vrz, (bitconvert FPR32Op:$vrx))]>;
+// TODO: vrz and vrz+1
+def f2FMFVRL_2 : F2_XYZ<0b00011, 0b111010, "fmfvr.32.2\t$vrz, $vry, $vrx",
+ (outs GPR:$vrz, GPR:$vry), (ins FPR64Op:$vrx),
+ []>;
+
+let Predicates = [HasFPUv3_DF] in {
+let vry = 0 in {
+let isCodeGenOnly = 1 in
+def f2FMFVRL_D : F2_XYZ<0b00011, 0b011001, "fmfvr.32.1\t$vrz, $vrx",
+ (outs GPR:$vrz), (ins FPR64Op:$vrx),
+ []>;
+def f2FMFVRH_D : F2_XYZ<0b00011, 0b011000, "fmfvrh\t$vrz, $vrx",
+ (outs GPR:$vrz), (ins FPR64Op:$vrx),
+ []>;
+}
+def f2FMFVR_D : F2_XYZ<0b00011, 0b111000, "fmfvr.64\t$vrz, $vry, $vrx",
+ (outs GPR:$vrz, GPR:$vry), (ins FPR64Op:$vrx),
+ [(set GPR:$vrz, GPR:$vry, (CSKY_BITCAST_TO_LOHI FPR64Op:$vrx))]>;
+}
+
+// fmtvr
+def f2FMTVRL : F2_XZ_P<0b00011, 0b011011, "fmtvr.32.1",
+ [(set FPR32Op:$vrz, (bitconvert GPR:$vrx))],
+ (outs FPR32Op:$vrz), (ins GPR:$vrx)>;
+// TODO: vrz and vrz+1
+def f2FMTVRL_2 : F2_XYZ<0b00011, 0b111110, "fmtvr.32.2\t$vrz, $vrx, $vry",
+ (outs FPR32Op:$vrz), (ins GPR:$vrx, GPR:$vry),
+ []>;
+
+let Predicates = [HasFPUv3_DF] in {
+let isCodeGenOnly = 1 in
+def f2FMTVRL_D : F2_XZ_P<0b00011, 0b011011, "fmtvr.32.1",
+ [],
+ (outs FPR64Op:$vrz), (ins GPR:$vrx)>;
+let Constraints = "$vrZ = $vrz" in
+def f2FMTVRH_D : F2_XZ_P<0b00011, 0b011010, "fmtvrh",
+ [],
+ (outs FPR64Op:$vrz), (ins FPR64Op:$vrZ, GPR:$vrx)>;
+def f2FMTVR_D : F2_XYZ<0b00011, 0b111100, "fmtvr.64\t$vrz, $vrx, $vry",
+ (outs FPR64Op:$vrz), (ins GPR:$vrx, GPR:$vry),
+ [(set FPR64Op:$vrz, (CSKY_BITCAST_FROM_LOHI GPR:$vrx, GPR:$vry))]>;
+}
+
+// fcmp
+
+defm f2FCMPHS: F2_CXY_T<0b001100, "fcmphs">;
+defm f2FCMPLT: F2_CXY_T<0b001101, "fcmplt">;
+defm f2FCMPNE: F2_CXY_T<0b001110, "fcmpne">;
+defm f2FCMPUO: F2_CXY_T<0b001111, "fcmpuo">;
+
+defm f2FCMPHSZ: F2_CX_T<0b001000, "fcmphsz">;
+defm f2FCMPHZ : F2_CX_T<0b101010, "fcmphz">;
+defm f2FCMPLSZ: F2_CX_T<0b101011, "fcmplsz">;
+defm f2FCMPLTZ: F2_CX_T<0b001001, "fcmpltz">;
+defm f2FCMPNEZ: F2_CX_T<0b001010, "fcmpnez">;
+defm f2FCMPUOZ: F2_CX_T<0b001011, "fcmpuoz">;
+
+defm f2FMULA : F2_XYZZ_T<0b010100, "fmula",
+ TriOpFrag<(fadd (fmul node:$LHS, node:$MHS), node:$RHS)>>;
+
+defm f2FMULS : F2_XYZZ_T<0b010110, "fmuls",
+ TriOpFrag<(fsub node:$RHS, (fmul node:$LHS, node:$MHS))>>;
+
+defm f2FFMULA : F2_XYZZ_T<0b110000, "ffmula",
+ TriOpFrag<(fma node:$LHS, node:$MHS, node:$RHS)>>;
+
+defm f2FFMULS : F2_XYZZ_T<0b110001, "ffmuls",
+ TriOpFrag<(fma (fneg node:$LHS), node:$MHS, node:$RHS)>>;
+
+defm f2FFNMULA : F2_XYZZ_T<0b110010, "ffnmula",
+ TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))>>;
+
+defm f2FFNMULS : F2_XYZZ_T<0b110011, "ffnmuls",
+ TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))>>;
+
+defm f2FNMULA : F2_XYZZ_T<0b010111, "fnmula",
+ TriOpFrag<(fneg (fadd (fmul node:$LHS, node:$MHS), node:$RHS))>>;
+
+defm f2FNMULS : F2_XYZZ_T<0b010101, "fnmuls",
+ TriOpFrag<(fneg (fsub node:$RHS, (fmul node:$LHS, node:$MHS)))>>;
+
+defm f2FNMUL : F2_XYZ_T<0b010001, "fnmul",
+ BinOpFrag<(fneg (fmul node:$LHS, node:$RHS))>>;
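+
+// Note the split above: fmula/fmuls/fnmula/fnmuls are matched from separate
+// fmul and fadd/fsub nodes (unfused), while ffmula/ffmuls/ffnmula/ffnmuls are
+// matched from the single-rounding fma node.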
+
+// fcvt
+def f2FFTOS32_S : F2_XZ_P<0b01000, 0b011011, "fftoi.f32.s32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+def f2FFTOU32_S : F2_XZ_P<0b01000, 0b011010, "fftoi.f32.u32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+def f2FS32TOF_S : F2_XZ_P<0b01001, 0b011011, "fitof.s32.f32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+def f2FU32TOF_S : F2_XZ_P<0b01001, 0b011010, "fitof.u32.f32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+def f2FFTOXU32_S : F2_XZ_P<0b01000, 0b001010, "fftox.f32.u32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+def f2FFTOXS32_S : F2_XZ_P<0b01000, 0b001011, "fftox.f32.s32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+def f2FXTOFU32_S : F2_XZ_P<0b01001, 0b001010, "fxtof.u32.f32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+def f2FXTOFS32_S : F2_XZ_P<0b01001, 0b001011, "fxtof.s32.f32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+let Predicates = [HasFPUv3_DF] in {
+def f2FFTOS32_D : F2_XZ_P<0b01000, 0b011101, "fftoi.f64.s32", [], (outs FPR32Op:$vrz), (ins FPR64Op:$vrx)>;
+def f2FFTOU32_D : F2_XZ_P<0b01000, 0b011100, "fftoi.f64.u32", [], (outs FPR32Op:$vrz), (ins FPR64Op:$vrx)>;
+def f2FS32TOF_D : F2_XZ_P<0b01001, 0b011101, "fitof.s32.f64", [], (outs FPR64Op:$vrz), (ins FPR32Op:$vrx)>;
+def f2FU32TOF_D : F2_XZ_P<0b01001, 0b011100, "fitof.u32.f64", [], (outs FPR64Op:$vrz), (ins FPR32Op:$vrx)>;
+def f2FFTOXU32_D : F2_XZ_P<0b01000, 0b001100, "fftox.f64.u32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+def f2FFTOXS32_D : F2_XZ_P<0b01000, 0b001101, "fftox.f64.s32", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+def f2FXTOFU32_D : F2_XZ_P<0b01001, 0b001100, "fxtof.u32.f64", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+def f2FXTOFS32_D : F2_XZ_P<0b01001, 0b001101, "fxtof.s32.f64", [], (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+}
+
+defm f2FF32TOSI32 : F2_XZ_RM<0b00011, 0b0000, "fftoi.f32.s32", (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+defm f2FF32TOUI32 : F2_XZ_RM<0b00011, 0b0001, "fftoi.f32.u32", (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+defm f2FF32TOFI32 : F2_XZ_RM<0b01000, 0b1001, "fftofi.f32", (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+let Predicates = [HasFPUv3_DF] in {
+defm f2FF64TOSI32 : F2_XZ_RM<0b00011, 0b0010, "fftoi.f64.s32", (outs FPR32Op:$vrz), (ins FPR64Op:$vrx)>;
+defm f2FF64TOUI32 : F2_XZ_RM<0b00011, 0b0011, "fftoi.f64.u32", (outs FPR32Op:$vrz), (ins FPR64Op:$vrx)>;
+defm f2FF64TOFI32 : F2_XZ_RM<0b01000, 0b1010, "fftofi.f64", (outs FPR32Op:$vrz), (ins FPR32Op:$vrx)>;
+}
+
+def : Pat<(i32 (fp_to_sint (fround FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOSI32_RN $vrx), GPR)>, Requires<[HasFPUv3_SF]>;
+def : Pat<(i32 (fp_to_uint (fround FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOUI32_RN $vrx), GPR)>, Requires<[HasFPUv3_SF]>;
+def : Pat<(i32 (fp_to_sint (fceil FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOSI32_RPI $vrx), GPR)>, Requires<[HasFPUv3_SF]>;
+def : Pat<(i32 (fp_to_uint (fceil FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOUI32_RPI $vrx), GPR)>, Requires<[HasFPUv3_SF]>;
+def : Pat<(i32 (fp_to_sint (ffloor FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOSI32_RNI $vrx), GPR)>, Requires<[HasFPUv3_SF]>;
+def : Pat<(i32 (fp_to_uint (ffloor FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOUI32_RNI $vrx), GPR)>, Requires<[HasFPUv3_SF]>;
+def : Pat<(i32 (fp_to_sint (ftrunc FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOSI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_SF]>;
+def : Pat<(i32 (fp_to_uint (ftrunc FPR32Op:$vrx))), (COPY_TO_REGCLASS (f2FF32TOUI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_SF]>;
+def : Pat<(i32 (fp_to_sint FPR32Op:$vrx)), (COPY_TO_REGCLASS (f2FF32TOSI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_SF]>;
+def : Pat<(i32 (fp_to_uint FPR32Op:$vrx)), (COPY_TO_REGCLASS (f2FF32TOUI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_SF]>;
+
+def : Pat<(i32 (fp_to_sint (fround FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOSI32_RN $vrx), GPR)>, Requires<[HasFPUv3_DF]>;
+def : Pat<(i32 (fp_to_uint (fround FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOUI32_RN $vrx), GPR)>, Requires<[HasFPUv3_DF]>;
+def : Pat<(i32 (fp_to_sint (fceil FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOSI32_RPI $vrx), GPR)>, Requires<[HasFPUv3_DF]>;
+def : Pat<(i32 (fp_to_uint (fceil FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOUI32_RPI $vrx), GPR)>, Requires<[HasFPUv3_DF]>;
+def : Pat<(i32 (fp_to_sint (ffloor FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOSI32_RNI $vrx), GPR)>, Requires<[HasFPUv3_DF]>;
+def : Pat<(i32 (fp_to_uint (ffloor FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOUI32_RNI $vrx), GPR)>, Requires<[HasFPUv3_DF]>;
+def : Pat<(i32 (fp_to_sint (ftrunc FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOSI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_DF]>;
+def : Pat<(i32 (fp_to_uint (ftrunc FPR64Op:$vrx))), (COPY_TO_REGCLASS (f2FF64TOUI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_DF]>;
+def : Pat<(i32 (fp_to_sint FPR64Op:$vrx)), (COPY_TO_REGCLASS (f2FF64TOSI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_DF]>;
+def : Pat<(i32 (fp_to_uint FPR64Op:$vrx)), (COPY_TO_REGCLASS (f2FF64TOUI32_RZ $vrx), GPR)>, Requires<[HasFPUv3_DF]>;
+
+def : Pat<(sint_to_fp GPR:$vrx), (f2FS32TOF_S (COPY_TO_REGCLASS $vrx, FPR32))>, Requires<[HasFPUv3_SF]>;
+def : Pat<(uint_to_fp GPR:$vrx), (f2FU32TOF_S (COPY_TO_REGCLASS $vrx, FPR32))>, Requires<[HasFPUv3_SF]>;
+def : Pat<(sint_to_fp GPR:$vrx), (f2FS32TOF_D (COPY_TO_REGCLASS $vrx, FPR32))>, Requires<[HasFPUv3_DF]>;
+def : Pat<(uint_to_fp GPR:$vrx), (f2FU32TOF_D (COPY_TO_REGCLASS $vrx, FPR32))>, Requires<[HasFPUv3_DF]>;
+
+let Predicates = [HasFPUv3_DF] in {
+def f2FDTOS : F2_XZ_P<0b00011, 0b010110, "fdtos", [(set FPR32Op:$vrz, (fpround FPR64Op:$vrx))], (outs FPR32Op:$vrz),
+ (ins FPR64Op:$vrx)>;
+def f2FSTOD : F2_XZ_P<0b00011, 0b010111, "fstod", [(set FPR64Op:$vrz, (fpextend FPR32Op:$vrx))], (outs FPR64Op:$vrz),
+ (ins FPR32Op:$vrx)>;
+}
+
+// fsel
+defm f2FSEL: F2_CXYZ_T<0b111001, "fsel">;
+
+def f2FINS: F2_XZ_SET<0b00000, FPR32Op, 0b011011, "fins.32">;
+
+def : Pat<(f32 fpimm16:$imm), (COPY_TO_REGCLASS (MOVI32 (fpimm32_lo16 fpimm16:$imm)), FPR32)>,
+ Requires<[HasFPUv3_SF]>;
+def : Pat<(f32 fpimm16_16:$imm), (COPY_TO_REGCLASS (MOVIH32 (fpimm32_hi16 fpimm16_16:$imm)), FPR32)>,
+ Requires<[HasFPUv3_SF]>;
+def : Pat<(f32 fpimm:$imm), (COPY_TO_REGCLASS (ORI32 (MOVIH32 (fpimm32_hi16 fpimm:$imm)), (fpimm32_lo16 fpimm:$imm)), FPR32)>,
+ Requires<[HasFPUv3_SF]>;
+
+
+multiclass BRCond_Bin_F2<CondCode CC, string Instr, Instruction Br, Instruction MV, bit IsSelectSwap = 0> {
+ let Predicates = [HasFPUv3_SF] in
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, FPR32Op:$rs2, CC)), bb:$imm16),
+ (Br (!cast<Instruction>(Instr#_S) FPR32Op:$rs1, FPR32Op:$rs2), bb:$imm16)>;
+ let Predicates = [HasFPUv3_DF] in
+ def : Pat<(brcond (i32 (setcc FPR64Op:$rs1, FPR64Op:$rs2, CC)), bb:$imm16),
+ (Br (!cast<Instruction>(Instr#_D) FPR64Op:$rs1, FPR64Op:$rs2), bb:$imm16)>;
+
+ let Predicates = [HasFPUv3_SF] in
+ def : Pat<(i32 (setcc FPR32Op:$rs1, FPR32Op:$rs2, CC)),
+ (MV (!cast<Instruction>(Instr#_S) FPR32Op:$rs1, FPR32Op:$rs2))>;
+ let Predicates = [HasFPUv3_DF] in
+ def : Pat<(i32 (setcc FPR64Op:$rs1, FPR64Op:$rs2, CC)),
+ (MV (!cast<Instruction>(Instr#_D) FPR64Op:$rs1, FPR64Op:$rs2))>;
+
+ let Predicates = [HasFPUv3_SF] in {
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, FPR32Op:$rs2, CC)), FPR32Op:$rx, FPR32Op:$false),
+ !if(
+ !eq(IsSelectSwap, 0),
+ (f2FSEL_S (!cast<Instruction>(Instr#_S) FPR32Op:$rs1, FPR32Op:$rs2), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (!cast<Instruction>(Instr#_S) FPR32Op:$rs1, FPR32Op:$rs2), FPR32Op:$false, FPR32Op:$rx)
+ )>;
+ }
+ let Predicates = [HasFPUv3_DF] in {
+ def : Pat<(select (i32 (setcc FPR64Op:$rs1, FPR64Op:$rs2, CC)), FPR64Op:$rx, FPR64Op:$false),
+ !if(
+ !eq(IsSelectSwap, 0),
+ (f2FSEL_D (!cast<Instruction>(Instr#_D) FPR64Op:$rs1, FPR64Op:$rs2), FPR64Op:$rx, FPR64Op:$false),
+ (f2FSEL_D (!cast<Instruction>(Instr#_D) FPR64Op:$rs1, FPR64Op:$rs2), FPR64Op:$false, FPR64Op:$rx)
+ )>;
+ }
+}
+
+multiclass BRCond_Bin_SWAP_F2<CondCode CC, string Instr, Instruction Br, Instruction MV, bit IsSelectSwap = 0> {
+ let Predicates = [HasFPUv3_SF] in
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, FPR32Op:$rs2, CC)), bb:$imm16),
+ (Br (!cast<Instruction>(Instr#_S) FPR32Op:$rs2, FPR32Op:$rs1), bb:$imm16)>;
+ let Predicates = [HasFPUv3_DF] in
+ def : Pat<(brcond (i32 (setcc FPR64Op:$rs1, FPR64Op:$rs2, CC)), bb:$imm16),
+ (Br (!cast<Instruction>(Instr#_D) FPR64Op:$rs2, FPR64Op:$rs1), bb:$imm16)>;
+
+ let Predicates = [HasFPUv3_SF] in
+ def : Pat<(i32 (setcc FPR32Op:$rs1, FPR32Op:$rs2, CC)),
+ (MV (!cast<Instruction>(Instr#_S) FPR32Op:$rs2, FPR32Op:$rs1))>;
+ let Predicates = [HasFPUv3_DF] in
+ def : Pat<(i32 (setcc FPR64Op:$rs1, FPR64Op:$rs2, CC)),
+ (MV (!cast<Instruction>(Instr#_D) FPR64Op:$rs2, FPR64Op:$rs1))>;
+
+ let Predicates = [HasFPUv3_SF] in {
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, FPR32Op:$rs2, CC)), FPR32Op:$rx, FPR32Op:$false),
+ !if(
+ !eq(IsSelectSwap, 0),
+ (f2FSEL_S (!cast<Instruction>(Instr#_S) FPR32Op:$rs2, FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (!cast<Instruction>(Instr#_S) FPR32Op:$rs2, FPR32Op:$rs1), FPR32Op:$false, FPR32Op:$rx)
+ )>;
+ }
+ let Predicates = [HasFPUv3_DF] in {
+ def : Pat<(select (i32 (setcc FPR64Op:$rs1, FPR64Op:$rs2, CC)), FPR64Op:$rx, FPR64Op:$false),
+ !if(
+ !eq(IsSelectSwap, 0),
+ (f2FSEL_D (!cast<Instruction>(Instr#_D) FPR64Op:$rs2, FPR64Op:$rs1), FPR64Op:$rx, FPR64Op:$false),
+ (f2FSEL_D (!cast<Instruction>(Instr#_D) FPR64Op:$rs2, FPR64Op:$rs1), FPR64Op:$false, FPR64Op:$rx)
+ )>;
+ }
+}
+
+// !(ordered && cmp) == (unordered || !cmp): as in the FPUv2 patterns, missing
+// conditions are lowered through their complement or by swapping operands
+// (see the note on IsSelectSwap after these mappings).
+
+defm : BRCond_Bin_F2<SETUNE, "f2FCMPNE", BT32, MVC32>;
+defm : BRCond_Bin_F2<SETOEQ, "f2FCMPNE", BF32, MVCV32, 1>;
+defm : BRCond_Bin_F2<SETOGE, "f2FCMPHS", BT32, MVC32>;
+defm : BRCond_Bin_F2<SETOLT, "f2FCMPLT", BT32, MVC32>;
+defm : BRCond_Bin_F2<SETUO, "f2FCMPUO", BT32, MVC32>;
+defm : BRCond_Bin_F2<SETO, "f2FCMPUO", BF32, MVCV32, 1>;
+defm : BRCond_Bin_SWAP_F2<SETOGT, "f2FCMPLT", BT32, MVC32>;
+defm : BRCond_Bin_SWAP_F2<SETOLE, "f2FCMPHS", BT32, MVC32>;
+
+defm : BRCond_Bin_F2<SETNE, "f2FCMPNE", BT32, MVC32>;
+defm : BRCond_Bin_F2<SETEQ, "f2FCMPNE", BF32, MVCV32, 1>;
+defm : BRCond_Bin_F2<SETGE, "f2FCMPHS", BT32, MVC32>;
+defm : BRCond_Bin_F2<SETLT, "f2FCMPLT", BT32, MVC32>;
+defm : BRCond_Bin_SWAP_F2<SETGT, "f2FCMPLT", BT32, MVC32>;
+defm : BRCond_Bin_SWAP_F2<SETLE, "f2FCMPHS", BT32, MVC32>;
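+
+// The IsSelectSwap bit above only changes the fsel patterns: when a condition
+// is produced by its complementary compare (e.g. SETOEQ via fcmpne), the
+// true/false operands of f2FSEL are swapped instead of inverting the flag.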
+
+// ------
+
+let Predicates = [HasFPUv3_SF] in {
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETOGE)), bb:$imm16),
+ (BT32 (f2FCMPHSZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETOGE)),
+ (MVC32 (f2FCMPHSZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETOGE)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPHSZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>;
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETOLT)), bb:$imm16),
+ (BT32 (f2FCMPLTZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETOLT)),
+ (MVC32 (f2FCMPLTZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETOLT)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPLTZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>;
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETOLE)), bb:$imm16),
+ (BT32 (f2FCMPLSZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETOLE)),
+ (MVC32 (f2FCMPLSZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETOLE)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPLSZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>;
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETOGT)), bb:$imm16),
+ (BT32 (f2FCMPHZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETOGT)),
+ (MVC32 (f2FCMPHZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETOGT)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPHZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>;
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETUNE)), bb:$imm16),
+ (BT32 (f2FCMPNEZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETUNE)),
+ (MVC32 (f2FCMPNEZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETUNE)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPNEZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>;
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm, SETUO)), bb:$imm16),
+ (BT32 (f2FCMPUOZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm, SETUO)),
+ (MVC32 (f2FCMPUOZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm, SETUO)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPUOZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>;
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETGE)), bb:$imm16),
+ (BT32 (f2FCMPHSZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETGE)),
+ (MVC32 (f2FCMPHSZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETGE)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPHSZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>;
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETLT)), bb:$imm16),
+ (BT32 (f2FCMPLTZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETLT)),
+ (MVC32 (f2FCMPLTZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETLT)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPLTZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>;
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETLE)), bb:$imm16),
+ (BT32 (f2FCMPLSZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETLE)),
+ (MVC32 (f2FCMPLSZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETLE)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPLSZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>;
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETGT)), bb:$imm16),
+ (BT32 (f2FCMPHZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETGT)),
+ (MVC32 (f2FCMPHZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETGT)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPHZ_S FPR32Op:$rs1), FPR32Op:$rx, FPR32Op:$false)>;
+
+
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm, SETO)), bb:$imm16),
+ (BF32 (f2FCMPUOZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm, SETO)),
+ (MVCV32 (f2FCMPUOZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm, SETO)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPUOZ_S FPR32Op:$rs1), FPR32Op:$false, FPR32Op:$rx)>;
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETOEQ)), bb:$imm16),
+ (BF32 (f2FCMPNEZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETOEQ)),
+ (MVCV32 (f2FCMPNEZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETOEQ)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPNEZ_S FPR32Op:$rs1), FPR32Op:$false, FPR32Op:$rx)>;
+ def : Pat<(brcond (i32 (setcc FPR32Op:$rs1, fpimm0, SETEQ)), bb:$imm16),
+ (BF32 (f2FCMPNEZ_S FPR32Op:$rs1), bb:$imm16)>;
+ def : Pat<(i32 (setcc FPR32Op:$rs1, fpimm0, SETEQ)),
+ (MVCV32 (f2FCMPNEZ_S FPR32Op:$rs1))>;
+ def : Pat<(select (i32 (setcc FPR32Op:$rs1, fpimm0, SETEQ)), FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S (f2FCMPNEZ_S FPR32Op:$rs1), FPR32Op:$false, FPR32Op:$rx)>;
+}
+
+
+let Predicates = [HasFPUv3_SF] in
+def : Pat<(select CARRY:$ca, FPR32Op:$rx, FPR32Op:$false),
+ (f2FSEL_S CARRY:$ca, FPR32Op:$rx, FPR32Op:$false)>;
+let Predicates = [HasFPUv3_DF] in
+def : Pat<(select CARRY:$ca, FPR64Op:$rx, FPR64Op:$false),
+ (f2FSEL_D CARRY:$ca, FPR64Op:$rx, FPR64Op:$false)>;
\ No newline at end of file
diff --git a/llvm/lib/Target/CSKY/CSKYRegisterInfo.td b/llvm/lib/Target/CSKY/CSKYRegisterInfo.td
index ade5c7f795af..b7f4fc17166b 100644
--- a/llvm/lib/Target/CSKY/CSKYRegisterInfo.td
+++ b/llvm/lib/Target/CSKY/CSKYRegisterInfo.td
@@ -194,6 +194,8 @@ def FPR64 : RegisterClass<"CSKY", [f64], 64,
def sFPR64 : RegisterClass<"CSKY", [f64], 64,
(add (sequence "F%u_64", 0, 15))>;
+def sFPR64_V : RegisterClass<"CSKY", [v2f32], 32, (add sFPR64)>;
+
def FPR128 : RegisterClass<"CSKY",
[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128,
(add (sequence "F%u_128", 0, 31))>;
diff --git a/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp b/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp
index 8f61feb6506d..94b24044c27d 100644
--- a/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp
+++ b/llvm/lib/Target/CSKY/CSKYTargetMachine.cpp
@@ -23,6 +23,9 @@ using namespace llvm;
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeCSKYTarget() {
RegisterTargetMachine<CSKYTargetMachine> X(getTheCSKYTarget());
+
+ PassRegistry *Registry = PassRegistry::getPassRegistry();
+ initializeCSKYConstantIslandsPass(*Registry);
}
static std::string computeDataLayout(const Triple &TT) {
@@ -92,6 +95,7 @@ public:
}
bool addInstSelector() override;
+ void addPreEmitPass() override;
};
} // namespace
@@ -105,3 +109,7 @@ bool CSKYPassConfig::addInstSelector() {
return false;
}
+
+void CSKYPassConfig::addPreEmitPass() {
+ addPass(createCSKYConstantIslandPass());
+}
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp
index 7001de999a51..07757f03c258 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.cpp
@@ -73,6 +73,13 @@ void CSKYInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
O << getRegisterName(RegNo);
}
+void CSKYInstPrinter::printFPRRegName(raw_ostream &O, unsigned RegNo) const {
+ if (PrintBranchImmAsAddress)
+ O << getRegisterName(RegNo, CSKY::NoRegAltName);
+ else
+ O << getRegisterName(RegNo);
+}
+
void CSKYInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O,
const char *Modifier) {
@@ -201,3 +208,11 @@ const char *CSKYInstPrinter::getRegisterName(unsigned RegNo) {
return getRegisterName(RegNo, ArchRegNames ? CSKY::NoRegAltName
: CSKY::ABIRegAltName);
}
+
+void CSKYInstPrinter::printFPR(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNo);
+ assert(MO.isReg());
+
+ printFPRRegName(O, MO.getReg());
+}
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h
index f93a342ec6a3..52a1b9276762 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYInstPrinter.h
@@ -36,6 +36,8 @@ public:
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O, const char *Modifier = nullptr);
+ void printFPRRegName(raw_ostream &O, unsigned RegNo) const;
+
// Autogenerated by tblgen.
std::pair<const char *, uint64_t> getMnemonic(const MCInst *MI) override;
void printInstruction(const MCInst *MI, uint64_t Address,
@@ -60,6 +62,8 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printSPAddr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printFPR(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
static const char *getRegisterName(unsigned RegNo, unsigned AltIdx);
};
diff --git a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp
index 668247bbbd87..543f2e3d43d4 100644
--- a/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp
+++ b/llvm/lib/Target/CSKY/MCTargetDesc/CSKYMCAsmInfo.cpp
@@ -22,4 +22,6 @@ CSKYMCAsmInfo::CSKYMCAsmInfo(const Triple &TargetTriple) {
AlignmentIsInBytes = false;
SupportsDebugInformation = true;
CommentString = "#";
+
+ ExceptionsType = ExceptionHandling::DwarfCFI;
}
diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index d131cf896834..15eba89eeb55 100644
--- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -211,8 +211,7 @@ struct HexagonOperand : public MCParsedAsmOperand {
struct ImmTy Imm;
};
- HexagonOperand(KindTy K, MCContext &Context)
- : MCParsedAsmOperand(), Kind(K), Context(Context) {}
+ HexagonOperand(KindTy K, MCContext &Context) : Kind(K), Context(Context) {}
public:
HexagonOperand(const HexagonOperand &o)
diff --git a/llvm/lib/Target/Hexagon/BitTracker.cpp b/llvm/lib/Target/Hexagon/BitTracker.cpp
index 685bafd785df..17adf32750db 100644
--- a/llvm/lib/Target/Hexagon/BitTracker.cpp
+++ b/llvm/lib/Target/Hexagon/BitTracker.cpp
@@ -940,8 +940,8 @@ void BT::visitBranchesFrom(const MachineInstr &BI) {
// If evaluated successfully add the targets to the cumulative list.
if (Trace) {
dbgs() << " adding targets:";
- for (unsigned i = 0, n = BTs.size(); i < n; ++i)
- dbgs() << " " << printMBBReference(*BTs[i]);
+ for (const MachineBasicBlock *BT : BTs)
+ dbgs() << " " << printMBBReference(*BT);
if (FallsThrough)
dbgs() << "\n falls through\n";
else
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 428d25da6dbc..b2a842233bb8 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -3260,13 +3260,12 @@ bool HexagonLoopRescheduling::processLoop(LoopCand &C) {
dbgs() << "Group[" << i << "] inp: "
<< printReg(G.Inp.Reg, HRI, G.Inp.Sub)
<< " out: " << printReg(G.Out.Reg, HRI, G.Out.Sub) << "\n";
- for (unsigned j = 0, m = G.Ins.size(); j < m; ++j)
- dbgs() << " " << *G.Ins[j];
+ for (const MachineInstr *MI : G.Ins)
+ dbgs() << " " << *MI;
}
});
- for (unsigned i = 0, n = Groups.size(); i < n; ++i) {
- InstrGroup &G = Groups[i];
+ for (InstrGroup &G : Groups) {
if (!isShuffleOf(G.Out.Reg, G.Inp.Reg))
continue;
auto LoopInpEq = [G] (const PhiInfo &P) -> bool {
diff --git a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
index 1938a5c259da..8e014b395286 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -493,6 +493,11 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
RegisterCell RC = eADD(rc(1), lo(M, W0));
return rr0(RC, Outputs);
}
+ case M2_mnaci: {
+ RegisterCell M = eMLS(rc(2), rc(3));
+ RegisterCell RC = eSUB(rc(1), lo(M, W0));
+ return rr0(RC, Outputs);
+ }
case M2_mpysmi: {
RegisterCell M = eMLS(rc(1), eIMM(im(2), W0));
return rr0(lo(M, 32), Outputs);
diff --git a/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index b456cf139c55..a31ad45f4bb0 100644
--- a/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -118,13 +118,10 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
return false;
// Loop over all of the basic blocks.
- for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
- MBBb != MBBe; ++MBBb) {
- MachineBasicBlock *MBB = &*MBBb;
-
+ for (MachineBasicBlock &MBB : Fn) {
// Traverse the basic block.
- MachineBasicBlock::iterator MII = MBB->getFirstTerminator();
- if (MII != MBB->end()) {
+ MachineBasicBlock::iterator MII = MBB.getFirstTerminator();
+ if (MII != MBB.end()) {
MachineInstr &MI = *MII;
int Opc = MI.getOpcode();
if (IsConditionalBranch(Opc)) {
@@ -155,17 +152,17 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
// Remove BB2
// BB3: ...
// BB4: ...
- unsigned NumSuccs = MBB->succ_size();
- MachineBasicBlock::succ_iterator SI = MBB->succ_begin();
+ unsigned NumSuccs = MBB.succ_size();
+ MachineBasicBlock::succ_iterator SI = MBB.succ_begin();
MachineBasicBlock* FirstSucc = *SI;
MachineBasicBlock* SecondSucc = *(++SI);
MachineBasicBlock* LayoutSucc = nullptr;
MachineBasicBlock* JumpAroundTarget = nullptr;
- if (MBB->isLayoutSuccessor(FirstSucc)) {
+ if (MBB.isLayoutSuccessor(FirstSucc)) {
LayoutSucc = FirstSucc;
JumpAroundTarget = SecondSucc;
- } else if (MBB->isLayoutSuccessor(SecondSucc)) {
+ } else if (MBB.isLayoutSuccessor(SecondSucc)) {
LayoutSucc = SecondSucc;
JumpAroundTarget = FirstSucc;
} else {
@@ -201,7 +198,7 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
if (case1 || case2) {
InvertAndChangeJumpTarget(MI, UncondTarget);
- MBB->replaceSuccessor(JumpAroundTarget, UncondTarget);
+ MBB.replaceSuccessor(JumpAroundTarget, UncondTarget);
// Remove the unconditional branch in LayoutSucc.
LayoutSucc->erase(LayoutSucc->begin());
diff --git a/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/llvm/lib/Target/Hexagon/HexagonCallingConv.td
index 93e17e608dd1..cc41b569e490 100644
--- a/llvm/lib/Target/Hexagon/HexagonCallingConv.td
+++ b/llvm/lib/Target/Hexagon/HexagonCallingConv.td
@@ -126,16 +126,16 @@ def CC_Hexagon_HVX: CallingConv<[
// HVX 128-byte mode
CCIfHvx128<
- CCIfType<[v32i32,v64i16,v128i8],
+ CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16],
CCAssignToReg<[V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15]>>>,
CCIfHvx128<
- CCIfType<[v64i32,v128i16,v256i8],
+ CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16],
CCAssignToReg<[W0,W1,W2,W3,W4,W5,W6,W7]>>>,
CCIfHvx128<
- CCIfType<[v32i32,v64i16,v128i8],
+ CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16],
CCAssignToStack<128,128>>>,
CCIfHvx128<
- CCIfType<[v64i32,v128i16,v256i8],
+ CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16],
CCAssignToStack<256,128>>>,
CCDelegateTo<CC_Hexagon>
@@ -152,10 +152,10 @@ def RetCC_Hexagon_HVX: CallingConv<[
// HVX 128-byte mode
CCIfHvx128<
- CCIfType<[v32i32,v64i16,v128i8],
+ CCIfType<[v32i32,v64i16,v128i8,v32f32,v64f16],
CCAssignToReg<[V0]>>>,
CCIfHvx128<
- CCIfType<[v64i32,v128i16,v256i8],
+ CCIfType<[v64i32,v128i16,v256i8,v64f32,v128f16],
CCAssignToReg<[W0]>>>,
CCDelegateTo<RetCC_Hexagon>
diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index a53efeb96961..fc5e05d8c9a0 100644
--- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -290,13 +290,11 @@ namespace {
raw_ostream &operator<< (raw_ostream &OS,
const NodeToUsesMap &M) LLVM_ATTRIBUTE_UNUSED;
raw_ostream &operator<< (raw_ostream &OS, const NodeToUsesMap &M){
- using const_iterator = NodeToUsesMap::const_iterator;
-
- for (const_iterator I = M.begin(), E = M.end(); I != E; ++I) {
- const UseSet &Us = I->second;
- OS << I->first << " -> #" << Us.size() << '{';
- for (UseSet::const_iterator J = Us.begin(), F = Us.end(); J != F; ++J) {
- User *R = (*J)->getUser();
+ for (const auto &I : M) {
+ const UseSet &Us = I.second;
+ OS << I.first << " -> #" << Us.size() << '{';
+ for (const Use *U : Us) {
+ User *R = U->getUser();
if (R->hasName())
OS << ' ' << R->getName();
else
@@ -420,15 +418,12 @@ void HexagonCommonGEP::collect() {
// instruction that uses another GEP instruction as the base pointer, the
// gep node for the base pointer should already exist.
ValueToNodeMap NM;
- for (ValueVect::iterator I = BO.begin(), E = BO.end(); I != E; ++I) {
- BasicBlock *B = cast<BasicBlock>(*I);
- for (BasicBlock::iterator J = B->begin(), F = B->end(); J != F; ++J) {
- if (!isa<GetElementPtrInst>(J))
- continue;
- GetElementPtrInst *GepI = cast<GetElementPtrInst>(J);
- if (isHandledGepForm(GepI))
- processGepInst(GepI, NM);
- }
+ for (Value *I : BO) {
+ BasicBlock *B = cast<BasicBlock>(I);
+ for (Instruction &J : *B)
+ if (auto *GepI = dyn_cast<GetElementPtrInst>(&J))
+ if (isHandledGepForm(GepI))
+ processGepInst(GepI, NM);
}
LLVM_DEBUG(dbgs() << "Gep nodes after initial collection:\n" << Nodes);
@@ -436,17 +431,14 @@ void HexagonCommonGEP::collect() {
static void invert_find_roots(const NodeVect &Nodes, NodeChildrenMap &NCM,
NodeVect &Roots) {
- using const_iterator = NodeVect::const_iterator;
-
- for (const_iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
- GepNode *N = *I;
- if (N->Flags & GepNode::Root) {
- Roots.push_back(N);
- continue;
- }
- GepNode *PN = N->Parent;
- NCM[PN].push_back(N);
+ for (GepNode *N : Nodes) {
+ if (N->Flags & GepNode::Root) {
+ Roots.push_back(N);
+ continue;
}
+ GepNode *PN = N->Parent;
+ NCM[PN].push_back(N);
+ }
}
static void nodes_for_root(GepNode *Root, NodeChildrenMap &NCM,
@@ -546,8 +538,7 @@ void HexagonCommonGEP::common() {
using NodeSetMap = std::map<unsigned, NodeSet>;
NodeSetMap MaybeEq;
- for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
- GepNode *N = *I;
+ for (GepNode *N : Nodes) {
unsigned H = node_hash(N);
MaybeEq[H].insert(N);
}
@@ -556,9 +547,8 @@ void HexagonCommonGEP::common() {
// one for equality and the other for non-equality.
NodeSymRel EqRel; // Equality relation (as set of equivalence classes).
NodePairSet Eq, Ne; // Caches.
- for (NodeSetMap::iterator I = MaybeEq.begin(), E = MaybeEq.end();
- I != E; ++I) {
- NodeSet &S = I->second;
+ for (auto &I : MaybeEq) {
+ NodeSet &S = I.second;
for (NodeSet::iterator NI = S.begin(), NE = S.end(); NI != NE; ++NI) {
GepNode *N = *NI;
// If node already has a class, then the class must have been created
@@ -612,8 +602,7 @@ void HexagonCommonGEP::common() {
// Update the min element's flags, and user list.
uint32_t Flags = 0;
UseSet &MinUs = Uses[Min];
- for (NodeSet::iterator J = S.begin(), F = S.end(); J != F; ++J) {
- GepNode *N = *J;
+ for (GepNode *N : S) {
uint32_t NF = N->Flags;
// If N is used, append all original values of N to the list of
// original values of Min.
@@ -633,8 +622,7 @@ void HexagonCommonGEP::common() {
// selected (minimum) node from the corresponding equivalence class.
// If a given parent does not have an equivalence class, leave it
// unchanged (it means that it's the only element in its class).
- for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
- GepNode *N = *I;
+ for (GepNode *N : Nodes) {
if (N->Flags & GepNode::Root)
continue;
const NodeSet *PC = node_class(N->Parent, EqRel);
@@ -652,8 +640,7 @@ void HexagonCommonGEP::common() {
// Finally, erase the nodes that are no longer used.
NodeSet Erase;
- for (NodeVect::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
- GepNode *N = *I;
+ for (GepNode *N : Nodes) {
const NodeSet *PC = node_class(N, EqRel);
if (!PC)
continue;
@@ -663,7 +650,7 @@ void HexagonCommonGEP::common() {
if (N == F->second)
continue;
// Node for removal.
- Erase.insert(*I);
+ Erase.insert(N);
}
erase_if(Nodes, in_set(Erase));
@@ -775,8 +762,7 @@ BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node,
NodeToUsesMap::iterator UF = Uses.find(Node);
assert(UF != Uses.end() && "Used node with no use information");
UseSet &Us = UF->second;
- for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) {
- Use *U = *I;
+ for (Use *U : Us) {
User *R = U->getUser();
if (!isa<Instruction>(R))
continue;
@@ -790,8 +776,7 @@ BasicBlock *HexagonCommonGEP::recalculatePlacement(GepNode *Node,
NodeChildrenMap::iterator CF = NCM.find(Node);
if (CF != NCM.end()) {
NodeVect &Cs = CF->second;
- for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) {
- GepNode *CN = *I;
+ for (GepNode *CN : Cs) {
NodeToValueMap::iterator LF = Loc.find(CN);
// If the child is only used in GEP instructions (i.e. is not used in
// non-GEP instructions), the nearest dominator computed for it may
@@ -831,8 +816,8 @@ BasicBlock *HexagonCommonGEP::recalculatePlacementRec(GepNode *Node,
NodeChildrenMap::iterator CF = NCM.find(Node);
if (CF != NCM.end()) {
NodeVect &Cs = CF->second;
- for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I)
- recalculatePlacementRec(*I, NCM, Loc);
+ for (GepNode *C : Cs)
+ recalculatePlacementRec(C, NCM, Loc);
}
BasicBlock *LB = recalculatePlacement(Node, NCM, Loc);
LLVM_DEBUG(dbgs() << "LocRec end for node:" << Node << '\n');
@@ -921,8 +906,8 @@ BasicBlock *HexagonCommonGEP::adjustForInvariance(GepNode *Node,
NodeChildrenMap::iterator CF = NCM.find(Node);
if (CF != NCM.end()) {
NodeVect &Cs = CF->second;
- for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I)
- adjustForInvariance(*I, NCM, Loc);
+ for (GepNode *C : Cs)
+ adjustForInvariance(C, NCM, Loc);
}
return LocB;
}
@@ -938,10 +923,9 @@ namespace {
raw_ostream &operator<< (raw_ostream &OS,
const LocationAsBlock &Loc) LLVM_ATTRIBUTE_UNUSED ;
raw_ostream &operator<< (raw_ostream &OS, const LocationAsBlock &Loc) {
- for (NodeToValueMap::const_iterator I = Loc.Map.begin(), E = Loc.Map.end();
- I != E; ++I) {
- OS << I->first << " -> ";
- if (BasicBlock *B = cast_or_null<BasicBlock>(I->second))
+ for (const auto &I : Loc.Map) {
+ OS << I.first << " -> ";
+ if (BasicBlock *B = cast_or_null<BasicBlock>(I.second))
OS << B->getName() << '(' << B << ')';
else
OS << "<null-block>";
@@ -1016,8 +1000,7 @@ void HexagonCommonGEP::separateConstantChains(GepNode *Node,
// Collect all used nodes together with the uses from loads and stores,
// where the GEP node could be folded into the load/store instruction.
NodeToUsesMap FNs; // Foldable nodes.
- for (NodeSet::iterator I = Ns.begin(), E = Ns.end(); I != E; ++I) {
- GepNode *N = *I;
+ for (GepNode *N : Ns) {
if (!(N->Flags & GepNode::Used))
continue;
NodeToUsesMap::iterator UF = Uses.find(N);
@@ -1025,8 +1008,7 @@ void HexagonCommonGEP::separateConstantChains(GepNode *Node,
UseSet &Us = UF->second;
// Loads/stores that use the node N.
UseSet LSs;
- for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J) {
- Use *U = *J;
+ for (Use *U : Us) {
User *R = U->getUser();
// We're interested in uses that provide the address. It can happen
// that the value may also be provided via GEP, but we won't handle
@@ -1051,11 +1033,11 @@ void HexagonCommonGEP::separateConstantChains(GepNode *Node,
LLVM_DEBUG(dbgs() << "Nodes with foldable users:\n" << FNs);
- for (NodeToUsesMap::iterator I = FNs.begin(), E = FNs.end(); I != E; ++I) {
- GepNode *N = I->first;
- UseSet &Us = I->second;
- for (UseSet::iterator J = Us.begin(), F = Us.end(); J != F; ++J)
- separateChainForNode(N, *J, Loc);
+ for (auto &FN : FNs) {
+ GepNode *N = FN.first;
+ UseSet &Us = FN.second;
+ for (Use *U : Us)
+ separateChainForNode(N, U, Loc);
}
}
@@ -1068,21 +1050,21 @@ void HexagonCommonGEP::computeNodePlacement(NodeToValueMap &Loc) {
// Compute the initial placement determined by the users' locations, and
// the locations of the child nodes.
- for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
- recalculatePlacementRec(*I, NCM, Loc);
+ for (GepNode *Root : Roots)
+ recalculatePlacementRec(Root, NCM, Loc);
LLVM_DEBUG(dbgs() << "Initial node placement:\n" << LocationAsBlock(Loc));
if (OptEnableInv) {
- for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
- adjustForInvariance(*I, NCM, Loc);
+ for (GepNode *Root : Roots)
+ adjustForInvariance(Root, NCM, Loc);
LLVM_DEBUG(dbgs() << "Node placement after adjustment for invariance:\n"
<< LocationAsBlock(Loc));
}
if (OptEnableConst) {
- for (NodeVect::iterator I = Roots.begin(), E = Roots.end(); I != E; ++I)
- separateConstantChains(*I, NCM, Loc);
+ for (GepNode *Root : Roots)
+ separateConstantChains(Root, NCM, Loc);
}
LLVM_DEBUG(dbgs() << "Node use information:\n" << Uses);
@@ -1153,8 +1135,8 @@ void HexagonCommonGEP::getAllUsersForNode(GepNode *Node, ValueVect &Values,
NodeToUsesMap::iterator UF = Uses.find(N);
assert(UF != Uses.end() && "No use information for used node");
UseSet &Us = UF->second;
- for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I)
- Values.push_back((*I)->getUser());
+ for (const auto &U : Us)
+ Values.push_back(U->getUser());
}
NodeChildrenMap::iterator CF = NCM.find(N);
if (CF != NCM.end()) {
@@ -1223,8 +1205,7 @@ void HexagonCommonGEP::materialize(NodeToValueMap &Loc) {
// to the Roots list.
if (LastCN > 0) {
NodeVect &Cs = NCM[Last];
- for (NodeVect::iterator I = Cs.begin(), E = Cs.end(); I != E; ++I) {
- GepNode *CN = *I;
+ for (GepNode *CN : Cs) {
CN->Flags &= ~GepNode::Internal;
CN->Flags |= GepNode::Root;
CN->BaseVal = NewInst;
@@ -1238,10 +1219,8 @@ void HexagonCommonGEP::materialize(NodeToValueMap &Loc) {
NodeToUsesMap::iterator UF = Uses.find(Last);
assert(UF != Uses.end() && "No use information found");
UseSet &Us = UF->second;
- for (UseSet::iterator I = Us.begin(), E = Us.end(); I != E; ++I) {
- Use *U = *I;
+ for (Use *U : Us)
U->set(NewInst);
- }
}
}
}
@@ -1261,8 +1240,8 @@ void HexagonCommonGEP::removeDeadCode() {
ValueVect Ins;
for (Instruction &I : llvm::reverse(*B))
Ins.push_back(&I);
- for (ValueVect::iterator I = Ins.begin(), E = Ins.end(); I != E; ++I) {
- Instruction *In = cast<Instruction>(*I);
+ for (Value *I : Ins) {
+ Instruction *In = cast<Instruction>(I);
if (isInstructionTriviallyDead(In))
In->eraseFromParent();
}
diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
index d3fcdb6ae9a8..d8af35cbf3a8 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -229,7 +229,7 @@ namespace {
private:
struct Register {
Register() = default;
- Register(unsigned R, unsigned S) : Reg(R), Sub(S) {}
+ Register(llvm::Register R, unsigned S) : Reg(R), Sub(S) {}
Register(const MachineOperand &Op)
: Reg(Op.getReg()), Sub(Op.getSubReg()) {}
Register &operator=(const MachineOperand &Op) {
@@ -1573,7 +1573,7 @@ HCE::Register HCE::insertInitializer(Loc DefL, const ExtenderInit &ExtI) {
// No compounds are available. It is not clear whether we should
// even process such extenders where the initializer cannot be
// a single instruction, but do it for now.
- unsigned TmpR = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+ llvm::Register TmpR = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
BuildMI(MBB, At, dl, HII->get(Hexagon::S2_asl_i_r), TmpR)
.add(MachineOperand(Ex.Rs))
.addImm(Ex.S);
diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
index daf311fc49d4..105bf2811a20 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -125,8 +125,8 @@ namespace {
};
LatticeCell() : Kind(Top), Size(0), IsSpecial(false) {
- for (unsigned i = 0; i < MaxCellSize; ++i)
- Values[i] = nullptr;
+ for (const Constant *&Value : Values)
+ Value = nullptr;
}
bool meet(const LatticeCell &L);
@@ -1029,8 +1029,8 @@ bool MachineConstPropagator::rewrite(MachineFunction &MF) {
ToRemove.push_back(const_cast<MachineBasicBlock*>(SB));
Targets.remove(SB);
}
- for (unsigned i = 0, n = ToRemove.size(); i < n; ++i)
- removeCFGEdge(B, ToRemove[i]);
+ for (MachineBasicBlock *MBB : ToRemove)
+ removeCFGEdge(B, MBB);
// If there are any blocks left in the computed targets, it means that
// we think that the block could go somewhere, but the CFG does not.
// This could legitimately happen in blocks that have non-returning
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 03b0f75b2dc1..2ee7f1325df9 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -70,9 +70,7 @@ class HexagonCopyToCombine : public MachineFunctionPass {
public:
static char ID;
- HexagonCopyToCombine() : MachineFunctionPass(ID) {
- initializeHexagonCopyToCombinePass(*PassRegistry::getPassRegistry());
- }
+ HexagonCopyToCombine() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
MachineFunctionPass::getAnalysisUsage(AU);
diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index 9a3feb5b6af1..2207925ceeba 100644
--- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -612,8 +612,8 @@ bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B,
// Simply keep a list of children of B, and traverse that list.
using DTNodeVectType = SmallVector<MachineDomTreeNode *, 4>;
DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N));
- for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) {
- MachineBasicBlock *SB = (*I)->getBlock();
+ for (auto &I : Cn) {
+ MachineBasicBlock *SB = I->getBlock();
if (!Deleted.count(SB))
Changed |= visitBlock(SB, L);
}
@@ -648,8 +648,8 @@ bool HexagonEarlyIfConversion::visitLoop(MachineLoop *L) {
<< "\n");
bool Changed = false;
if (L) {
- for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
- Changed |= visitLoop(*I);
+ for (MachineLoop *I : *L)
+ Changed |= visitLoop(I);
}
MachineBasicBlock *EntryB = GraphTraits<MachineFunction*>::getEntryNode(MFN);
@@ -964,8 +964,8 @@ void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) {
using DTNodeVectType = SmallVector<MachineDomTreeNode *, 4>;
DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N));
- for (DTNodeVectType::iterator I = Cn.begin(), E = Cn.end(); I != E; ++I) {
- MachineBasicBlock *SB = (*I)->getBlock();
+ for (auto &I : Cn) {
+ MachineBasicBlock *SB = I->getBlock();
MDT->changeImmediateDominator(SB, IDB);
}
}
@@ -973,8 +973,8 @@ void HexagonEarlyIfConversion::removeBlock(MachineBasicBlock *B) {
while (!B->succ_empty())
B->removeSuccessor(B->succ_begin());
- for (auto I = B->pred_begin(), E = B->pred_end(); I != E; ++I)
- (*I)->removeSuccessor(B, true);
+ for (MachineBasicBlock *Pred : B->predecessors())
+ Pred->removeSuccessor(B, true);
Deleted.insert(B);
MDT->eraseNode(B);
@@ -1064,8 +1064,8 @@ bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) {
Deleted.clear();
bool Changed = false;
- for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I)
- Changed |= visitLoop(*I);
+ for (MachineLoop *L : *MLI)
+ Changed |= visitLoop(L);
Changed |= visitLoop(nullptr);
return Changed;
diff --git a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index c444cf557c21..2693940bb1e9 100644
--- a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -1106,8 +1106,7 @@ bool HexagonExpandCondsets::isIntReg(RegisterRef RR, unsigned &BW) {
}
bool HexagonExpandCondsets::isIntraBlocks(LiveInterval &LI) {
- for (LiveInterval::iterator I = LI.begin(), E = LI.end(); I != E; ++I) {
- LiveRange::Segment &LR = *I;
+ for (LiveRange::Segment &LR : LI) {
// Range must start at a register...
if (!LR.start.isRegister())
return false;
@@ -1160,16 +1159,16 @@ bool HexagonExpandCondsets::coalesceRegisters(RegisterRef R1, RegisterRef R2) {
// Move all live segments from L2 to L1.
using ValueInfoMap = DenseMap<VNInfo *, VNInfo *>;
ValueInfoMap VM;
- for (LiveInterval::iterator I = L2.begin(), E = L2.end(); I != E; ++I) {
- VNInfo *NewVN, *OldVN = I->valno;
+ for (LiveRange::Segment &I : L2) {
+ VNInfo *NewVN, *OldVN = I.valno;
ValueInfoMap::iterator F = VM.find(OldVN);
if (F == VM.end()) {
- NewVN = L1.getNextValue(I->valno->def, LIS->getVNInfoAllocator());
+ NewVN = L1.getNextValue(I.valno->def, LIS->getVNInfoAllocator());
VM.insert(std::make_pair(OldVN, NewVN));
} else {
NewVN = F->second;
}
- L1.addSegment(LiveRange::Segment(I->start, I->end, NewVN));
+ L1.addSegment(LiveRange::Segment(I.start, I.end, NewVN));
}
while (!L2.empty())
L2.removeSegment(*L2.begin());
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 12ceac545e9d..989a98571434 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -416,8 +416,8 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
UnsignedMap RPO;
RPOTType RPOT(&MF);
unsigned RPON = 0;
- for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I)
- RPO[(*I)->getNumber()] = RPON++;
+ for (auto &I : RPOT)
+ RPO[I->getNumber()] = RPON++;
// Don't process functions that have loops, at least for now. Placement
// of prolog and epilog must take loop structure into account. For simpli-
@@ -1410,7 +1410,7 @@ bool HexagonFrameLowering::insertCSRSpillsInBlock(MachineBasicBlock &MBB,
}
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
// Add live in registers. We treat eh_return callee saved register r0 - r3
// specially. They are not really callee saved registers as they are not
// supposed to be killed.
@@ -1479,7 +1479,7 @@ bool HexagonFrameLowering::insertCSRRestoresInBlock(MachineBasicBlock &MBB,
}
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
const TargetRegisterClass *RC = HRI.getMinimalPhysRegClass(Reg);
int FI = I.getFrameIdx();
HII.loadRegFromStackSlot(MBB, MI, Reg, FI, RC, &HRI);
@@ -1620,7 +1620,7 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
// sub-registers to SRegs.
LLVM_DEBUG(dbgs() << "Initial CS registers: {");
for (const CalleeSavedInfo &I : CSI) {
- unsigned R = I.getReg();
+ Register R = I.getReg();
LLVM_DEBUG(dbgs() << ' ' << printReg(R, TRI));
for (MCSubRegIterator SR(R, TRI, true); SR.isValid(); ++SR)
SRegs[*SR] = true;
@@ -2635,7 +2635,7 @@ bool HexagonFrameLowering::shouldInlineCSR(const MachineFunction &MF,
// a contiguous block starting from D8.
BitVector Regs(Hexagon::NUM_TARGET_REGS);
for (const CalleeSavedInfo &I : CSI) {
- unsigned R = I.getReg();
+ Register R = I.getReg();
if (!Hexagon::DoubleRegsRegClass.contains(R))
return true;
Regs[R] = true;
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index 85230cac9d7c..0bb1658e7698 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -583,14 +583,12 @@ namespace {
char HexagonGenInsert::ID = 0;
void HexagonGenInsert::dump_map() const {
- using iterator = IFMapType::const_iterator;
-
- for (iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
- dbgs() << " " << printReg(I->first, HRI) << ":\n";
- const IFListType &LL = I->second;
- for (unsigned i = 0, n = LL.size(); i < n; ++i)
- dbgs() << " " << PrintIFR(LL[i].first, HRI) << ", "
- << PrintRegSet(LL[i].second, HRI) << '\n';
+ for (const auto &I : IFMap) {
+ dbgs() << " " << printReg(I.first, HRI) << ":\n";
+ const IFListType &LL = I.second;
+ for (const auto &J : LL)
+ dbgs() << " " << PrintIFR(J.first, HRI) << ", "
+ << PrintRegSet(J.second, HRI) << '\n';
}
}
@@ -627,8 +625,8 @@ void HexagonGenInsert::buildOrderingBT(RegisterOrdering &RB,
using SortableVectorType = std::vector<unsigned>;
SortableVectorType VRs;
- for (RegisterOrdering::iterator I = RB.begin(), E = RB.end(); I != E; ++I)
- VRs.push_back(I->first);
+ for (auto &I : RB)
+ VRs.push_back(I.first);
llvm::sort(VRs, LexCmp);
// Transfer the results to the outgoing register ordering.
for (unsigned i = 0, n = VRs.size(); i < n; ++i)
@@ -853,20 +851,18 @@ bool HexagonGenInsert::findRecordInsertForms(unsigned VR,
if (isDebug()) {
dbgs() << "Prefixes matching register " << printReg(VR, HRI) << "\n";
- for (LRSMapType::iterator I = LM.begin(), E = LM.end(); I != E; ++I) {
- dbgs() << " L=" << I->first << ':';
- const RSListType &LL = I->second;
- for (unsigned i = 0, n = LL.size(); i < n; ++i)
- dbgs() << " (" << printReg(LL[i].first, HRI) << ",@"
- << LL[i].second << ')';
+ for (const auto &I : LM) {
+ dbgs() << " L=" << I.first << ':';
+ const RSListType &LL = I.second;
+ for (const auto &J : LL)
+ dbgs() << " (" << printReg(J.first, HRI) << ",@" << J.second << ')';
dbgs() << '\n';
}
}
bool Recorded = false;
- for (iterator I = AVs.begin(), E = AVs.end(); I != E; ++I) {
- unsigned SrcR = *I;
+ for (unsigned SrcR : AVs) {
int FDi = -1, LDi = -1; // First/last different bit.
const BitTracker::RegisterCell &AC = CMS->lookup(SrcR);
uint16_t AW = AC.width();
@@ -888,8 +884,8 @@ bool HexagonGenInsert::findRecordInsertForms(unsigned VR,
if (F == LM.end())
continue;
RSListType &LL = F->second;
- for (unsigned i = 0, n = LL.size(); i < n; ++i) {
- uint16_t S = LL[i].second;
+ for (const auto &I : LL) {
+ uint16_t S = I.second;
// MinL is the minimum length of the prefix. Any length above MinL
// allows some flexibility as to where the prefix can start:
// given the extra length EL=L-MinL, the prefix must start between
@@ -900,7 +896,7 @@ bool HexagonGenInsert::findRecordInsertForms(unsigned VR,
uint16_t LowS = (EL < FD) ? FD-EL : 0;
if (S < LowS) // Starts too early.
continue;
- unsigned InsR = LL[i].first;
+ unsigned InsR = I.first;
if (!isValidInsertForm(VR, SrcR, InsR, L, S))
continue;
if (isDebug()) {
@@ -1029,10 +1025,10 @@ void HexagonGenInsert::findRemovableRegisters(unsigned VR, IFRecord IF,
}
void HexagonGenInsert::computeRemovableRegisters() {
- for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
- IFListType &LL = I->second;
- for (unsigned i = 0, n = LL.size(); i < n; ++i)
- findRemovableRegisters(I->first, LL[i].first, LL[i].second);
+ for (auto &I : IFMap) {
+ IFListType &LL = I.second;
+ for (auto &J : LL)
+ findRemovableRegisters(I.first, J.first, J.second);
}
}
@@ -1064,8 +1060,8 @@ void HexagonGenInsert::pruneCoveredSets(unsigned VR) {
MachineInstr *DefVR = MRI->getVRegDef(VR);
bool DefEx = HII->isConstExtended(*DefVR);
bool HasNE = false;
- for (unsigned i = 0, n = LL.size(); i < n; ++i) {
- if (LL[i].second.empty())
+ for (const auto &I : LL) {
+ if (I.second.empty())
continue;
HasNE = true;
break;
@@ -1172,8 +1168,8 @@ void HexagonGenInsert::pruneCandidates() {
// selection method.
// First, remove candidates whose potentially removable set is a subset
// of another candidate's set.
- for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I)
- pruneCoveredSets(I->first);
+ for (const auto &I : IFMap)
+ pruneCoveredSets(I.first);
UnsignedMap RPO;
@@ -1181,18 +1177,18 @@ void HexagonGenInsert::pruneCandidates() {
RPOTType RPOT(MFN);
unsigned RPON = 0;
- for (RPOTType::rpo_iterator I = RPOT.begin(), E = RPOT.end(); I != E; ++I)
- RPO[(*I)->getNumber()] = RPON++;
+ for (const auto &I : RPOT)
+ RPO[I->getNumber()] = RPON++;
PairMapType Memo; // Memoization map for distance calculation.
// Remove candidates that would use registers defined too far away.
- for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I)
- pruneUsesTooFar(I->first, RPO, Memo);
+ for (const auto &I : IFMap)
+ pruneUsesTooFar(I.first, RPO, Memo);
pruneEmptyLists();
- for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I)
- pruneRegCopies(I->first);
+ for (const auto &I : IFMap)
+ pruneRegCopies(I.first);
}
namespace {
@@ -1277,8 +1273,8 @@ void HexagonGenInsert::selectCandidates() {
for (IFMapType::iterator I = IFMap.begin(); I != End; ++I) {
const IFListType &LL = I->second;
RegisterSet TT;
- for (unsigned i = 0, n = LL.size(); i < n; ++i)
- TT.insert(LL[i].second);
+ for (const auto &J : LL)
+ TT.insert(J.second);
for (unsigned R = TT.find_first(); R; R = TT.find_next(R))
RemC[R]++;
AllRMs.insert(TT);
@@ -1384,8 +1380,8 @@ bool HexagonGenInsert::generateInserts() {
// Create a new register for each one from IFMap, and store them in the
// map.
UnsignedMap RegMap;
- for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
- unsigned VR = I->first;
+ for (auto &I : IFMap) {
+ unsigned VR = I.first;
const TargetRegisterClass *RC = MRI->getRegClass(VR);
Register NewVR = MRI->createVirtualRegister(RC);
RegMap[VR] = NewVR;
@@ -1394,15 +1390,15 @@ bool HexagonGenInsert::generateInserts() {
// We can generate the "insert" instructions using potentially stale re-
// gisters: SrcR and InsR for a given VR may be among other registers that
// are also replaced. This is fine, we will do the mass "rauw" a bit later.
- for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
- MachineInstr *MI = MRI->getVRegDef(I->first);
+ for (auto &I : IFMap) {
+ MachineInstr *MI = MRI->getVRegDef(I.first);
MachineBasicBlock &B = *MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- unsigned NewR = RegMap[I->first];
+ unsigned NewR = RegMap[I.first];
bool R32 = MRI->getRegClass(NewR) == &Hexagon::IntRegsRegClass;
const MCInstrDesc &D = R32 ? HII->get(Hexagon::S2_insert)
: HII->get(Hexagon::S2_insertp);
- IFRecord IF = I->second[0].first;
+ IFRecord IF = I.second[0].first;
unsigned Wdh = IF.Wdh, Off = IF.Off;
unsigned InsS = 0;
if (R32 && MRI->getRegClass(IF.InsR) == &Hexagon::DoubleRegsRegClass) {
@@ -1428,9 +1424,9 @@ bool HexagonGenInsert::generateInserts() {
MRI->clearKillFlags(IF.InsR);
}
- for (IFMapType::iterator I = IFMap.begin(), E = IFMap.end(); I != E; ++I) {
- MachineInstr *DefI = MRI->getVRegDef(I->first);
- MRI->replaceRegWith(I->first, RegMap[I->first]);
+ for (const auto &I : IFMap) {
+ MachineInstr *DefI = MRI->getVRegDef(I.first);
+ MRI->replaceRegWith(I.first, RegMap[I.first]);
DefI->eraseFromParent();
}
@@ -1523,9 +1519,8 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) {
if (isDebug()) {
dbgs() << "Cell ordering:\n";
- for (RegisterOrdering::iterator I = CellOrd.begin(), E = CellOrd.end();
- I != E; ++I) {
- unsigned VR = I->first, Pos = I->second;
+ for (const auto &I : CellOrd) {
+ unsigned VR = I.first, Pos = I.second;
dbgs() << printReg(VR, HRI) << " -> " << Pos << "\n";
}
}
diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
index 1a66394e9757..00615f355146 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -505,8 +505,8 @@ bool HexagonGenPredicate::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
collectPredicateGPR(MF);
- for (SetOfReg::iterator I = PredGPRs.begin(), E = PredGPRs.end(); I != E; ++I)
- processPredicateGPR(*I);
+ for (const RegisterSubReg &R : PredGPRs)
+ processPredicateGPR(R);
bool Again;
do {
diff --git a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 5d2e1b259449..43afae441457 100644
--- a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -1127,8 +1127,8 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L,
bool L1Used = false;
// Process nested loops first.
- for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
- Changed |= convertToHardwareLoop(*I, RecL0used, RecL1used);
+ for (MachineLoop *I : *L) {
+ Changed |= convertToHardwareLoop(I, RecL0used, RecL1used);
L0Used |= RecL0used;
L1Used |= RecL1used;
}
@@ -1587,16 +1587,6 @@ void HexagonHardwareLoops::setImmediate(MachineOperand &MO, int64_t Val) {
MO.setReg(NewR);
}
-static bool isImmValidForOpcode(unsigned CmpOpc, int64_t Imm) {
- // These two instructions are not extendable.
- if (CmpOpc == Hexagon::A4_cmpbeqi)
- return isUInt<8>(Imm);
- if (CmpOpc == Hexagon::A4_cmpbgti)
- return isInt<8>(Imm);
- // The rest of the comparison-with-immediate instructions are extendable.
- return true;
-}
-
bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
MachineBasicBlock *Header = L->getHeader();
MachineBasicBlock *Latch = L->getLoopLatch();
@@ -1812,9 +1802,9 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
// Most comparisons of register against an immediate value allow
// the immediate to be constant-extended. There are some exceptions
// though. Make sure the new combination will work.
- if (CmpImmOp->isImm())
- if (!isImmValidForOpcode(PredDef->getOpcode(), CmpImm))
- return false;
+ if (CmpImmOp->isImm() && !TII->isExtendable(*PredDef) &&
+ !TII->isValidOffset(PredDef->getOpcode(), CmpImm, TRI, false))
+ return false;
// Make sure that the compare happens after the bump. Otherwise,
// after the fixup, the compare would use a yet-undefined register.
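
The isImmValidForOpcode helper removed above amounted to fixed-width range checks on the compare immediate (unsigned 8-bit for A4_cmpbeqi, signed 8-bit for A4_cmpbgti), now replaced by TargetInstrInfo queries. A small sketch of such range checks over plain integers; it assumes nothing about the Hexagon ISA or LLVM's isInt/isUInt templates:

#include <cstdint>
#include <iostream>

// True if Imm fits in an unsigned N-bit immediate field.
template <unsigned N> bool fitsUInt(int64_t Imm) {
  return Imm >= 0 && Imm < (int64_t(1) << N);
}

// True if Imm fits in a signed N-bit immediate field.
template <unsigned N> bool fitsInt(int64_t Imm) {
  int64_t Lo = -(int64_t(1) << (N - 1));
  int64_t Hi = (int64_t(1) << (N - 1)) - 1;
  return Imm >= Lo && Imm <= Hi;
}

int main() {
  std::cout << fitsUInt<8>(255) << ' ' << fitsUInt<8>(256) << '\n'; // 1 0
  std::cout << fitsInt<8>(-128) << ' ' << fitsInt<8>(128) << '\n';  // 1 0
  return 0;
}
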
diff --git a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
index 44679d429de5..e2215c9900d0 100644
--- a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.cpp
@@ -44,12 +44,7 @@ HexagonHazardRecognizer::getHazardType(SUnit *SU, int stalls) {
if (!Resources->canReserveResources(*MI)) {
LLVM_DEBUG(dbgs() << "*** Hazard in cycle " << PacketNum << ", " << *MI);
HazardType RetVal = Hazard;
- if (TII->mayBeNewStore(*MI)) {
- // Make sure the register to be stored is defined by an instruction in the
- // packet.
- MachineOperand &MO = MI->getOperand(MI->getNumOperands() - 1);
- if (!MO.isReg() || RegDefs.count(MO.getReg()) == 0)
- return Hazard;
+ if (isNewStore(*MI)) {
// The .new store version uses different resources so check if it
// causes a hazard.
MachineFunction *MF = MI->getParent()->getParent();
@@ -105,6 +100,15 @@ bool HexagonHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
return UsesDotCur && ((SU == UsesDotCur) ^ (DotCurPNum == (int)PacketNum));
}
+/// Return true if the instruction would be converted to a new value store when
+/// packetized.
+bool HexagonHazardRecognizer::isNewStore(MachineInstr &MI) {
+ if (!TII->mayBeNewStore(MI))
+ return false;
+ MachineOperand &MO = MI.getOperand(MI.getNumOperands() - 1);
+ return (MO.isReg() && RegDefs.count(MO.getReg()) != 0);
+}
+
void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) {
MachineInstr *MI = SU->getInstr();
if (!MI)
@@ -119,7 +123,7 @@ void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) {
if (TII->isZeroCost(MI->getOpcode()))
return;
- if (!Resources->canReserveResources(*MI)) {
+ if (!Resources->canReserveResources(*MI) || isNewStore(*MI)) {
// It must be a .new store since other instructions must be able to be
// reserved at this point.
assert(TII->mayBeNewStore(*MI) && "Expecting .new store");
@@ -127,11 +131,12 @@ void HexagonHazardRecognizer::EmitInstruction(SUnit *SU) {
MachineInstr *NewMI =
MF->CreateMachineInstr(TII->get(TII->getDotNewOp(*MI)),
MI->getDebugLoc());
- assert(Resources->canReserveResources(*NewMI));
- Resources->reserveResources(*NewMI);
+ if (Resources->canReserveResources(*NewMI))
+ Resources->reserveResources(*NewMI);
+ else
+ Resources->reserveResources(*MI);
MF->deleteMachineInstr(NewMI);
- }
- else
+ } else
Resources->reserveResources(*MI);
LLVM_DEBUG(dbgs() << " Add instruction " << *MI);
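
The isNewStore helper introduced above asks one question: is the stored value produced by an instruction already accepted into the current packet? A standalone sketch of that bookkeeping with a plain set of register numbers; the Store struct and PacketDefs alias below are illustrative stand-ins, not the LLVM types:

#include <cstdint>
#include <iostream>
#include <unordered_set>

struct Store {
  bool HasRegValue;  // last operand is a register (the stored value)
  uint32_t ValueReg; // that register, if HasRegValue is true
};

// Registers defined by instructions already accepted into the packet.
using PacketDefs = std::unordered_set<uint32_t>;

// A store can become a .new store only if the value it stores is produced
// inside the same packet.
bool isNewStore(const Store &S, const PacketDefs &RegDefs) {
  return S.HasRegValue && RegDefs.count(S.ValueReg) != 0;
}

int main() {
  PacketDefs Defs{5, 7};
  std::cout << isNewStore({true, 5}, Defs) << ' '   // 1: value defined in packet
            << isNewStore({true, 9}, Defs) << '\n'; // 0: value comes from outside
  return 0;
}
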
diff --git a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h
index 53b9cb43b4b6..0528cbd1f15f 100644
--- a/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h
+++ b/llvm/lib/Target/Hexagon/HexagonHazardRecognizer.h
@@ -40,6 +40,10 @@ class HexagonHazardRecognizer : public ScheduleHazardRecognizer {
// The set of registers defined by instructions in the current packet.
SmallSet<unsigned, 8> RegDefs;
+ // Return true if the instruction is a store that is converted to a new value
+ // store because its value is defined in the same packet.
+ bool isNewStore(MachineInstr &MI);
+
public:
HexagonHazardRecognizer(const InstrItineraryData *II,
const HexagonInstrInfo *HII,
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 2679e399852f..161768b8dc22 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1176,6 +1176,9 @@ void HexagonDAGToDAGISel::ppHoistZextI1(std::vector<SDNode*> &&Nodes) {
EVT UVT = U->getValueType(0);
if (!UVT.isSimple() || !UVT.isInteger() || UVT.getSimpleVT() == MVT::i1)
continue;
+ // Do not generate a select for i1 vector types.
+ if (UVT.isVector() && UVT.getVectorElementType() == MVT::i1)
+ continue;
if (isMemOPCandidate(N, U))
continue;
@@ -1282,7 +1285,7 @@ void HexagonDAGToDAGISel::emitFunctionEntryCode() {
MachineFrameInfo &MFI = MF->getFrameInfo();
MachineBasicBlock *EntryBB = &MF->front();
- unsigned AR = FuncInfo->CreateReg(MVT::i32);
+ Register AR = FuncInfo->CreateReg(MVT::i32);
Align EntryMaxA = MFI.getMaxAlign();
BuildMI(EntryBB, DebugLoc(), HII->get(Hexagon::PS_aligna), AR)
.addImm(EntryMaxA.value());
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index ed4874baf7c8..0a6dd727eb82 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -230,8 +230,7 @@ bool Coloring::color() {
WorkQ.push_back(N);
}
- for (unsigned I = 0; I < WorkQ.size(); ++I) {
- Node N = WorkQ[I];
+ for (Node N : WorkQ) {
NodeSet &Ns = Edges[N];
auto P = getUniqueColor(Ns);
if (P.first) {
@@ -270,8 +269,7 @@ bool Coloring::color() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Coloring::dump() const {
dbgs() << "{ Order: {";
- for (unsigned I = 0; I != Order.size(); ++I) {
- Node P = Order[I];
+ for (Node P : Order) {
if (P != Ignore)
dbgs() << ' ' << P;
else
@@ -761,8 +759,7 @@ void ResultStack::print(raw_ostream &OS, const SelectionDAG &G) const {
namespace {
struct ShuffleMask {
ShuffleMask(ArrayRef<int> M) : Mask(M) {
- for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
- int M = Mask[I];
+ for (int M : Mask) {
if (M == -1)
continue;
MinSrc = (MinSrc == -1) ? M : std::min(MinSrc, M);
@@ -935,8 +932,7 @@ static SmallVector<unsigned, 4> getInputSegmentList(ShuffleMask SM,
unsigned Shift = Log2_32(SegLen);
BitVector Segs(alignTo(SM.MaxSrc + 1, SegLen) >> Shift);
- for (int I = 0, E = SM.Mask.size(); I != E; ++I) {
- int M = SM.Mask[I];
+ for (int M : SM.Mask) {
if (M >= 0)
Segs.set(M >> Shift);
}
@@ -2397,6 +2393,7 @@ void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
SDValue Base = N->getOperand(4);
SDValue Modifier = N->getOperand(5);
SDValue Offset = N->getOperand(6);
+ SDValue ImmOperand = CurDAG->getTargetConstant(0, dl, MVT::i32);
unsigned Opcode;
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
@@ -2418,7 +2415,8 @@ void HexagonDAGToDAGISel::SelectV65GatherPred(SDNode *N) {
}
SDVTList VTs = CurDAG->getVTList(MVT::Other);
- SDValue Ops[] = { Address, Predicate, Base, Modifier, Offset, Chain };
+ SDValue Ops[] = { Address, ImmOperand,
+ Predicate, Base, Modifier, Offset, Chain };
SDNode *Result = CurDAG->getMachineNode(Opcode, dl, VTs, Ops);
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2434,6 +2432,7 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
SDValue Base = N->getOperand(3);
SDValue Modifier = N->getOperand(4);
SDValue Offset = N->getOperand(5);
+ SDValue ImmOperand = CurDAG->getTargetConstant(0, dl, MVT::i32);
unsigned Opcode;
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
@@ -2455,7 +2454,7 @@ void HexagonDAGToDAGISel::SelectV65Gather(SDNode *N) {
}
SDVTList VTs = CurDAG->getVTList(MVT::Other);
- SDValue Ops[] = { Address, Base, Modifier, Offset, Chain };
+ SDValue Ops[] = { Address, ImmOperand, Base, Modifier, Offset, Chain };
SDNode *Result = CurDAG->getMachineNode(Opcode, dl, VTs, Ops);
MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 88effed9f076..d7ca934a23e6 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -543,9 +543,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// The Glue is necessary since all emitted instructions must be
// stuck together.
if (!CLI.IsTailCall) {
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, Glue);
+ for (const auto &R : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, dl, R.first, R.second, Glue);
Glue = Chain.getValue(1);
}
} else {
@@ -560,9 +559,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
//
// Do not flag preceding copytoreg stuff together with the following stuff.
Glue = SDValue();
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
- RegsToPass[i].second, Glue);
+ for (const auto &R : RegsToPass) {
+ Chain = DAG.getCopyToReg(Chain, dl, R.first, R.second, Glue);
Glue = Chain.getValue(1);
}
Glue = SDValue();
@@ -589,10 +587,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Add argument registers to the end of the list so that they are
// known live into the call.
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Ops.push_back(DAG.getRegister(RegsToPass[i].first,
- RegsToPass[i].second.getValueType()));
- }
+ for (const auto &R : RegsToPass)
+ Ops.push_back(DAG.getRegister(R.first, R.second.getValueType()));
const uint32_t *Mask = HRI.getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
@@ -690,7 +686,7 @@ HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
case InlineAsm::Kind_RegDef:
case InlineAsm::Kind_RegDefEarlyClobber: {
for (; NumVals; --NumVals, ++i) {
- unsigned Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg();
+ Register Reg = cast<RegisterSDNode>(Op.getOperand(i))->getReg();
if (Reg != LR)
continue;
HMFI.setHasClobberLR(true);
@@ -1190,7 +1186,7 @@ HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
}
// Return LR, which contains the return address. Mark it an implicit live-in.
- unsigned Reg = MF.addLiveIn(HRI.getRARegister(), getRegClassFor(MVT::i32));
+ Register Reg = MF.addLiveIn(HRI.getRARegister(), getRegClassFor(MVT::i32));
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
@@ -1776,6 +1772,18 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+ // Special handling for half-precision floating point conversions.
+ // Lower half float conversions into library calls.
+ setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+
// Handling of indexed loads/stores: default is "expand".
//
for (MVT VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f32, MVT::f64,
@@ -1856,6 +1864,11 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
else
setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf");
+ // Routines to handle fp16 storage type.
+ setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
+ setLibcallName(RTLIB::FPROUND_F64_F16, "__truncdfhf2");
+ setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
+
// These cause problems when the shift amount is non-constant.
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
@@ -2204,8 +2217,7 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
// Express the shuffle mask in terms of bytes.
SmallVector<int,8> ByteMask;
unsigned ElemBytes = VecTy.getVectorElementType().getSizeInBits() / 8;
- for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
- int M = Mask[i];
+ for (int M : Mask) {
if (M < 0) {
for (unsigned j = 0; j != ElemBytes; ++j)
ByteMask.push_back(-1);
@@ -2428,8 +2440,8 @@ HexagonTargetLowering::buildVector32(ArrayRef<SDValue> Elem, const SDLoc &dl,
if (AllConst) {
int32_t V = (Consts[0]->getZExtValue() & 0xFF) |
(Consts[1]->getZExtValue() & 0xFF) << 8 |
- (Consts[1]->getZExtValue() & 0xFF) << 16 |
- Consts[2]->getZExtValue() << 24;
+ (Consts[2]->getZExtValue() & 0xFF) << 16 |
+ Consts[3]->getZExtValue() << 24;
return DAG.getBitcast(MVT::v4i8, DAG.getConstant(V, dl, MVT::i32));
}
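
The hunk above corrects the byte packing for a constant v4i8 vector: bytes 2 and 3 previously came from Consts[1] and Consts[2] instead of Consts[2] and Consts[3]. A tiny sketch of the intended little-endian packing over plain integers:

#include <cstdint>
#include <cstdio>

// Pack four bytes into a 32-bit word, byte i landing in bits [8*i, 8*i+7].
uint32_t packBytes(const uint8_t B[4]) {
  return uint32_t(B[0]) | uint32_t(B[1]) << 8 | uint32_t(B[2]) << 16 |
         uint32_t(B[3]) << 24;
}

int main() {
  uint8_t B[4] = {0x11, 0x22, 0x33, 0x44};
  std::printf("0x%08x\n", packBytes(B)); // 0x44332211
  return 0;
}
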
@@ -2720,7 +2732,6 @@ SDValue
HexagonTargetLowering::getZero(const SDLoc &dl, MVT Ty, SelectionDAG &DAG)
const {
if (Ty.isVector()) {
- assert(Ty.isInteger() && "Only integer vectors are supported here");
unsigned W = Ty.getSizeInBits();
if (W <= 64)
return DAG.getBitcast(Ty, DAG.getConstant(0, dl, MVT::getIntegerVT(W)));
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index d518c036f125..f9ce7a9407aa 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -458,6 +458,7 @@ private:
SelectionDAG &DAG) const;
SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxSplatVector(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxExtractElement(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const;
@@ -468,7 +469,6 @@ private:
SDValue LowerHvxSignExt(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxCttz(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerHvxMul(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const;
@@ -476,6 +476,8 @@ private:
SDValue LowerHvxShift(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxFpExtend(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxConvertFpInt(SDValue Op, SelectionDAG &DAG) const;
SDValue SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const;
SDValue SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index f7237f496aee..0ba75a544c04 100644..100755
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -55,6 +55,12 @@ HexagonTargetLowering::initializeHVXLowering() {
addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
addRegisterClass(MVT::v128i1, &Hexagon::HvxQRRegClass);
+ if (Subtarget.useHVXV68Ops() && Subtarget.useHVXFloatingPoint()) {
+ addRegisterClass(MVT::v32f32, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v64f16, &Hexagon::HvxVRRegClass);
+ addRegisterClass(MVT::v64f32, &Hexagon::HvxWRRegClass);
+ addRegisterClass(MVT::v128f16, &Hexagon::HvxWRRegClass);
+ }
}
// Set up operation actions.
@@ -83,6 +89,72 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ if (Subtarget.useHVX128BOps() && Subtarget.useHVXV68Ops() &&
+ Subtarget.useHVXFloatingPoint()) {
+ setOperationAction(ISD::FMINNUM, MVT::v64f16, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v64f16, Legal);
+ setOperationAction(ISD::FADD, MVT::v64f16, Legal);
+ setOperationAction(ISD::FSUB, MVT::v64f16, Legal);
+ setOperationAction(ISD::FMUL, MVT::v64f16, Legal);
+ setOperationAction(ISD::FADD, MVT::v32f32, Legal);
+ setOperationAction(ISD::FSUB, MVT::v32f32, Legal);
+ setOperationAction(ISD::FMUL, MVT::v32f32, Legal);
+ setOperationAction(ISD::FMINNUM, MVT::v32f32, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::v32f32, Legal);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64f16, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v64f16, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
+
+ // Handle ISD::BUILD_VECTOR for v32f32 in a custom way to generate vsplat
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v32f32, Custom);
+
+ // BUILD_VECTOR with f16 operands cannot be promoted without
+ // promoting the result, so lower the node to vsplat or constant pool
+ setOperationAction(ISD::BUILD_VECTOR, MVT::f16, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::f16, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR, MVT::f16, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR, MVT::v64f16, Legal);
+ setOperationAction(ISD::SPLAT_VECTOR, MVT::v32f32, Legal);
+ // Vector shuffle is always promoted to ByteV and a bitcast to f16 is
+ // generated.
+ setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v64f16, ByteV);
+ setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v64f32, ByteW);
+ setPromoteTo(ISD::VECTOR_SHUFFLE, MVT::v32f32, ByteV);
+
+ // Custom-lower BUILD_VECTOR for vector pairs. The standard (target-
+ // independent) handling of it would convert it to a load, which is
+ // not always the optimal choice.
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v64f32, Custom);
+ // Make concat-vectors custom to handle concats of more than 2 vectors.
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v128f16, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64f32, Custom);
+
+ setOperationAction(ISD::LOAD, MVT::v64f32, Custom);
+ setOperationAction(ISD::STORE, MVT::v64f32, Custom);
+ setOperationAction(ISD::FADD, MVT::v64f32, Custom);
+ setOperationAction(ISD::FSUB, MVT::v64f32, Custom);
+ setOperationAction(ISD::FMUL, MVT::v64f32, Custom);
+ setOperationAction(ISD::FMINNUM, MVT::v64f32, Custom);
+ setOperationAction(ISD::FMAXNUM, MVT::v64f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v64f32, Custom);
+
+ if (Subtarget.useHVXQFloatOps()) {
+ setOperationAction(ISD::FP_EXTEND, MVT::v64f32, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v64f16, Legal);
+ } else if (Subtarget.useHVXIEEEFPOps()) {
+ setOperationAction(ISD::FP_EXTEND, MVT::v64f32, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v64f16, Legal);
+ }
+
+ setOperationAction(ISD::MLOAD, MVT::v32f32, Custom);
+ setOperationAction(ISD::MSTORE, MVT::v32f32, Custom);
+ setOperationAction(ISD::MLOAD, MVT::v64f16, Custom);
+ setOperationAction(ISD::MSTORE, MVT::v64f16, Custom);
+ setOperationAction(ISD::MLOAD, MVT::v64f32, Custom);
+ setOperationAction(ISD::MSTORE, MVT::v64f32, Custom);
+ }
+
for (MVT T : LegalV) {
setIndexedLoadAction(ISD::POST_INC, T, Legal);
setIndexedStoreAction(ISD::POST_INC, T, Legal);
@@ -137,6 +209,18 @@ HexagonTargetLowering::initializeHVXLowering() {
setPromoteTo(ISD::VECTOR_SHUFFLE, T, ByteV);
}
+ if (Subtarget.useHVXQFloatOps()) {
+ setOperationAction(ISD::SINT_TO_FP, T, Expand);
+ setOperationAction(ISD::UINT_TO_FP, T, Expand);
+ setOperationAction(ISD::FP_TO_SINT, T, Expand);
+ setOperationAction(ISD::FP_TO_UINT, T, Expand);
+ } else if (Subtarget.useHVXIEEEFPOps()) {
+ setOperationAction(ISD::SINT_TO_FP, T, Custom);
+ setOperationAction(ISD::UINT_TO_FP, T, Custom);
+ setOperationAction(ISD::FP_TO_SINT, T, Custom);
+ setOperationAction(ISD::FP_TO_UINT, T, Custom);
+ }
+
setCondCodeAction(ISD::SETNE, T, Expand);
setCondCodeAction(ISD::SETLE, T, Expand);
setCondCodeAction(ISD::SETGE, T, Expand);
@@ -198,8 +282,39 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::UMIN, T, Custom);
setOperationAction(ISD::UMAX, T, Custom);
}
+
+ setOperationAction(ISD::SINT_TO_FP, T, Custom);
+ setOperationAction(ISD::UINT_TO_FP, T, Custom);
+ setOperationAction(ISD::FP_TO_SINT, T, Custom);
+ setOperationAction(ISD::FP_TO_UINT, T, Custom);
}
+ setCondCodeAction(ISD::SETNE, MVT::v64f16, Expand);
+ setCondCodeAction(ISD::SETLE, MVT::v64f16, Expand);
+ setCondCodeAction(ISD::SETGE, MVT::v64f16, Expand);
+ setCondCodeAction(ISD::SETLT, MVT::v64f16, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::v64f16, Expand);
+ setCondCodeAction(ISD::SETOLE, MVT::v64f16, Expand);
+ setCondCodeAction(ISD::SETOGE, MVT::v64f16, Expand);
+ setCondCodeAction(ISD::SETOLT, MVT::v64f16, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::v64f16, Expand);
+ setCondCodeAction(ISD::SETULE, MVT::v64f16, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::v64f16, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::v64f16, Expand);
+
+ setCondCodeAction(ISD::SETNE, MVT::v32f32, Expand);
+ setCondCodeAction(ISD::SETLE, MVT::v32f32, Expand);
+ setCondCodeAction(ISD::SETGE, MVT::v32f32, Expand);
+ setCondCodeAction(ISD::SETLT, MVT::v32f32, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::v32f32, Expand);
+ setCondCodeAction(ISD::SETOLE, MVT::v32f32, Expand);
+ setCondCodeAction(ISD::SETOGE, MVT::v32f32, Expand);
+ setCondCodeAction(ISD::SETOLT, MVT::v32f32, Expand);
+ setCondCodeAction(ISD::SETUNE, MVT::v32f32, Expand);
+ setCondCodeAction(ISD::SETULE, MVT::v32f32, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::v32f32, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::v32f32, Expand);
+
// Boolean vectors.
for (MVT T : LegalW) {
@@ -497,7 +612,9 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
assert(ElemSize*VecLen == HwLen);
SmallVector<SDValue,32> Words;
- if (VecTy.getVectorElementType() != MVT::i32) {
+ if (VecTy.getVectorElementType() != MVT::i32 &&
+ !(Subtarget.useHVXFloatingPoint() &&
+ VecTy.getVectorElementType() == MVT::f32)) {
assert((ElemSize == 1 || ElemSize == 2) && "Invalid element size");
unsigned OpsPerWord = (ElemSize == 1) ? 4 : 2;
MVT PartVT = MVT::getVectorVT(VecTy.getVectorElementType(), OpsPerWord);
@@ -506,22 +623,31 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
Words.push_back(DAG.getBitcast(MVT::i32, W));
}
} else {
- Words.assign(Values.begin(), Values.end());
+ for (SDValue V : Values)
+ Words.push_back(DAG.getBitcast(MVT::i32, V));
}
+ auto isSplat = [] (ArrayRef<SDValue> Values, SDValue &SplatV) {
+ unsigned NumValues = Values.size();
+ assert(NumValues > 0);
+ bool IsUndef = true;
+ for (unsigned i = 0; i != NumValues; ++i) {
+ if (Values[i].isUndef())
+ continue;
+ IsUndef = false;
+ if (!SplatV.getNode())
+ SplatV = Values[i];
+ else if (SplatV != Values[i])
+ return false;
+ }
+ if (IsUndef)
+ SplatV = Values[0];
+ return true;
+ };
unsigned NumWords = Words.size();
- bool IsSplat = true, IsUndef = true;
SDValue SplatV;
- for (unsigned i = 0; i != NumWords && IsSplat; ++i) {
- if (isUndef(Words[i]))
- continue;
- IsUndef = false;
- if (!SplatV.getNode())
- SplatV = Words[i];
- else if (SplatV != Words[i])
- IsSplat = false;
- }
- if (IsUndef)
+ bool IsSplat = isSplat(Words, SplatV);
+ if (IsSplat && isUndef(SplatV))
return DAG.getUNDEF(VecTy);
if (IsSplat) {
assert(SplatV.getNode());
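
The isSplat lambda added above reports a splat when every defined (non-undef) word is the same value, and treats an all-undef input as a splat of its first slot. A standalone sketch of the same scan, with std::optional standing in for possibly-undef values:

#include <iostream>
#include <optional>
#include <vector>

// Returns true if all engaged values are equal (or all slots are empty).
// On success, Splat holds the common value; when every slot is empty, it is
// set to an arbitrary placeholder that the caller treats as undef.
bool isSplat(const std::vector<std::optional<int>> &Values, int &Splat) {
  bool AllUndef = true;
  bool HaveSplat = false;
  for (const auto &V : Values) {
    if (!V)
      continue;             // skip undef slots
    AllUndef = false;
    if (!HaveSplat) {
      Splat = *V;
      HaveSplat = true;
    } else if (Splat != *V) {
      return false;         // two different defined values: not a splat
    }
  }
  if (AllUndef)
    Splat = 0;              // placeholder; caller treats the result as undef
  return true;
}

int main() {
  int S = -1;
  std::cout << isSplat({7, std::nullopt, 7}, S) << ' ' << S << '\n'; // 1 7
  std::cout << isSplat({7, 8}, S) << '\n';                           // 0
  return 0;
}
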
@@ -618,24 +744,75 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
}
}
- // Construct two halves in parallel, then or them together.
+ // Find most common element to initialize vector with. This is to avoid
+ // unnecessary vinsert/valign for cases where the same value is present
+ // many times. Creates a histogram of the vector's elements to find the
+ // most common element n.
assert(4*Words.size() == Subtarget.getVectorLength());
- SDValue HalfV0 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
- SDValue HalfV1 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
- SDValue S = DAG.getConstant(4, dl, MVT::i32);
+ int VecHist[32];
+ int n = 0;
+ for (unsigned i = 0; i != NumWords; ++i) {
+ VecHist[i] = 0;
+ if (Words[i].isUndef())
+ continue;
+ for (unsigned j = i; j != NumWords; ++j)
+ if (Words[i] == Words[j])
+ VecHist[i]++;
+
+ if (VecHist[i] > VecHist[n])
+ n = i;
+ }
+
+ SDValue HalfV = getZero(dl, VecTy, DAG);
+ if (VecHist[n] > 1) {
+ SDValue SplatV = DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Words[n]);
+ HalfV = DAG.getNode(HexagonISD::VALIGN, dl, VecTy,
+ {HalfV, SplatV, DAG.getConstant(HwLen/2, dl, MVT::i32)});
+ }
+ SDValue HalfV0 = HalfV;
+ SDValue HalfV1 = HalfV;
+
+ // Construct two halves in parallel, then or them together. Rn and Rm count
+ // number of rotations needed before the next element. One last rotation is
+ // performed post-loop to position the last element.
+ int Rn = 0, Rm = 0;
+ SDValue Sn, Sm;
+ SDValue N = HalfV0;
+ SDValue M = HalfV1;
for (unsigned i = 0; i != NumWords/2; ++i) {
- SDValue N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
- {HalfV0, Words[i]});
- SDValue M = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
- {HalfV1, Words[i+NumWords/2]});
- HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, S});
- HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, S});
+
+ // Rotate by element count since last insertion.
+ if (Words[i] != Words[n] || VecHist[n] <= 1) {
+ Sn = DAG.getConstant(Rn, dl, MVT::i32);
+ HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, Sn});
+ N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
+ {HalfV0, Words[i]});
+ Rn = 0;
+ }
+ if (Words[i+NumWords/2] != Words[n] || VecHist[n] <= 1) {
+ Sm = DAG.getConstant(Rm, dl, MVT::i32);
+ HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, Sm});
+ M = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
+ {HalfV1, Words[i+NumWords/2]});
+ Rm = 0;
+ }
+ Rn += 4;
+ Rm += 4;
}
+ // Perform last rotation.
+ Sn = DAG.getConstant(Rn+HwLen/2, dl, MVT::i32);
+ Sm = DAG.getConstant(Rm, dl, MVT::i32);
+ HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, Sn});
+ HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, Sm});
+
+ SDValue T0 = DAG.getBitcast(tyVector(VecTy, MVT::i32), HalfV0);
+ SDValue T1 = DAG.getBitcast(tyVector(VecTy, MVT::i32), HalfV1);
+
+ SDValue DstV = DAG.getNode(ISD::OR, dl, ty(T0), {T0, T1});
- HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy,
- {HalfV0, DAG.getConstant(HwLen/2, dl, MVT::i32)});
- SDValue DstV = DAG.getNode(ISD::OR, dl, VecTy, {HalfV0, HalfV1});
- return DstV;
+ SDValue OutV =
+ DAG.getBitcast(tyVector(ty(DstV), VecTy.getVectorElementType()), DstV);
+ return OutV;
}
SDValue
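
The rewritten build sequence above first histograms the words and pre-fills both halves with the most common one via a splat, so only the remaining words need rotate/insert steps. A minimal sketch of the histogram part over plain integers, with -1 standing in for an undef word:

#include <cstddef>
#include <iostream>
#include <vector>

// Return the index of the most frequent non-undef element (-1 marks undef).
// Counting starts at each element's own position, so the first occurrence of
// the winning value has the highest count; if everything is undef, index 0
// is returned.
int mostCommonIndex(const std::vector<int> &Words) {
  std::vector<int> Hist(Words.size(), 0);
  std::size_t Best = 0;
  for (std::size_t I = 0; I != Words.size(); ++I) {
    if (Words[I] == -1)
      continue;
    for (std::size_t J = I; J != Words.size(); ++J)
      if (Words[I] == Words[J])
        ++Hist[I];
    if (Hist[I] > Hist[Best])
      Best = I;
  }
  return static_cast<int>(Best);
}

int main() {
  std::vector<int> Words{3, 5, 5, -1, 5, 3};
  std::cout << mostCommonIndex(Words) << '\n'; // 1: the first occurrence of 5
  return 0;
}
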
@@ -1237,6 +1414,19 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
if (VecTy.getVectorElementType() == MVT::i1)
return buildHvxVectorPred(Ops, dl, VecTy, DAG);
+ // For an MVT::f16 BUILD_VECTOR, since MVT::f16 is not a legal type,
+ // bitcast each operand to i16, build the vector as i16, and bitcast
+ // the result back to f16.
+ if (VecTy.getVectorElementType() == MVT::f16) {
+ SmallVector<SDValue,64> NewOps;
+ for (unsigned i = 0; i != Size; i++)
+ NewOps.push_back(DAG.getBitcast(MVT::i16, Ops[i]));
+
+ SDValue T0 = DAG.getNode(ISD::BUILD_VECTOR, dl,
+ tyVector(VecTy, MVT::i16), NewOps);
+ return DAG.getBitcast(tyVector(VecTy, MVT::f16), T0);
+ }
+
if (VecTy.getSizeInBits() == 16*Subtarget.getVectorLength()) {
ArrayRef<SDValue> A(Ops);
MVT SingleTy = typeSplit(VecTy).first;
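
The f16 handling added here (and in the splat and insert-element lowerings below) leans on the fact that a half value is just a 16-bit payload: the node is rebuilt over i16 and the result reinterpreted back to f16. A tiny sketch of that reinterpretation using C++20's std::bit_cast; the Half struct is a hypothetical storage-only stand-in, not an LLVM or Hexagon type:

#include <bit>
#include <cstdint>
#include <iostream>

// Storage-only half-precision value: 16 bits of payload, no arithmetic.
struct Half {
  std::uint16_t Bits;
};

// "Bitcast" in both directions: same 16 bits, different static type.
std::uint16_t toI16(Half H) { return std::bit_cast<std::uint16_t>(H); }
Half toF16(std::uint16_t B) { return std::bit_cast<Half>(B); }

int main() {
  Half One = toF16(0x3C00); // IEEE-754 binary16 encoding of 1.0
  std::cout << std::hex << toI16(One) << '\n'; // 3c00
  return 0;
}
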
@@ -1249,6 +1439,24 @@ HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
}
SDValue
+HexagonTargetLowering::LowerHvxSplatVector(SDValue Op, SelectionDAG &DAG)
+ const {
+ const SDLoc &dl(Op);
+ MVT VecTy = ty(Op);
+ MVT ArgTy = ty(Op.getOperand(0));
+
+ if (ArgTy == MVT::f16) {
+ MVT SplatTy = MVT::getVectorVT(MVT::i16, VecTy.getVectorNumElements());
+ SDValue ToInt16 = DAG.getBitcast(MVT::i16, Op.getOperand(0));
+ SDValue ToInt32 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, ToInt16);
+ SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, dl, SplatTy, ToInt32);
+ return DAG.getBitcast(VecTy, Splat);
+ }
+
+ return SDValue();
+}
+
+SDValue
HexagonTargetLowering::LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG)
const {
// Vector concatenation of two integer (non-bool) vectors does not need
@@ -1363,6 +1571,7 @@ SDValue
HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
const {
const SDLoc &dl(Op);
+ MVT VecTy = ty(Op);
SDValue VecV = Op.getOperand(0);
SDValue ValV = Op.getOperand(1);
SDValue IdxV = Op.getOperand(2);
@@ -1370,6 +1579,14 @@ HexagonTargetLowering::LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG)
if (ElemTy == MVT::i1)
return insertHvxElementPred(VecV, IdxV, ValV, dl, DAG);
+ if (ElemTy == MVT::f16) {
+ SDValue T0 = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ tyVector(VecTy, MVT::i16),
+ DAG.getBitcast(tyVector(VecTy, MVT::i16), VecV),
+ DAG.getBitcast(MVT::i16, ValV), IdxV);
+ return DAG.getBitcast(tyVector(VecTy, MVT::f16), T0);
+ }
+
return insertHvxElementReg(VecV, IdxV, ValV, dl, DAG);
}
@@ -1800,6 +2017,80 @@ HexagonTargetLowering::LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, {StoreLo, StoreHi});
}
+SDValue HexagonTargetLowering::LowerHvxFpExtend(SDValue Op,
+ SelectionDAG &DAG) const {
+ // This conversion only applies to QFloat.
+ assert(Subtarget.useHVXQFloatOps());
+
+ assert(Op->getOpcode() == ISD::FP_EXTEND);
+
+ MVT VecTy = ty(Op);
+ MVT ArgTy = ty(Op.getOperand(0));
+ const SDLoc &dl(Op);
+ assert(VecTy == MVT::v64f32 && ArgTy == MVT::v64f16);
+
+ SDValue F16Vec = Op.getOperand(0);
+
+ APFloat FloatVal = APFloat(1.0f);
+ bool Ignored;
+ FloatVal.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &Ignored);
+ SDValue Fp16Ones = DAG.getConstantFP(FloatVal, dl, ArgTy);
+ SDValue VmpyVec =
+ getInstr(Hexagon::V6_vmpy_qf32_hf, dl, VecTy, {F16Vec, Fp16Ones}, DAG);
+
+ MVT HalfTy = typeSplit(VecTy).first;
+ VectorPair Pair = opSplit(VmpyVec, dl, DAG);
+ SDValue LoVec =
+ getInstr(Hexagon::V6_vconv_sf_qf32, dl, HalfTy, {Pair.first}, DAG);
+ SDValue HiVec =
+ getInstr(Hexagon::V6_vconv_sf_qf32, dl, HalfTy, {Pair.second}, DAG);
+
+ SDValue ShuffVec =
+ getInstr(Hexagon::V6_vshuffvdd, dl, VecTy,
+ {HiVec, LoVec, DAG.getConstant(-4, dl, MVT::i32)}, DAG);
+
+ return ShuffVec;
+}
+
+SDValue
+HexagonTargetLowering::LowerHvxConvertFpInt(SDValue Op, SelectionDAG &DAG)
+ const {
+ // This conversion only applies to IEEE.
+ assert(Subtarget.useHVXIEEEFPOps());
+
+ unsigned Opc = Op.getOpcode();
+ // Catch invalid conversion ops (just in case).
+ assert(Opc == ISD::FP_TO_SINT || Opc == ISD::FP_TO_UINT ||
+ Opc == ISD::SINT_TO_FP || Opc == ISD::UINT_TO_FP);
+ MVT ResTy = ty(Op);
+
+ if (Opc == ISD::FP_TO_SINT || Opc == ISD::FP_TO_UINT) {
+ MVT FpTy = ty(Op.getOperand(0)).getVectorElementType();
+ // There are only conversions of f16.
+ if (FpTy != MVT::f16)
+ return SDValue();
+
+ MVT IntTy = ResTy.getVectorElementType();
+ // Other int types aren't legal in HVX, so we shouldn't see them here.
+ assert(IntTy == MVT::i8 || IntTy == MVT::i16 || IntTy == MVT::i32);
+ // Conversions to i8 and i16 are legal.
+ if (IntTy == MVT::i8 || IntTy == MVT::i16)
+ return Op;
+ } else {
+ // Converting int -> fp.
+ if (ResTy.getVectorElementType() != MVT::f16)
+ return SDValue();
+ MVT IntTy = ty(Op.getOperand(0)).getVectorElementType();
+ // Other int types aren't legal in HVX, so we shouldn't see them here.
+ assert(IntTy == MVT::i8 || IntTy == MVT::i16 || IntTy == MVT::i32);
+ // i8, i16 -> f16 is legal.
+ if (IntTy == MVT::i8 || IntTy == MVT::i16)
+ return Op;
+ }
+
+ return SDValue();
+}
+
SDValue
HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const {
assert(!Op.isMachineOpcode());
@@ -2104,10 +2395,22 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MLOAD:
case ISD::MSTORE:
return SplitHvxMemOp(Op, DAG);
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ if (ty(Op).getSizeInBits() == ty(Op.getOperand(0)).getSizeInBits())
+ return SplitHvxPairOp(Op, DAG);
+ break;
case ISD::CTPOP:
case ISD::CTLZ:
case ISD::CTTZ:
case ISD::MUL:
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
case ISD::MULHS:
case ISD::MULHU:
case ISD::AND:
@@ -2134,6 +2437,7 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
default:
break;
case ISD::BUILD_VECTOR: return LowerHvxBuildVector(Op, DAG);
+ case ISD::SPLAT_VECTOR: return LowerHvxSplatVector(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerHvxConcatVectors(Op, DAG);
case ISD::INSERT_SUBVECTOR: return LowerHvxInsertSubvector(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerHvxInsertElement(Op, DAG);
@@ -2158,6 +2462,11 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MSTORE: return LowerHvxMaskedOp(Op, DAG);
// Unaligned loads will be handled by the default lowering.
case ISD::LOAD: return SDValue();
+ case ISD::FP_EXTEND: return LowerHvxFpExtend(Op, DAG);
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: return LowerHvxConvertFpInt(Op, DAG);
}
#ifndef NDEBUG
Op.dumpr(&DAG);
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 931b0c0e0090..9b4e92a16663 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -146,6 +146,48 @@ static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
return Count;
}
+// Check if the A2_tfrsi instruction is cheap or not. If the operand has
+// to be constant-extended, it is not cheap since it occupies two slots
+// in a packet.
+bool HexagonInstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
+ // Enable the following steps only at Os/Oz
+ if (!(MI.getMF()->getFunction().hasOptSize()))
+ return MI.isAsCheapAsAMove();
+
+ if (MI.getOpcode() == Hexagon::A2_tfrsi) {
+ auto Op = MI.getOperand(1);
+ // If the instruction has a global address as operand, it is not cheap
+ // since the operand will be constant extended.
+ if (Op.getType() == MachineOperand::MO_GlobalAddress)
+ return false;
+ // If the instruction has an operand of size > 16 bits, it will be
+ // const-extended and hence it is not cheap.
+ if (Op.isImm()) {
+ int64_t Imm = Op.getImm();
+ if (!isInt<16>(Imm))
+ return false;
+ }
+ }
+ return MI.isAsCheapAsAMove();
+}
+
+// Do not sink floating point instructions that update the USR register.
+// Example:
+// feclearexcept
+// F2_conv_w2sf
+// fetestexcept
+// MachineSink sinks F2_conv_w2sf and we are not able to catch exceptions.
+// TODO: On some of these floating point instructions, USR is marked as Use.
+// In reality, these instructions also Def the USR. If USR is marked as Def,
+// some of the assumptions in assembler packetization are broken.
+bool HexagonInstrInfo::shouldSink(const MachineInstr &MI) const {
+ // Assumption: A floating point instruction that reads the USR will write
+ // the USR as well.
+ if (isFloat(MI) && MI.hasRegisterImplicitUseOperand(Hexagon::USR))
+ return false;
+ return true;
+}
+
/// Find the hardware loop instruction used to set-up the specified loop.
/// On Hexagon, we have two instructions used to set-up the hardware loop
/// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions
@@ -1464,75 +1506,75 @@ HexagonInstrInfo::expandVGatherPseudo(MachineInstr &MI) const {
switch (Opc) {
case Hexagon::V6_vgathermh_pseudo:
First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermh))
- .add(MI.getOperand(1))
.add(MI.getOperand(2))
- .add(MI.getOperand(3));
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4));
BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
.add(MI.getOperand(0))
- .addImm(0)
+ .addImm(MI.getOperand(1).getImm())
.addReg(Hexagon::VTMP);
MBB.erase(MI);
return First.getInstrIterator();
case Hexagon::V6_vgathermw_pseudo:
First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermw))
- .add(MI.getOperand(1))
.add(MI.getOperand(2))
- .add(MI.getOperand(3));
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4));
BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
.add(MI.getOperand(0))
- .addImm(0)
+ .addImm(MI.getOperand(1).getImm())
.addReg(Hexagon::VTMP);
MBB.erase(MI);
return First.getInstrIterator();
case Hexagon::V6_vgathermhw_pseudo:
First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhw))
- .add(MI.getOperand(1))
.add(MI.getOperand(2))
- .add(MI.getOperand(3));
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(4));
BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
.add(MI.getOperand(0))
- .addImm(0)
+ .addImm(MI.getOperand(1).getImm())
.addReg(Hexagon::VTMP);
MBB.erase(MI);
return First.getInstrIterator();
case Hexagon::V6_vgathermhq_pseudo:
First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhq))
- .add(MI.getOperand(1))
.add(MI.getOperand(2))
.add(MI.getOperand(3))
- .add(MI.getOperand(4));
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5));
BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
.add(MI.getOperand(0))
- .addImm(0)
+ .addImm(MI.getOperand(1).getImm())
.addReg(Hexagon::VTMP);
MBB.erase(MI);
return First.getInstrIterator();
case Hexagon::V6_vgathermwq_pseudo:
First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermwq))
- .add(MI.getOperand(1))
.add(MI.getOperand(2))
.add(MI.getOperand(3))
- .add(MI.getOperand(4));
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5));
BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
.add(MI.getOperand(0))
- .addImm(0)
+ .addImm(MI.getOperand(1).getImm())
.addReg(Hexagon::VTMP);
MBB.erase(MI);
return First.getInstrIterator();
case Hexagon::V6_vgathermhwq_pseudo:
First = BuildMI(MBB, MI, DL, get(Hexagon::V6_vgathermhwq))
- .add(MI.getOperand(1))
.add(MI.getOperand(2))
.add(MI.getOperand(3))
- .add(MI.getOperand(4));
+ .add(MI.getOperand(4))
+ .add(MI.getOperand(5));
BuildMI(MBB, MI, DL, get(Hexagon::V6_vS32b_new_ai))
.add(MI.getOperand(0))
- .addImm(0)
+ .addImm(MI.getOperand(1).getImm())
.addReg(Hexagon::VTMP);
MBB.erase(MI);
return First.getInstrIterator();
@@ -1851,6 +1893,7 @@ bool HexagonInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
case Hexagon::C4_cmplte:
case Hexagon::C4_cmplteu:
SrcReg2 = MI.getOperand(2).getReg();
+ Value = 0;
return true;
case Hexagon::C2_cmpeqi:
@@ -2725,7 +2768,13 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::V6_vL32b_nt_ai:
case Hexagon::V6_vS32b_nt_ai:
case Hexagon::V6_vL32Ub_ai:
- case Hexagon::V6_vS32Ub_ai: {
+ case Hexagon::V6_vS32Ub_ai:
+ case Hexagon::V6_vgathermh_pseudo:
+ case Hexagon::V6_vgathermw_pseudo:
+ case Hexagon::V6_vgathermhw_pseudo:
+ case Hexagon::V6_vgathermhq_pseudo:
+ case Hexagon::V6_vgathermwq_pseudo:
+ case Hexagon::V6_vgathermhwq_pseudo: {
unsigned VectorSize = TRI->getSpillSize(Hexagon::HvxVRRegClass);
assert(isPowerOf2_32(VectorSize));
if (Offset & (VectorSize-1))
@@ -2751,6 +2800,11 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::S4_storeirit_io:
case Hexagon::S4_storeirif_io:
return isShiftedUInt<6,2>(Offset);
+ // Handle these two compare instructions that are not extendable.
+ case Hexagon::A4_cmpbeqi:
+ return isUInt<8>(Offset);
+ case Hexagon::A4_cmpbgti:
+ return isInt<8>(Offset);
}
if (Extend)
@@ -2788,6 +2842,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::L4_isub_memopw_io:
case Hexagon::L4_add_memopw_io:
case Hexagon::L4_sub_memopw_io:
+ case Hexagon::L4_iand_memopw_io:
+ case Hexagon::L4_ior_memopw_io:
case Hexagon::L4_and_memopw_io:
case Hexagon::L4_or_memopw_io:
return (0 <= Offset && Offset <= 255);
@@ -2796,6 +2852,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::L4_isub_memoph_io:
case Hexagon::L4_add_memoph_io:
case Hexagon::L4_sub_memoph_io:
+ case Hexagon::L4_iand_memoph_io:
+ case Hexagon::L4_ior_memoph_io:
case Hexagon::L4_and_memoph_io:
case Hexagon::L4_or_memoph_io:
return (0 <= Offset && Offset <= 127);
@@ -2804,6 +2862,8 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::L4_isub_memopb_io:
case Hexagon::L4_add_memopb_io:
case Hexagon::L4_sub_memopb_io:
+ case Hexagon::L4_iand_memopb_io:
+ case Hexagon::L4_ior_memopb_io:
case Hexagon::L4_and_memopb_io:
case Hexagon::L4_or_memopb_io:
return (0 <= Offset && Offset <= 63);
@@ -2848,8 +2908,18 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset,
case Hexagon::S2_pstorerdt_io:
case Hexagon::S2_pstorerdf_io:
return isShiftedUInt<6,3>(Offset);
+
+ case Hexagon::L2_loadbsw2_io:
+ case Hexagon::L2_loadbzw2_io:
+ return isShiftedInt<11,1>(Offset);
+
+ case Hexagon::L2_loadbsw4_io:
+ case Hexagon::L2_loadbzw4_io:
+ return isShiftedInt<11,2>(Offset);
} // switch
+ dbgs() << "Failed Opcode is : " << Opcode << " (" << getName(Opcode)
+ << ")\n";
llvm_unreachable("No offset range is defined for this opcode. "
"Please define it in the above switch statement!");
}
@@ -3486,9 +3556,9 @@ int HexagonInstrInfo::getDuplexOpcode(const MachineInstr &MI,
if (Iter != DupMap.end())
return Iter->second;
} else { // Conversion to Tiny core.
- for (auto Iter = DupMap.begin(), End = DupMap.end(); Iter != End; ++Iter)
- if (Iter->second == OpNum)
- return Iter->first;
+ for (const auto &Iter : DupMap)
+ if (Iter.second == OpNum)
+ return Iter.first;
}
return -1;
}
@@ -3516,6 +3586,10 @@ int HexagonInstrInfo::getDotCurOp(const MachineInstr &MI) const {
return Hexagon::V6_vL32b_nt_cur_pi;
case Hexagon::V6_vL32b_nt_ai:
return Hexagon::V6_vL32b_nt_cur_ai;
+ case Hexagon::V6_vL32b_ppu:
+ return Hexagon::V6_vL32b_cur_ppu;
+ case Hexagon::V6_vL32b_nt_ppu:
+ return Hexagon::V6_vL32b_nt_cur_ppu;
}
return 0;
}
@@ -3532,6 +3606,10 @@ int HexagonInstrInfo::getNonDotCurOp(const MachineInstr &MI) const {
return Hexagon::V6_vL32b_nt_pi;
case Hexagon::V6_vL32b_nt_cur_ai:
return Hexagon::V6_vL32b_nt_ai;
+ case Hexagon::V6_vL32b_cur_ppu:
+ return Hexagon::V6_vL32b_ppu;
+ case Hexagon::V6_vL32b_nt_cur_ppu:
+ return Hexagon::V6_vL32b_nt_ppu;
}
return 0;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 830f04d9eac3..2af09c857d86 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -335,6 +335,13 @@ public:
getSerializableBitmaskMachineOperandTargetFlags() const override;
bool isTailCall(const MachineInstr &MI) const override;
+ bool isAsCheapAsAMove(const MachineInstr &MI) const override;
+
+ // Return true if the instruction should be sunk by MachineSink.
+ // MachineSink determines on its own whether the instruction is safe to sink;
+ // this gives the target a hook to override the default behavior with regard
+ // to which instructions should be sunk.
+ bool shouldSink(const MachineInstr &MI) const override;
/// HexagonInstrInfo specifics.
diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index 2cdfbe7845b6..ea6a7498e27f 100644
--- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -110,6 +110,8 @@ private:
bool changeAddAsl(NodeAddr<UseNode *> AddAslUN, MachineInstr *AddAslMI,
const MachineOperand &ImmOp, unsigned ImmOpNum);
bool isValidOffset(MachineInstr *MI, int Offset);
+ unsigned getBaseOpPosition(MachineInstr *MI);
+ unsigned getOffsetOpPosition(MachineInstr *MI);
};
} // end anonymous namespace
@@ -322,6 +324,25 @@ bool HexagonOptAddrMode::isSafeToExtLR(NodeAddr<StmtNode *> SN,
}
bool HexagonOptAddrMode::isValidOffset(MachineInstr *MI, int Offset) {
+ if (HII->isHVXVec(*MI)) {
+    // Only HVX vgather instructions are handled here.
+    // TODO: Extend the pass to other vector load/store operations.
+ switch (MI->getOpcode()) {
+ case Hexagon::V6_vgathermh_pseudo:
+ case Hexagon::V6_vgathermw_pseudo:
+ case Hexagon::V6_vgathermhw_pseudo:
+ case Hexagon::V6_vgathermhq_pseudo:
+ case Hexagon::V6_vgathermwq_pseudo:
+ case Hexagon::V6_vgathermhwq_pseudo:
+ return HII->isValidOffset(MI->getOpcode(), Offset, HRI, false);
+ default:
+ return false;
+ }
+ }
+
+ if (HII->getAddrMode(*MI) != HexagonII::BaseImmOffset)
+ return false;
+
unsigned AlignMask = 0;
switch (HII->getMemAccessSize(*MI)) {
case HexagonII::MemAccessSize::DoubleWordAccess:
@@ -345,29 +366,67 @@ bool HexagonOptAddrMode::isValidOffset(MachineInstr *MI, int Offset) {
return HII->isValidOffset(MI->getOpcode(), Offset, HRI, false);
}
+unsigned HexagonOptAddrMode::getBaseOpPosition(MachineInstr *MI) {
+ const MCInstrDesc &MID = MI->getDesc();
+ switch (MI->getOpcode()) {
+  // The vgather pseudos are both mayLoad and mayStore, so the base and
+  // offset operand positions must be specified explicitly.
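+  // For these pseudos the ins list is ($_dst_, $Ii, $Rt, $Mu, $Vv) with no
+  // explicit def, so the base register is operand 0 and the immediate offset
+  // is operand 1; a regular load has its def first, pushing the base and
+  // offset to operands 1 and 2.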
+ case Hexagon::V6_vgathermh_pseudo:
+ case Hexagon::V6_vgathermw_pseudo:
+ case Hexagon::V6_vgathermhw_pseudo:
+ case Hexagon::V6_vgathermhq_pseudo:
+ case Hexagon::V6_vgathermwq_pseudo:
+ case Hexagon::V6_vgathermhwq_pseudo:
+ return 0;
+ default:
+ return MID.mayLoad() ? 1 : 0;
+ }
+}
+
+unsigned HexagonOptAddrMode::getOffsetOpPosition(MachineInstr *MI) {
+ assert(
+ (HII->getAddrMode(*MI) == HexagonII::BaseImmOffset) &&
+ "Looking for an offset in non-BaseImmOffset addressing mode instruction");
+
+ const MCInstrDesc &MID = MI->getDesc();
+ switch (MI->getOpcode()) {
+  // The vgather pseudos are both mayLoad and mayStore, so the base and
+  // offset operand positions must be specified explicitly.
+ case Hexagon::V6_vgathermh_pseudo:
+ case Hexagon::V6_vgathermw_pseudo:
+ case Hexagon::V6_vgathermhw_pseudo:
+ case Hexagon::V6_vgathermhq_pseudo:
+ case Hexagon::V6_vgathermwq_pseudo:
+ case Hexagon::V6_vgathermhwq_pseudo:
+ return 1;
+ default:
+ return MID.mayLoad() ? 2 : 1;
+ }
+}
+
bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN,
MachineInstr *AddMI,
const NodeList &UNodeList) {
Register AddDefR = AddMI->getOperand(0).getReg();
+ Register BaseReg = AddMI->getOperand(1).getReg();
for (auto I = UNodeList.rbegin(), E = UNodeList.rend(); I != E; ++I) {
NodeAddr<UseNode *> UN = *I;
NodeAddr<StmtNode *> SN = UN.Addr->getOwner(*DFG);
MachineInstr *MI = SN.Addr->getCode();
const MCInstrDesc &MID = MI->getDesc();
if ((!MID.mayLoad() && !MID.mayStore()) ||
- HII->getAddrMode(*MI) != HexagonII::BaseImmOffset ||
- HII->isHVXVec(*MI))
+ HII->getAddrMode(*MI) != HexagonII::BaseImmOffset)
return false;
- MachineOperand BaseOp = MID.mayLoad() ? MI->getOperand(1)
- : MI->getOperand(0);
+ MachineOperand BaseOp = MI->getOperand(getBaseOpPosition(MI));
if (!BaseOp.isReg() || BaseOp.getReg() != AddDefR)
return false;
- MachineOperand OffsetOp = MID.mayLoad() ? MI->getOperand(2)
- : MI->getOperand(1);
+ MachineOperand OffsetOp = MI->getOperand(getOffsetOpPosition(MI));
if (!OffsetOp.isImm())
return false;
@@ -382,11 +441,19 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN,
// Ex: Rx= add(Rt,#10)
// memw(Rx+#0) = Rs
// will be replaced with => memw(Rt+#10) = Rs
- Register BaseReg = AddMI->getOperand(1).getReg();
if (!isSafeToExtLR(AddSN, AddMI, BaseReg, UNodeList))
return false;
}
+ NodeId LRExtRegRD = 0;
+  // Iterate through all the use nodes of the add statement and record the
+  // reaching def of the base register (BaseReg).
+ for (NodeAddr<UseNode *> UA : AddSN.Addr->members_if(DFG->IsUse, *DFG)) {
+ RegisterRef RR = UA.Addr->getRegRef(*DFG);
+ if (BaseReg == RR.Reg)
+ LRExtRegRD = UA.Addr->getReachingDef();
+ }
+
// Update all the uses of 'add' with the appropriate base and offset
// values.
bool Changed = false;
@@ -400,6 +467,12 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN,
LLVM_DEBUG(dbgs() << "\t\t[MI <BB#" << UseMI->getParent()->getNumber()
<< ">]: " << *UseMI << "\n");
Changed |= updateAddUses(AddMI, UseMI);
+
+      // Set the reaching def for the UseNode under consideration after
+      // updating the add use. This local fix-up avoids having to rebuild
+      // the RDF graph after the update.
+ NodeAddr<DefNode *> LRExtRegDN = DFG->addr<DefNode *>(LRExtRegRD);
+ UseN.Addr->linkToDef(UseN.Id, LRExtRegDN);
}
if (Changed)
@@ -409,21 +482,18 @@ bool HexagonOptAddrMode::processAddUses(NodeAddr<StmtNode *> AddSN,
}
bool HexagonOptAddrMode::updateAddUses(MachineInstr *AddMI,
- MachineInstr *UseMI) {
+ MachineInstr *UseMI) {
const MachineOperand ImmOp = AddMI->getOperand(2);
const MachineOperand AddRegOp = AddMI->getOperand(1);
- Register newReg = AddRegOp.getReg();
- const MCInstrDesc &MID = UseMI->getDesc();
+ Register NewReg = AddRegOp.getReg();
- MachineOperand &BaseOp = MID.mayLoad() ? UseMI->getOperand(1)
- : UseMI->getOperand(0);
- MachineOperand &OffsetOp = MID.mayLoad() ? UseMI->getOperand(2)
- : UseMI->getOperand(1);
- BaseOp.setReg(newReg);
+ MachineOperand &BaseOp = UseMI->getOperand(getBaseOpPosition(UseMI));
+ MachineOperand &OffsetOp = UseMI->getOperand(getOffsetOpPosition(UseMI));
+ BaseOp.setReg(NewReg);
BaseOp.setIsUndef(AddRegOp.isUndef());
BaseOp.setImplicit(AddRegOp.isImplicit());
OffsetOp.setImm(ImmOp.getImm() + OffsetOp.getImm());
- MRI->clearKillFlags(newReg);
+ MRI->clearKillFlags(NewReg);
return true;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index cad5ca8ab92e..3abbd896c519 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -87,18 +87,6 @@ def V8I8: PatLeaf<(v8i8 DoubleRegs:$R)>;
def V4I16: PatLeaf<(v4i16 DoubleRegs:$R)>;
def V2I32: PatLeaf<(v2i32 DoubleRegs:$R)>;
-def HQ8: PatLeaf<(VecQ8 HvxQR:$R)>;
-def HQ16: PatLeaf<(VecQ16 HvxQR:$R)>;
-def HQ32: PatLeaf<(VecQ32 HvxQR:$R)>;
-
-def HVI8: PatLeaf<(VecI8 HvxVR:$R)>;
-def HVI16: PatLeaf<(VecI16 HvxVR:$R)>;
-def HVI32: PatLeaf<(VecI32 HvxVR:$R)>;
-
-def HWI8: PatLeaf<(VecPI8 HvxWR:$R)>;
-def HWI16: PatLeaf<(VecPI16 HvxWR:$R)>;
-def HWI32: PatLeaf<(VecPI32 HvxWR:$R)>;
-
def SDTVecLeaf:
SDTypeProfile<1, 0, [SDTCisVec<0>]>;
def SDTVecVecIntOp:
@@ -269,6 +257,9 @@ def anyimm3: PatLeaf<(i32 AnyImm3:$Addr)>;
def f32ImmPred : PatLeaf<(f32 fpimm:$F)>;
def f64ImmPred : PatLeaf<(f64 fpimm:$F)>;
+def f32zero: PatLeaf<(f32 fpimm:$F), [{
+ return N->isExactlyValue(APFloat::getZero(APFloat::IEEEsingle(), false));
+}]>;
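+// f32zero matches a floating-point immediate of +0.0; it lets the vzero
+// PatFrags in HexagonPatternsHVX.td also cover splats of FP zero, so HVX
+// floating-point vector types can reuse the V6_vd0/PS_vdd0 zero pseudos.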
// This complex pattern is really only to detect various forms of
// sign-extension i32->i64. The selected value will be of type i64
@@ -378,6 +369,12 @@ def Umin: pf2<umin>; def Umax: pf2<umax>;
def Rol: pf2<rotl>;
+def Fptosi: pf1<fp_to_sint>;
+def Fptoui: pf1<fp_to_uint>;
+def Sitofp: pf1<sint_to_fp>;
+def Uitofp: pf1<uint_to_fp>;
+
+
// --(1) Immediate -------------------------------------------------------
//
@@ -2083,7 +2080,7 @@ let AddedComplexity = 20 in {
defm: Loadxi_pat<sextloadi8, i32, anyimm0, L2_loadrb_io>;
defm: Loadxi_pat<sextloadi16, i32, anyimm1, L2_loadrh_io>;
defm: Loadxi_pat<sextloadv2i8, v2i16, anyimm1, L2_loadbsw2_io>;
- defm: Loadxi_pat<sextloadv4i8, v4i16, anyimm2, L2_loadbzw4_io>;
+ defm: Loadxi_pat<sextloadv4i8, v4i16, anyimm2, L2_loadbsw4_io>;
defm: Loadxi_pat<zextloadi1, i32, anyimm0, L2_loadrub_io>;
defm: Loadxi_pat<zextloadi8, i32, anyimm0, L2_loadrub_io>;
defm: Loadxi_pat<zextloadi16, i32, anyimm1, L2_loadruh_io>;
@@ -2135,7 +2132,7 @@ let AddedComplexity = 60 in {
def: Loadxu_pat<sextloadi8, i32, anyimm0, L4_loadrb_ur>;
def: Loadxu_pat<sextloadi16, i32, anyimm1, L4_loadrh_ur>;
def: Loadxu_pat<sextloadv2i8, v2i16, anyimm1, L4_loadbsw2_ur>;
- def: Loadxu_pat<sextloadv4i8, v4i16, anyimm2, L4_loadbzw4_ur>;
+ def: Loadxu_pat<sextloadv4i8, v4i16, anyimm2, L4_loadbsw4_ur>;
def: Loadxu_pat<zextloadi1, i32, anyimm0, L4_loadrub_ur>;
def: Loadxu_pat<zextloadi8, i32, anyimm0, L4_loadrub_ur>;
def: Loadxu_pat<zextloadi16, i32, anyimm1, L4_loadruh_ur>;
diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
index a22a3f8ec0ca..0a3dff057ccd 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td
@@ -6,6 +6,21 @@
//
//===----------------------------------------------------------------------===//
+def HQ8: PatLeaf<(VecQ8 HvxQR:$R)>;
+def HQ16: PatLeaf<(VecQ16 HvxQR:$R)>;
+def HQ32: PatLeaf<(VecQ32 HvxQR:$R)>;
+
+def HVI8: PatLeaf<(VecI8 HvxVR:$R)>;
+def HVI16: PatLeaf<(VecI16 HvxVR:$R)>;
+def HVI32: PatLeaf<(VecI32 HvxVR:$R)>;
+def HVF16: PatLeaf<(VecF16 HvxVR:$R)>;
+def HVF32: PatLeaf<(VecF32 HvxVR:$R)>;
+
+def HWI8: PatLeaf<(VecPI8 HvxWR:$R)>;
+def HWI16: PatLeaf<(VecPI16 HvxWR:$R)>;
+def HWI32: PatLeaf<(VecPI32 HvxWR:$R)>;
+def HWF16: PatLeaf<(VecPF16 HvxWR:$R)>;
+def HWF32: PatLeaf<(VecPF32 HvxWR:$R)>;
def SDTVecUnaryOp:
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
@@ -49,7 +64,7 @@ def HexagonVPACKL: SDNode<"HexagonISD::VPACKL", SDTVecUnaryOp>;
def HexagonVUNPACK: SDNode<"HexagonISD::VUNPACK", SDTVecUnaryOp>;
def HexagonVUNPACKU: SDNode<"HexagonISD::VUNPACKU", SDTVecUnaryOp>;
-def vzero: PatFrag<(ops), (splat_vector (i32 0))>;
+def vzero: PatFrags<(ops), [(splat_vector (i32 0)), (splat_vector (f32zero))]>;
def qtrue: PatFrag<(ops), (HexagonQTRUE)>;
def qfalse: PatFrag<(ops), (HexagonQFALSE)>;
def qcat: PatFrag<(ops node:$Qs, node:$Qt),
@@ -150,12 +165,19 @@ let Predicates = [UseHVX] in {
defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI8, IsVecOff>;
defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI16, IsVecOff>;
defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecI32, IsVecOff>;
-
defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI8, IsVecOff>;
defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI16, IsVecOff>;
defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecI32, IsVecOff>;
}
+let Predicates = [UseHVXV68] in {
+ defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecF16, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_nt_ai, alignednontemporalload, VecF32, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecF16, IsVecOff>;
+ defm: HvxLda_pat<V6_vL32b_ai, alignedload, VecF32, IsVecOff>;
+ defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecF16, IsVecOff>;
+ defm: HvxLd_pat<V6_vL32Ub_ai, unalignedload, VecF32, IsVecOff>;
+}
// HVX stores
@@ -199,6 +221,15 @@ let Predicates = [UseHVX] in {
defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, HVI32, IsVecOff>;
}
+let Predicates = [UseHVXV68] in {
+ defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, HVF16, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32b_nt_ai, alignednontemporalstore, HVF32, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32b_ai, alignedstore, HVF16, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32b_ai, alignedstore, HVF32, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, HVF16, IsVecOff>;
+ defm: HvxSt_pat<V6_vS32Ub_ai, unalignedstore, HVF32, IsVecOff>;
+}
+
// Bitcasts between same-size vector types are no-ops, except for the
// actual type change.
let Predicates = [UseHVX] in {
@@ -211,6 +242,24 @@ let Predicates = [UseHVX] in {
defm: NopCast_pat<VecPI16, VecPI32, HvxWR>;
}
+let Predicates = [UseHVX, UseHVXFloatingPoint] in {
+ defm: NopCast_pat<VecI8, VecF16, HvxVR>;
+ defm: NopCast_pat<VecI8, VecF32, HvxVR>;
+ defm: NopCast_pat<VecI16, VecF16, HvxVR>;
+ defm: NopCast_pat<VecI16, VecF32, HvxVR>;
+ defm: NopCast_pat<VecI32, VecF16, HvxVR>;
+ defm: NopCast_pat<VecI32, VecF32, HvxVR>;
+ defm: NopCast_pat<VecF16, VecF32, HvxVR>;
+
+ defm: NopCast_pat<VecPI8, VecPF16, HvxWR>;
+ defm: NopCast_pat<VecPI8, VecPF32, HvxWR>;
+ defm: NopCast_pat<VecPI16, VecPF16, HvxWR>;
+ defm: NopCast_pat<VecPI16, VecPF32, HvxWR>;
+ defm: NopCast_pat<VecPI32, VecPF16, HvxWR>;
+ defm: NopCast_pat<VecPI32, VecPF32, HvxWR>;
+ defm: NopCast_pat<VecPF16, VecPF32, HvxWR>;
+}
+
let Predicates = [UseHVX] in {
let AddedComplexity = 100 in {
// These should be preferred over a vsplat of 0.
@@ -220,6 +269,7 @@ let Predicates = [UseHVX] in {
def: Pat<(VecPI8 vzero), (PS_vdd0)>;
def: Pat<(VecPI16 vzero), (PS_vdd0)>;
def: Pat<(VecPI32 vzero), (PS_vdd0)>;
+ def: Pat<(VecPF32 vzero), (PS_vdd0)>;
def: Pat<(concat_vectors (VecI8 vzero), (VecI8 vzero)), (PS_vdd0)>;
def: Pat<(concat_vectors (VecI16 vzero), (VecI16 vzero)), (PS_vdd0)>;
@@ -251,6 +301,28 @@ let Predicates = [UseHVX] in {
(V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
}
+let Predicates = [UseHVX, UseHVXFloatingPoint] in {
+ let AddedComplexity = 100 in {
+ def: Pat<(VecF16 vzero), (V6_vd0)>;
+ def: Pat<(VecF32 vzero), (V6_vd0)>;
+ def: Pat<(VecPF16 vzero), (PS_vdd0)>;
+ def: Pat<(VecPF32 vzero), (PS_vdd0)>;
+
+ def: Pat<(concat_vectors (VecF16 vzero), (VecF16 vzero)), (PS_vdd0)>;
+ def: Pat<(concat_vectors (VecF32 vzero), (VecF32 vzero)), (PS_vdd0)>;
+ }
+
+ def: Pat<(VecPF16 (concat_vectors HVF16:$Vs, HVF16:$Vt)),
+ (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
+ def: Pat<(VecPF32 (concat_vectors HVF32:$Vs, HVF32:$Vt)),
+ (Combinev HvxVR:$Vt, HvxVR:$Vs)>;
+
+ def: Pat<(HexagonVINSERTW0 HVF16:$Vu, I32:$Rt),
+ (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+ def: Pat<(HexagonVINSERTW0 HVF32:$Vu, I32:$Rt),
+ (V6_vinsertwr HvxVR:$Vu, I32:$Rt)>;
+}
+
// Splats for HvxV60
def V60splatib: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatB $V)))>;
def V60splatih: OutPatFrag<(ops node:$V), (V6_lvsplatw (ToI32 (SplatH $V)))>;
@@ -307,6 +379,18 @@ let Predicates = [UseHVX,UseHVXV62] in {
def: Pat<(VecPI32 (splat_vector I32:$Rs)), (Rep (V62splatrw $Rs))>;
}
}
+let Predicates = [UseHVXV68, UseHVXFloatingPoint] in {
+ let AddedComplexity = 30 in {
+ def: Pat<(VecF16 (splat_vector u16_0ImmPred:$V)), (V62splatih imm:$V)>;
+ def: Pat<(VecF32 (splat_vector anyint:$V)), (V62splatiw imm:$V)>;
+ def: Pat<(VecF32 (splat_vector f32ImmPred:$V)), (V62splatiw (ftoi $V))>;
+ }
+ let AddedComplexity = 20 in {
+ def: Pat<(VecF16 (splat_vector I32:$Rs)), (V62splatrh $Rs)>;
+ def: Pat<(VecF32 (splat_vector I32:$Rs)), (V62splatrw $Rs)>;
+ def: Pat<(VecF32 (splat_vector F32:$Rs)), (V62splatrw $Rs)>;
+ }
+}
class Vneg1<ValueType VecTy>
: PatFrag<(ops), (VecTy (splat_vector (i32 -1)))>;
@@ -369,6 +453,107 @@ let Predicates = [UseHVX] in {
(V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>;
}
+// For now, we always deal with vector floating point in SF mode.
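+// For example, (fadd HVF32:$Vs, HVF32:$Vt) is selected as
+// (V6_vconv_sf_qf32 (V6_vadd_sf $Vs, $Vt)): the qf32 result of the add is
+// converted right back to IEEE single precision.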
+class OpR_RR_pat_conv<InstHexagon MI, PatFrag Op, ValueType ResType,
+ PatFrag RsPred, PatFrag RtPred = RsPred>
+ : Pat<(ResType (Op RsPred:$Rs, RtPred:$Rt)),
+ (V6_vconv_sf_qf32 (VecF32 (MI RsPred:$Rs, RtPred:$Rt)))>;
+
+class OpR_RR_pat_conv_hf<InstHexagon MI, PatFrag Op, ValueType ResType,
+ PatFrag RsPred, PatFrag RtPred = RsPred>
+ : Pat<(ResType (Op RsPred:$Rs, RtPred:$Rt)),
+ (V6_vconv_hf_qf16 (VecF16 (MI RsPred:$Rs, RtPred:$Rt)))>;
+
+let Predicates = [UseHVXV68, UseHVXQFloat] in {
+ def: OpR_RR_pat_conv_hf<V6_vsub_hf, pf2<fsub>, VecF16, HVF16>;
+ def: OpR_RR_pat_conv_hf<V6_vadd_hf, pf2<fadd>, VecF16, HVF16>;
+ def: OpR_RR_pat_conv_hf<V6_vmpy_qf16_hf, pf2<fmul>, VecF16, HVF16>;
+ def: OpR_RR_pat_conv<V6_vsub_sf, pf2<fsub>, VecF32, HVF32>;
+ def: OpR_RR_pat_conv<V6_vadd_sf, pf2<fadd>, VecF32, HVF32>;
+ def: OpR_RR_pat_conv<V6_vmpy_qf32_sf, pf2<fmul>, VecF32, HVF32>;
+
+  // For now we assume that the fp32 value always comes in as an IEEE float,
+  // since the qfloat arithmetic instructions above always generate the
+  // accompanying conversions as part of their patterns.
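+  // In the fpround pattern below, adding V6_vd0 (zero) with V6_vadd_sf
+  // converts each IEEE sf half of the pair to qf32 so that V6_vconv_hf_qf32
+  // can narrow the pair to hf; V6_vdealh then restores the element order.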
+ def: Pat<(VecF16 (pf1<fpround> HWF32:$Vuu)),
+ (V6_vdealh (V6_vconv_hf_qf32
+ (VecPF32 (Combinev (V6_vadd_sf (HiVec HvxWR:$Vuu), (V6_vd0)),
+ (V6_vadd_sf (LoVec HvxWR:$Vuu), (V6_vd0))
+ ))))>;
+ // fpextend for QFloat is handled manually in HexagonISelLoweringHVX.cpp.
+}
+
+// HVX IEEE arithmetic Instructions
+let Predicates = [UseHVXV68, UseHVXIEEEFP] in {
+ def: Pat<(fadd HVF16:$Rs, HVF16:$Rt),
+ (V6_vadd_hf_hf HVF16:$Rs, HVF16:$Rt)>;
+ def: Pat<(fadd HVF32:$Rs, HVF32:$Rt),
+ (V6_vadd_sf_sf HVF32:$Rs, HVF32:$Rt)>;
+ def: Pat<(fsub HVF16:$Rs, HVF16:$Rt),
+ (V6_vsub_hf_hf HVF16:$Rs, HVF16:$Rt)>;
+ def: Pat<(fsub HVF32:$Rs, HVF32:$Rt),
+ (V6_vsub_sf_sf HVF32:$Rs, HVF32:$Rt)>;
+ def: Pat<(fmul HVF16:$Rs, HVF16:$Rt),
+ (V6_vmpy_hf_hf HVF16:$Rs, HVF16:$Rt)>;
+ def: Pat<(fmul HVF32:$Rs, HVF32:$Rt),
+ (V6_vmpy_sf_sf HVF32:$Rs, HVF32:$Rt)>;
+
+ def: Pat<(VecF16 (pf1<fpround> HWF32:$Vuu)),
+ (V6_vdealh (V6_vcvt_hf_sf (HiVec HvxWR:$Vuu), (LoVec HvxWR:$Vuu)))>;
+ def: Pat<(VecPF32 (pf1<fpextend> HVF16:$Vu)),
+ (V6_vcvt_sf_hf (V6_vshuffh HvxVR:$Vu))>;
+
+ def: OpR_R_pat<V6_vcvt_h_hf, Fptosi, VecI16, HVF16>;
+ def: OpR_R_pat<V6_vcvt_uh_hf, Fptoui, VecI16, HVF16>;
+ def: OpR_R_pat<V6_vcvt_hf_h, Sitofp, VecF16, HVI16>;
+ def: OpR_R_pat<V6_vcvt_hf_uh, Uitofp, VecF16, HVI16>;
+
+ def: Pat<(VecI8 (Fptosi HWF16:$Vu)),
+ (V6_vcvt_b_hf (HiVec $Vu), (LoVec $Vu))>;
+ def: Pat<(VecI8 (Fptoui HWF16:$Vu)),
+ (V6_vcvt_ub_hf (HiVec $Vu), (LoVec $Vu))>;
+ def: Pat<(VecPF16 (Sitofp HVI8:$Vu)), (V6_vcvt_hf_b HvxVR:$Vu)>;
+ def: Pat<(VecPF16 (Uitofp HVI8:$Vu)), (V6_vcvt_hf_ub HvxVR:$Vu)>;
+}
+
+let Predicates = [UseHVXV68, UseHVXFloatingPoint] in {
+ def: Pat<(vselect HQ16:$Qu, HVF16:$Vs, HVF16:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(vselect (qnot HQ16:$Qu), HVF16:$Vs, HVF16:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>;
+
+ def: Pat<(vselect HQ32:$Qu, HVF32:$Vs, HVF32:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vs, HvxVR:$Vt)>;
+ def: Pat<(vselect (qnot HQ32:$Qu), HVF32:$Vs, HVF32:$Vt),
+ (V6_vmux HvxQR:$Qu, HvxVR:$Vt, HvxVR:$Vs)>;
+}
+
+let Predicates = [UseHVXV68, UseHVX128B, UseHVXQFloat] in {
+ let AddedComplexity = 220 in {
+ defm: MinMax_pats<V6_vmin_hf, V6_vmax_hf, vselect, setgt, VecQ16, HVF16>;
+ defm: MinMax_pats<V6_vmin_hf, V6_vmax_hf, vselect, setogt, VecQ16, HVF16>;
+ defm: MinMax_pats<V6_vmin_sf, V6_vmax_sf, vselect, setgt, VecQ32, HVF32>;
+ defm: MinMax_pats<V6_vmin_sf, V6_vmax_sf, vselect, setogt, VecQ32, HVF32>;
+ }
+ def: OpR_RR_pat<V6_vmin_hf, pf2<fminnum>, VecF16, HVF16>;
+ def: OpR_RR_pat<V6_vmax_hf, pf2<fmaxnum>, VecF16, HVF16>;
+ def: OpR_RR_pat<V6_vmin_sf, pf2<fminnum>, VecF32, HVF32>;
+ def: OpR_RR_pat<V6_vmax_sf, pf2<fmaxnum>, VecF32, HVF32>;
+}
+
+let Predicates = [UseHVXV68, UseHVX128B, UseHVXIEEEFP] in {
+ let AddedComplexity = 220 in {
+ defm: MinMax_pats<V6_vfmin_hf, V6_vfmax_hf, vselect, setgt, VecQ16, HVF16>;
+ defm: MinMax_pats<V6_vfmin_hf, V6_vfmax_hf, vselect, setogt, VecQ16, HVF16>;
+ defm: MinMax_pats<V6_vfmin_sf, V6_vfmax_sf, vselect, setgt, VecQ32, HVF32>;
+ defm: MinMax_pats<V6_vfmin_sf, V6_vfmax_sf, vselect, setogt, VecQ32, HVF32>;
+ }
+ def: OpR_RR_pat<V6_vfmin_hf, pf2<fminnum>, VecF16, HVF16>;
+ def: OpR_RR_pat<V6_vfmax_hf, pf2<fmaxnum>, VecF16, HVF16>;
+ def: OpR_RR_pat<V6_vfmin_sf, pf2<fminnum>, VecF32, HVF32>;
+ def: OpR_RR_pat<V6_vfmax_sf, pf2<fmaxnum>, VecF32, HVF32>;
+}
+
let Predicates = [UseHVX] in {
// For i8 vectors Vs = (a0, a1, ...), Vt = (b0, b1, ...),
// V6_vmpybv Vs, Vt produces a pair of i16 vectors Hi:Lo,
@@ -551,6 +736,12 @@ let Predicates = [UseHVX] in {
def: HvxSel_pat<PS_wselect, HWI32>;
}
+def V2Q: OutPatFrag<(ops node:$Vs), (V6_vandvrt $Vs, (A2_tfrsi -1))>;
+
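+// A scalar select between two HVX predicates is done on the vector side:
+// expand each predicate to a vector with Q2V, select with PS_vselect on the
+// scalar predicate, and convert the result back to a predicate with V2Q.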
+let Predicates = [UseHVX] in
+ def: Pat<(select I1:$Pu, VecI1:$Qs, VecI1:$Qt),
+ (V2Q (PS_vselect $Pu, (Q2V $Qs), (Q2V $Qt)))>;
+
let Predicates = [UseHVX] in {
def: Pat<(VecQ8 (qtrue)), (PS_qtrue)>;
def: Pat<(VecQ16 (qtrue)), (PS_qtrue)>;
@@ -623,3 +814,63 @@ let Predicates = [UseHVX] in {
def: AccRRR_pat<V6_vgtuw_or, Or, setugt, HQ32, HVI32, HVI32>;
def: AccRRR_pat<V6_vgtuw_xor, Xor, setugt, HQ32, HVI32, HVI32>;
}
+
+let Predicates = [UseHVXV68, UseHVXFloatingPoint] in {
+ def: OpR_RR_pat<V6_veqh, seteq, VecQ16, HVF16>;
+ def: OpR_RR_pat<V6_veqh, setoeq, VecQ16, HVF16>;
+ def: OpR_RR_pat<V6_veqh, setueq, VecQ16, HVF16>;
+ def: OpR_RR_pat<V6_vgthf, setgt, VecQ16, HVF16>;
+ def: OpR_RR_pat<V6_vgthf, setogt, VecQ16, HVF16>;
+ def: OpR_RR_pat<V6_vgthf, setugt, VecQ16, HVF16>;
+
+ def: OpR_RR_pat<V6_veqw, seteq, VecQ32, HVF32>;
+ def: OpR_RR_pat<V6_veqw, setoeq, VecQ32, HVF32>;
+ def: OpR_RR_pat<V6_veqw, setueq, VecQ32, HVF32>;
+ def: OpR_RR_pat<V6_vgtsf, setgt, VecQ32, HVF32>;
+ def: OpR_RR_pat<V6_vgtsf, setogt, VecQ32, HVF32>;
+ def: OpR_RR_pat<V6_vgtsf, setugt, VecQ32, HVF32>;
+
+ def: AccRRR_pat<V6_veqh_and, And, seteq, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_veqh_or, Or, seteq, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_veqh_xor, Xor, seteq, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_veqh_and, And, setoeq, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_veqh_or, Or, setoeq, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_veqh_xor, Xor, setoeq, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_veqh_and, And, setueq, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_veqh_or, Or, setueq, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_veqh_xor, Xor, setueq, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_vgthf_and, And, setgt, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_vgthf_or, Or, setgt, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_vgthf_xor, Xor, setgt, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_vgthf_and, And, setogt, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_vgthf_or, Or, setogt, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_vgthf_xor, Xor, setogt, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_vgthf_and, And, setugt, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_vgthf_or, Or, setugt, HQ16, HVF16, HVF16>;
+ def: AccRRR_pat<V6_vgthf_xor, Xor, setugt, HQ16, HVF16, HVF16>;
+
+ def: AccRRR_pat<V6_veqw_and, And, seteq, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_veqw_or, Or, seteq, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_veqw_xor, Xor, seteq, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_veqw_and, And, setoeq, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_veqw_or, Or, setoeq, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_veqw_xor, Xor, setoeq, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_veqw_and, And, setueq, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_veqw_or, Or, setueq, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_veqw_xor, Xor, setueq, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_vgtsf_and, And, setgt, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_vgtsf_or, Or, setgt, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_vgtsf_xor, Xor, setgt, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_vgtsf_and, And, setogt, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_vgtsf_or, Or, setogt, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_vgtsf_xor, Xor, setogt, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_vgtsf_and, And, setugt, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_vgtsf_or, Or, setugt, HQ32, HVF32, HVF32>;
+ def: AccRRR_pat<V6_vgtsf_xor, Xor, setugt, HQ32, HVF32, HVF32>;
+
+ def: Pat<(VecQ16 (setone HVF16:$Vt, HVF16:$Vu)),
+ (V6_pred_not (V6_veqh HvxVR:$Vt, HvxVR:$Vu))>;
+
+ def: Pat<(VecQ32 (setone HVF32:$Vt, HVF32:$Vu)),
+ (V6_pred_not (V6_veqw HvxVR:$Vt, HvxVR:$Vu))>;
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td
index 4cd45ecbe1a1..f927f9b9e7c3 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatternsV65.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatternsV65.td
@@ -7,28 +7,31 @@
//===----------------------------------------------------------------------===//
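+// The vgather pseudos carry an explicit s4_0Imm offset operand and are marked
+// as BaseImmOffset with an access size, so that HexagonOptAddrMode can fold a
+// constant add into their base+offset form and the expansion in
+// HexagonInstrInfo can emit the offset on the V6_vS32b_new_ai store.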
multiclass vgathermh<RegisterClass RC> {
- let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1 in
+ let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,
+ mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in
def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ),
- (ins IntRegs:$_dst_, IntRegs:$Rt,
- ModRegs:$Mu, RC:$Vv),
+ (ins IntRegs:$_dst_, s4_0Imm:$Ii,
+ IntRegs:$Rt, ModRegs:$Mu, RC:$Vv),
".error \"should not emit\" ",
[]>;
}
multiclass vgathermw<RegisterClass RC> {
- let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1 in
+ let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,
+ mayStore = 1, addrMode = BaseImmOffset, accessSize = WordAccess in
def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ),
- (ins IntRegs:$_dst_, IntRegs:$Rt,
- ModRegs:$Mu, RC:$Vv),
+ (ins IntRegs:$_dst_, s4_0Imm:$Ii,
+ IntRegs:$Rt, ModRegs:$Mu, RC:$Vv),
".error \"should not emit\" ",
[]>;
}
multiclass vgathermhw<RegisterClass RC> {
- let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1 in
+ let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,
+ mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in
def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ),
- (ins IntRegs:$_dst_, IntRegs:$Rt,
- ModRegs:$Mu, RC:$Vv),
+ (ins IntRegs:$_dst_, s4_0Imm:$Ii,
+ IntRegs:$Rt, ModRegs:$Mu, RC:$Vv),
".error \"should not emit\" ",
[]>;
}
@@ -38,28 +41,34 @@ defm V6_vgathermw_pseudo : vgathermw<HvxVR>;
defm V6_vgathermhw_pseudo : vgathermhw<HvxWR>;
multiclass vgathermhq<RegisterClass RC1, RegisterClass RC2> {
- let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1 in
+ let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,
+ mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in
def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ),
- (ins IntRegs:$_dst_, RC2:$Vq, IntRegs:$Rt,
- ModRegs:$Mu, RC1:$Vv),
+ (ins IntRegs:$_dst_, s4_0Imm:$Ii,
+ RC2:$Vq, IntRegs:$Rt, ModRegs:$Mu,
+ RC1:$Vv),
".error \"should not emit\" ",
[]>;
}
multiclass vgathermwq<RegisterClass RC1, RegisterClass RC2> {
- let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1 in
+ let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,
+ mayStore = 1, addrMode = BaseImmOffset, accessSize = WordAccess in
def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ),
- (ins IntRegs:$_dst_, RC2:$Vq, IntRegs:$Rt,
- ModRegs:$Mu, RC1:$Vv),
+ (ins IntRegs:$_dst_, s4_0Imm:$Ii,
+ RC2:$Vq, IntRegs:$Rt, ModRegs:$Mu,
+ RC1:$Vv),
".error \"should not emit\" ",
[]>;
}
multiclass vgathermhwq<RegisterClass RC1, RegisterClass RC2> {
- let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, mayStore = 1 in
+ let isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1,
+ mayStore = 1, addrMode = BaseImmOffset, accessSize = HalfWordAccess in
def NAME : CVI_GATHER_TMP_LD_Resource_NoOpcode<(outs ),
- (ins IntRegs:$_dst_, RC2:$Vq, IntRegs:$Rt,
- ModRegs:$Mu, RC1:$Vv),
+ (ins IntRegs:$_dst_, s4_0Imm:$Ii,
+ RC2:$Vq, IntRegs:$Rt, ModRegs:$Mu,
+ RC1:$Vv),
".error \"should not emit\" ",
[]>;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
index 8b7138d3c809..4c387c8ba638 100644
--- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -479,6 +479,10 @@ def VecI16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
[v32i16, v64i16, v32i16]>;
def VecI32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
[v16i32, v32i32, v16i32]>;
+def VecF16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v32f16, v64f16, v32f16]>;
+def VecF32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v16f32, v32f32, v16f32]>;
def VecPI8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
[v128i8, v256i8, v128i8]>;
@@ -486,6 +490,10 @@ def VecPI16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
[v64i16, v128i16, v64i16]>;
def VecPI32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
[v32i32, v64i32, v32i32]>;
+def VecPF16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v64f16, v128f16, v64f16]>;
+def VecPF32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
+ [v32f32, v64f32, v32f32]>;
def VecQ8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
[v64i1, v128i1, v64i1]>;
@@ -496,13 +504,13 @@ def VecQ32: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
// HVX register classes
-def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32], 512,
+def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32, VecF16, VecF32], 512,
(add (sequence "V%u", 0, 31), VTMP)> {
let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode],
[RegInfo<512,512,512>, RegInfo<1024,1024,1024>, RegInfo<512,512,512>]>;
}
-def HvxWR : RegisterClass<"Hexagon", [VecPI8, VecPI16, VecPI32], 1024,
+def HvxWR : RegisterClass<"Hexagon", [VecPI8, VecPI16, VecPI32, VecPF16, VecPF32], 1024,
(add (sequence "W%u", 0, 15), (sequence "WR%u", 0, 15))> {
let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode],
[RegInfo<1024,1024,1024>, RegInfo<2048,2048,2048>, RegInfo<1024,1024,1024>]>;
diff --git a/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
index 9a0f57fce97d..ada78ca70559 100644
--- a/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -224,14 +224,14 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
unsigned NumRegs = MRI->getNumVirtRegs();
BitVector DoubleRegs(NumRegs);
for (unsigned i = 0; i < NumRegs; ++i) {
- unsigned R = Register::index2VirtReg(i);
+ Register R = Register::index2VirtReg(i);
if (MRI->getRegClass(R) == DoubleRC)
DoubleRegs.set(i);
}
BitVector FixedRegs(NumRegs);
for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
- unsigned R = Register::index2VirtReg(x);
+ Register R = Register::index2VirtReg(x);
MachineInstr *DefI = MRI->getVRegDef(R);
// In some cases a register may exist, but never be defined or used.
// It should never appear anywhere, but mark it as "fixed", just to be
@@ -244,7 +244,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
if (FixedRegs[x])
continue;
- unsigned R = Register::index2VirtReg(x);
+ Register R = Register::index2VirtReg(x);
LLVM_DEBUG(dbgs() << printReg(R, TRI) << " ~~");
USet &Asc = AssocMap[R];
for (auto U = MRI->use_nodbg_begin(R), Z = MRI->use_nodbg_end();
@@ -281,7 +281,7 @@ void HexagonSplitDoubleRegs::partitionRegisters(UUSetMap &P2Rs) {
unsigned NextP = 1;
USet Visited;
for (int x = DoubleRegs.find_first(); x >= 0; x = DoubleRegs.find_next(x)) {
- unsigned R = Register::index2VirtReg(x);
+ Register R = Register::index2VirtReg(x);
if (Visited.count(R))
continue;
// Create a new partition for R.
@@ -578,8 +578,7 @@ void HexagonSplitDoubleRegs::collectIndRegs(LoopRegMap &IRM) {
append_range(WorkQ, *WorkQ[i]);
USet Rs;
- for (unsigned i = 0, n = WorkQ.size(); i < n; ++i) {
- MachineLoop *L = WorkQ[i];
+ for (MachineLoop *L : WorkQ) {
Rs.clear();
collectIndRegsForLoop(L, Rs);
if (!Rs.empty())
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index 08bb4580b585..bdd2a2cfc5fa 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -228,7 +228,9 @@ bool HexagonSubtarget::isTypeForHVX(Type *VecTy, bool IncludeBool) const {
if (!VecTy->isVectorTy() || isa<ScalableVectorType>(VecTy))
return false;
// Avoid types like <2 x i32*>.
- if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy())
+ Type *ScalTy = VecTy->getScalarType();
+ if (!ScalTy->isIntegerTy() &&
+ !(ScalTy->isFloatingPointTy() && useHVXFloatingPoint()))
return false;
// The given type may be something like <17 x i32>, which is not MVT,
// but can be represented as (non-simple) EVT.
@@ -466,28 +468,46 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, int SrcOpIdx,
return;
}
- if (!hasV60Ops())
- return;
-
- // Set the latency for a copy to zero since we hope that is will get removed.
+  // Set the latency for a copy to zero since we hope that it will get
+  // removed.
if (DstInst->isCopy())
Dep.setLatency(0);
// If it's a REG_SEQUENCE/COPY, use its destination instruction to determine
// the correct latency.
- if ((DstInst->isRegSequence() || DstInst->isCopy()) && Dst->NumSuccs == 1) {
+  // If there are multiple uses of the def of the COPY/REG_SEQUENCE, set the
+  // latency only if the latencies on all the uses are equal; otherwise set
+  // it to the default.
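+  // For example, if two users of the def see operand latencies 2 and 3, the
+  // dependence latency falls back to 0 below; if both users see 2, the
+  // dependence gets latency 2.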
+ if ((DstInst->isRegSequence() || DstInst->isCopy())) {
Register DReg = DstInst->getOperand(0).getReg();
- MachineInstr *DDst = Dst->Succs[0].getSUnit()->getInstr();
- unsigned UseIdx = -1;
- for (unsigned OpNum = 0; OpNum < DDst->getNumOperands(); OpNum++) {
- const MachineOperand &MO = DDst->getOperand(OpNum);
- if (MO.isReg() && MO.getReg() && MO.isUse() && MO.getReg() == DReg) {
- UseIdx = OpNum;
+ int DLatency = -1;
+ for (const auto &DDep : Dst->Succs) {
+ MachineInstr *DDst = DDep.getSUnit()->getInstr();
+ int UseIdx = -1;
+ for (unsigned OpNum = 0; OpNum < DDst->getNumOperands(); OpNum++) {
+ const MachineOperand &MO = DDst->getOperand(OpNum);
+ if (MO.isReg() && MO.getReg() && MO.isUse() && MO.getReg() == DReg) {
+ UseIdx = OpNum;
+ break;
+ }
+ }
+
+ if (UseIdx == -1)
+ continue;
+
+ int Latency = (InstrInfo.getOperandLatency(&InstrItins, *SrcInst, 0,
+ *DDst, UseIdx));
+ // Set DLatency for the first time.
+ DLatency = (DLatency == -1) ? Latency : DLatency;
+
+ // For multiple uses, if the Latency is different across uses, reset
+ // DLatency.
+ if (DLatency != Latency) {
+ DLatency = -1;
break;
}
}
- int DLatency = (InstrInfo.getOperandLatency(&InstrItins, *SrcInst,
- 0, *DDst, UseIdx));
+
DLatency = std::max(DLatency, 0);
Dep.setLatency((unsigned)DLatency);
}
@@ -500,8 +520,10 @@ void HexagonSubtarget::adjustSchedDependency(SUnit *Src, int SrcOpIdx,
Dep.setLatency(0);
return;
}
-
- updateLatency(*SrcInst, *DstInst, Dep);
+ int Latency = Dep.getLatency();
+ bool IsArtificial = Dep.isArtificial();
+ Latency = updateLatency(*SrcInst, *DstInst, IsArtificial, Latency);
+ Dep.setLatency(Latency);
}
void HexagonSubtarget::getPostRAMutations(
@@ -530,21 +552,19 @@ bool HexagonSubtarget::usePredicatedCalls() const {
return EnablePredicatedCalls;
}
-void HexagonSubtarget::updateLatency(MachineInstr &SrcInst,
- MachineInstr &DstInst, SDep &Dep) const {
- if (Dep.isArtificial()) {
- Dep.setLatency(1);
- return;
- }
-
+int HexagonSubtarget::updateLatency(MachineInstr &SrcInst,
+ MachineInstr &DstInst, bool IsArtificial,
+ int Latency) const {
+ if (IsArtificial)
+ return 1;
if (!hasV60Ops())
- return;
-
- auto &QII = static_cast<const HexagonInstrInfo&>(*getInstrInfo());
+ return Latency;
+ auto &QII = static_cast<const HexagonInstrInfo &>(*getInstrInfo());
// BSB scheduling.
if (QII.isHVXVec(SrcInst) || useBSBScheduling())
- Dep.setLatency((Dep.getLatency() + 1) >> 1);
+ Latency = (Latency + 1) >> 1;
+ return Latency;
}
void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
@@ -580,9 +600,9 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
// For some instructions (ex: COPY), we might end up with < 0 latency
// as they don't have any Itinerary class associated with them.
Latency = std::max(Latency, 0);
-
+ bool IsArtificial = I.isArtificial();
+ Latency = updateLatency(*SrcI, *DstI, IsArtificial, Latency);
I.setLatency(Latency);
- updateLatency(*SrcI, *DstI, I);
}
}
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index e4f375440be1..db682676cf12 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -325,8 +325,8 @@ public:
private:
// Helper function responsible for increasing the latency only.
- void updateLatency(MachineInstr &SrcInst, MachineInstr &DstInst, SDep &Dep)
- const;
+ int updateLatency(MachineInstr &SrcInst, MachineInstr &DstInst,
+ bool IsArtificial, int Latency) const;
void restoreLatency(SUnit *Src, SUnit *Dst) const;
void changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat) const;
bool isBestZeroLatency(SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII,
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index fcf829b522cc..c6703bb8a62a 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -139,6 +139,7 @@ namespace llvm {
void initializeHexagonBitSimplifyPass(PassRegistry&);
void initializeHexagonConstExtendersPass(PassRegistry&);
void initializeHexagonConstPropagationPass(PassRegistry&);
+ void initializeHexagonCopyToCombinePass(PassRegistry&);
void initializeHexagonEarlyIfConversionPass(PassRegistry&);
void initializeHexagonExpandCondsetsPass(PassRegistry&);
void initializeHexagonGenMuxPass(PassRegistry&);
@@ -199,6 +200,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonTarget() {
initializeHexagonBitSimplifyPass(PR);
initializeHexagonConstExtendersPass(PR);
initializeHexagonConstPropagationPass(PR);
+ initializeHexagonCopyToCombinePass(PR);
initializeHexagonEarlyIfConversionPass(PR);
initializeHexagonGenMuxPass(PR);
initializeHexagonHardwareLoopsPass(PR);
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 85ec0cdcd8f0..e9b658d18175 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -886,7 +886,8 @@ bool HexagonPacketizerList::canPromoteToDotNew(const MachineInstr &MI,
// Create a dot new machine instruction to see if resources can be
// allocated. If not, bail out now.
- int NewOpcode = HII->getDotNewOp(MI);
+ int NewOpcode = (RC != &Hexagon::PredRegsRegClass) ? HII->getDotNewOp(MI) :
+ HII->getDotNewPredOp(MI, MBPI);
const MCInstrDesc &D = HII->get(NewOpcode);
MachineInstr *NewMI = MF.CreateMachineInstr(D, DebugLoc());
bool ResourcesAvailable = ResourceTracker->canReserveResources(*NewMI);
@@ -1107,6 +1108,11 @@ static bool cannotCoexistAsymm(const MachineInstr &MI, const MachineInstr &MJ,
HII.isHVXMemWithAIndirect(MI, MJ))
return true;
+  // Don't allow a store together with an instruction that must go in slot0
+  // and does not allow a store in slot1.
+ if (MI.mayStore() && HII.isRestrictNoSlot1Store(MJ) && HII.isPureSlot0(MJ))
+ return true;
+
// An inline asm cannot be together with a branch, because we may not be
// able to remove the asm out after packetizing (i.e. if the asm must be
// moved past the bundle). Similarly, two asms cannot be together to avoid
@@ -1526,6 +1532,13 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
bool IsVecJ = HII->isHVXVec(J);
bool IsVecI = HII->isHVXVec(I);
+ // Don't reorder the loads if there is an order dependence. This would
+ // occur if the first instruction must go in slot0.
+ if (LoadJ && LoadI && HII->isPureSlot0(J)) {
+ FoundSequentialDependence = true;
+ break;
+ }
+
if (Slot1Store && MF.getSubtarget<HexagonSubtarget>().hasV65Ops() &&
((LoadJ && StoreI && !NVStoreI) ||
(StoreJ && LoadI && !NVStoreJ)) &&
@@ -1696,9 +1709,12 @@ HexagonPacketizerList::addToPacket(MachineInstr &MI) {
MachineBasicBlock::iterator MII = MI.getIterator();
MachineBasicBlock *MBB = MI.getParent();
- if (CurrentPacketMIs.empty())
+ if (CurrentPacketMIs.empty()) {
PacketStalls = false;
+ PacketStallCycles = 0;
+ }
PacketStalls |= producesStall(MI);
+ PacketStallCycles = std::max(PacketStallCycles, calcStall(MI));
if (MI.isImplicitDef()) {
// Add to the packet to allow subsequent instructions to be checked
@@ -1818,14 +1834,6 @@ bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr &MI) {
if (Minimal)
return false;
- // Constrainst for not packetizing this MI with existing instructions in a
- // packet.
- // MI is a store instruction.
- // CurrentPacketMIs has a SLOT0 only instruction with constraint
- // A_RESTRICT_NOSLOT1_STORE/isRestrictNoSlot1Store.
- if (MI.mayStore() && isPureSlot0InsnWithNoSlot1Store(MI))
- return false;
-
if (producesStall(MI))
return false;
@@ -1865,25 +1873,8 @@ bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr &MI) {
return true;
}
-bool HexagonPacketizerList::isPureSlot0InsnWithNoSlot1Store(
- const MachineInstr &MI) {
- bool noSlot1Store = false;
- bool isSlot0Only = false;
- for (auto J : CurrentPacketMIs) {
- noSlot1Store |= HII->isRestrictNoSlot1Store(*J);
- isSlot0Only |= HII->isPureSlot0(*J);
- }
-
- return (noSlot1Store && isSlot0Only);
-}
-
// V60 forward scheduling.
-bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
- // If the packet already stalls, then ignore the stall from a subsequent
- // instruction in the same packet.
- if (PacketStalls)
- return false;
-
+unsigned int HexagonPacketizerList::calcStall(const MachineInstr &I) {
// Check whether the previous packet is in a different loop. If this is the
// case, there is little point in trying to avoid a stall because that would
// favor the rare case (loop entry) over the common case (loop iteration).
@@ -1895,10 +1886,12 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
auto *OldBB = OldPacketMIs.front()->getParent();
auto *ThisBB = I.getParent();
if (MLI->getLoopFor(OldBB) != MLI->getLoopFor(ThisBB))
- return false;
+ return 0;
}
SUnit *SUI = MIToSUnit[const_cast<MachineInstr *>(&I)];
+ if (!SUI)
+ return 0;
// If the latency is 0 and there is a data dependence between this
// instruction and any instruction in the current packet, we disregard any
@@ -1927,7 +1920,7 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
if (Pred.getSUnit() == SUJ)
if ((Pred.getLatency() == 0 && Pred.isAssignedRegDep()) ||
HII->isNewValueJump(I) || HII->isToBeScheduledASAP(*J, I))
- return false;
+ return 0;
}
// Check if the latency is greater than one between this instruction and any
@@ -1936,10 +1929,20 @@ bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
SUnit *SUJ = MIToSUnit[J];
for (auto &Pred : SUI->Preds)
if (Pred.getSUnit() == SUJ && Pred.getLatency() > 1)
- return true;
+ return Pred.getLatency();
}
- return false;
+ return 0;
+}
+
+bool HexagonPacketizerList::producesStall(const MachineInstr &I) {
+ unsigned int Latency = calcStall(I);
+ if (Latency == 0)
+ return false;
+  // Ignore the stall unless it is longer than the worst stall already in
+  // the packet.
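+  // For example, if the packet already stalls for two cycles, a new
+  // instruction counts as producing a stall only if it would stall for more
+  // than two cycles.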
+ if (PacketStalls)
+ return Latency > PacketStallCycles;
+ return true;
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index 27a47220570a..6a709e566f86 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -56,6 +56,9 @@ class HexagonPacketizerList : public VLIWPacketizerList {
// Set to true if the packet contains an instruction that stalls with an
// instruction from the previous packet.
bool PacketStalls = false;
+  // Set to the number of stall cycles a given instruction will incur
+  // because of a dependence on an instruction in the previous packet.
+ unsigned int PacketStallCycles = 0;
// Set to true if the packet has a duplex pair of sub-instructions.
bool PacketHasDuplex = false;
@@ -156,7 +159,7 @@ protected:
bool hasRegMaskDependence(const MachineInstr &I, const MachineInstr &J);
bool hasDualStoreDependence(const MachineInstr &I, const MachineInstr &J);
bool producesStall(const MachineInstr &MI);
- bool isPureSlot0InsnWithNoSlot1Store(const MachineInstr &MI);
+ unsigned int calcStall(const MachineInstr &MI);
};
} // end namespace llvm
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index 21386a91c7b3..6aca8d807872 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -443,7 +443,7 @@ auto AlignVectors::createAdjustedPointer(IRBuilder<> &Builder, Value *Ptr,
// we don't need to do pointer casts.
auto *PtrTy = cast<PointerType>(Ptr->getType());
if (!PtrTy->isOpaque()) {
- Type *ElemTy = PtrTy->getElementType();
+ Type *ElemTy = PtrTy->getNonOpaquePointerElementType();
int ElemSize = HVC.getAllocSizeOf(ElemTy);
if (Adjust % ElemSize == 0 && Adjust != 0) {
Value *Tmp0 =
@@ -718,7 +718,7 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
// Maximum alignment present in the whole address group.
const AddrInfo &WithMaxAlign =
- getMaxOf(BaseInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
+ getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.HaveAlign; });
Align MaxGiven = WithMaxAlign.HaveAlign;
// Minimum alignment present in the move address group.
@@ -1181,12 +1181,15 @@ auto HexagonVectorCombine::rescale(IRBuilder<> &Builder, Value *Mask,
int ToCount = (FromCount * FromSize) / ToSize;
assert((FromCount * FromSize) % ToSize == 0);
+ auto *FromITy = IntegerType::get(F.getContext(), FromSize * 8);
+ auto *ToITy = IntegerType::get(F.getContext(), ToSize * 8);
+
// Mask <N x i1> -> sext to <N x FromTy> -> bitcast to <M x ToTy> ->
// -> trunc to <M x i1>.
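+  // For example, rescaling a 64-lane mask from i16 to i8 elements goes
+  // <64 x i1> -> <64 x i16> -> <128 x i8> -> <128 x i1>.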
Value *Ext = Builder.CreateSExt(
- Mask, VectorType::get(FromSTy, FromCount, /*Scalable*/ false));
+ Mask, VectorType::get(FromITy, FromCount, /*Scalable*/ false));
Value *Cast = Builder.CreateBitCast(
- Ext, VectorType::get(ToSTy, ToCount, /*Scalable*/ false));
+ Ext, VectorType::get(ToITy, ToCount, /*Scalable*/ false));
return Builder.CreateTrunc(
Cast, VectorType::get(getBoolTy(), ToCount, /*Scalable*/ false));
}
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
index f973862a0c9b..94b878e21f4d 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp
@@ -659,8 +659,7 @@ void HexagonVectorLoopCarriedReuse::findLoopCarriedDeps() {
delete D;
}
LLVM_DEBUG(dbgs() << "Found " << Dependences.size() << " dependences\n");
- LLVM_DEBUG(for (size_t i = 0; i < Dependences.size();
- ++i) { dbgs() << *Dependences[i] << "\n"; });
+ LLVM_DEBUG(for (const DepChain *D : Dependences) dbgs() << *D << "\n";);
}
Pass *llvm::createHexagonVectorLoopCarriedReuseLegacyPass() {
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
index 96c2965296ca..8a866cfe9161 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -16,6 +16,7 @@
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCShuffler.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
+
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
@@ -65,7 +66,8 @@ void HexagonMCChecker::init() {
void HexagonMCChecker::initReg(MCInst const &MCI, unsigned R, unsigned &PredReg,
bool &isTrue) {
- if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) {
+ if (HexagonMCInstrInfo::isPredicated(MCII, MCI) &&
+ HexagonMCInstrInfo::isPredReg(RI, R)) {
// Note an used predicate register.
PredReg = R;
isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI);
@@ -123,7 +125,7 @@ void HexagonMCChecker::init(MCInst const &MCI) {
// same packet with an instruction that modifies is explicitly. Deal
// with such situations individually.
SoftDefs.insert(R);
- else if (isPredicateRegister(R) &&
+ else if (HexagonMCInstrInfo::isPredReg(RI, R) &&
HexagonMCInstrInfo::isPredicateLate(MCII, MCI))
// Include implicit late predicates.
LatePreds.insert(R);
@@ -167,7 +169,7 @@ void HexagonMCChecker::init(MCInst const &MCI) {
// side-effect, then note as a soft definition.
SoftDefs.insert(*SRI);
else if (HexagonMCInstrInfo::isPredicateLate(MCII, MCI) &&
- isPredicateRegister(*SRI))
+ HexagonMCInstrInfo::isPredReg(RI, *SRI))
// Some insns produce predicates too late to be used in the same packet.
LatePreds.insert(*SRI);
else if (i == 0 && HexagonMCInstrInfo::getType(MCII, MCI) ==
@@ -193,7 +195,7 @@ void HexagonMCChecker::init(MCInst const &MCI) {
if (MCI.getOperand(i).isReg()) {
unsigned P = MCI.getOperand(i).getReg();
- if (isPredicateRegister(P))
+ if (HexagonMCInstrInfo::isPredReg(RI, P))
NewPreds.insert(P);
}
}
@@ -202,7 +204,7 @@ HexagonMCChecker::HexagonMCChecker(MCContext &Context, MCInstrInfo const &MCII,
MCSubtargetInfo const &STI, MCInst &mcb,
MCRegisterInfo const &ri, bool ReportErrors)
: Context(Context), MCB(mcb), RI(ri), MCII(MCII), STI(STI),
- ReportErrors(ReportErrors), ReversePairs() {
+ ReportErrors(ReportErrors) {
init();
}
@@ -210,8 +212,7 @@ HexagonMCChecker::HexagonMCChecker(HexagonMCChecker const &Other,
MCSubtargetInfo const &STI,
bool CopyReportErrors)
: Context(Other.Context), MCB(Other.MCB), RI(Other.RI), MCII(Other.MCII),
- STI(STI), ReportErrors(CopyReportErrors ? Other.ReportErrors : false),
- ReversePairs() {
+ STI(STI), ReportErrors(CopyReportErrors ? Other.ReportErrors : false) {
init();
}
@@ -233,9 +234,10 @@ bool HexagonMCChecker::check(bool FullCheck) {
bool chkHWLoop = checkHWLoop();
bool chkValidTmpDst = FullCheck ? checkValidTmpDst() : true;
bool chkLegalVecRegPair = checkLegalVecRegPair();
+ bool ChkHVXAccum = checkHVXAccum();
bool chk = chkP && chkNV && chkR && chkRRO && chkS && chkSh && chkSl &&
chkAXOK && chkCofMax1 && chkHWLoop && chkValidTmpDst &&
- chkLegalVecRegPair;
+ chkLegalVecRegPair && ChkHVXAccum;
return chk;
}
@@ -274,20 +276,27 @@ static bool isDuplexAGroup(unsigned Opcode) {
}
static bool isNeitherAnorX(MCInstrInfo const &MCII, MCInst const &ID) {
- unsigned Result = 0;
+ if (HexagonMCInstrInfo::isFloat(MCII, ID))
+ return true;
unsigned Type = HexagonMCInstrInfo::getType(MCII, ID);
- if (Type == HexagonII::TypeDUPLEX) {
- unsigned subInst0Opcode = ID.getOperand(0).getInst()->getOpcode();
- unsigned subInst1Opcode = ID.getOperand(1).getInst()->getOpcode();
- Result += !isDuplexAGroup(subInst0Opcode);
- Result += !isDuplexAGroup(subInst1Opcode);
- } else
- Result +=
- Type != HexagonII::TypeALU32_2op && Type != HexagonII::TypeALU32_3op &&
- Type != HexagonII::TypeALU32_ADDI && Type != HexagonII::TypeS_2op &&
- Type != HexagonII::TypeS_3op &&
- (Type != HexagonII::TypeALU64 || HexagonMCInstrInfo::isFloat(MCII, ID));
- return Result != 0;
+ switch (Type) {
+ case HexagonII::TypeALU32_2op:
+ case HexagonII::TypeALU32_3op:
+ case HexagonII::TypeALU32_ADDI:
+ case HexagonII::TypeS_2op:
+ case HexagonII::TypeS_3op:
+ case HexagonII::TypeEXTENDER:
+ case HexagonII::TypeM:
+ case HexagonII::TypeALU64:
+ return false;
+ case HexagonII::TypeSUBINSN: {
+ return !isDuplexAGroup(ID.getOpcode());
+ }
+ case HexagonII::TypeDUPLEX:
+ llvm_unreachable("unexpected duplex instruction");
+ default:
+ return true;
+ }
}
bool HexagonMCChecker::checkAXOK() {
@@ -315,8 +324,7 @@ bool HexagonMCChecker::checkAXOK() {
void HexagonMCChecker::reportBranchErrors() {
for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
- MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I);
- if (Desc.isBranch() || Desc.isCall() || Desc.isReturn())
+ if (HexagonMCInstrInfo::IsABranchingInst(MCII, STI, I))
reportNote(I.getLoc(), "Branching instruction");
}
}
@@ -326,8 +334,7 @@ bool HexagonMCChecker::checkHWLoop() {
!HexagonMCInstrInfo::isOuterLoop(MCB))
return true;
for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
- MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I);
- if (Desc.isBranch() || Desc.isCall() || Desc.isReturn()) {
+ if (HexagonMCInstrInfo::IsABranchingInst(MCII, STI, I)) {
reportError(MCB.getLoc(),
"Branches cannot be in a packet with hardware loops");
reportBranchErrors();
@@ -340,8 +347,7 @@ bool HexagonMCChecker::checkHWLoop() {
bool HexagonMCChecker::checkCOFMax1() {
SmallVector<MCInst const *, 2> BranchLocations;
for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
- MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I);
- if (Desc.isBranch() || Desc.isCall() || Desc.isReturn())
+ if (HexagonMCInstrInfo::IsABranchingInst(MCII, STI, I))
BranchLocations.push_back(&I);
}
for (unsigned J = 0, N = BranchLocations.size(); J < N; ++J) {
@@ -373,18 +379,8 @@ bool HexagonMCChecker::checkCOFMax1() {
}
bool HexagonMCChecker::checkSlots() {
- unsigned slotsUsed = 0;
- for (auto HMI : HexagonMCInstrInfo::bundleInstructions(MCB)) {
- MCInst const &MCI = *HMI.getInst();
- if (HexagonMCInstrInfo::isImmext(MCI))
- continue;
- if (HexagonMCInstrInfo::isDuplex(MCII, MCI))
- slotsUsed += 2;
- else
- ++slotsUsed;
- }
-
- if (slotsUsed > HEXAGON_PACKET_SIZE) {
+ if (HexagonMCInstrInfo::slotsConsumed(MCII, STI, MCB) >
+ HexagonMCInstrInfo::packetSizeSlots(STI)) {
reportError("invalid instruction packet: out of slots");
return false;
}
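The removed loop made the slot accounting explicit: constant extenders occupy no slot, duplexes occupy two, and everything else occupies one, which is what the new slotsConsumed/packetSizeSlots helpers now abstract. A standalone sketch of that accounting, using toy types rather than the real MC layer (all names below are illustrative):

#include <cstdio>
#include <vector>

// Toy stand-ins for the MC-layer queries used by checkSlots().
struct ToyInst {
  bool IsImmext; // constant extender: occupies no slot
  bool IsDuplex; // duplex: occupies two slots
};

static unsigned slotsConsumed(const std::vector<ToyInst> &Bundle) {
  unsigned Slots = 0;
  for (const ToyInst &I : Bundle) {
    if (I.IsImmext)
      continue;
    Slots += I.IsDuplex ? 2 : 1;
  }
  return Slots;
}

int main() {
  const unsigned PacketSizeSlots = 4; // HEXAGON_PACKET_SIZE assumed to be 4
  std::vector<ToyInst> Bundle = {
      {false, false}, {true, false}, {false, true}, {false, false}};
  if (slotsConsumed(Bundle) > PacketSizeSlots)
    std::puts("invalid instruction packet: out of slots");
  else
    std::puts("packet fits");
}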
@@ -424,81 +420,109 @@ bool HexagonMCChecker::checkPredicates() {
// Check legal use of new values.
bool HexagonMCChecker::checkNewValues() {
- for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
- if (!HexagonMCInstrInfo::isNewValue(MCII, I))
+ for (auto const &ConsumerInst :
+ HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
+ if (!HexagonMCInstrInfo::isNewValue(MCII, ConsumerInst))
continue;
- auto Consumer = HexagonMCInstrInfo::predicateInfo(MCII, I);
- bool Branch = HexagonMCInstrInfo::getDesc(MCII, I).isBranch();
- MCOperand const &Op = HexagonMCInstrInfo::getNewValueOperand(MCII, I);
+
+ const HexagonMCInstrInfo::PredicateInfo ConsumerPredInfo =
+ HexagonMCInstrInfo::predicateInfo(MCII, ConsumerInst);
+
+ bool Branch = HexagonMCInstrInfo::getDesc(MCII, ConsumerInst).isBranch();
+ MCOperand const &Op =
+ HexagonMCInstrInfo::getNewValueOperand(MCII, ConsumerInst);
assert(Op.isReg());
- auto Producer = registerProducer(Op.getReg(), Consumer);
- if (std::get<0>(Producer) == nullptr) {
- reportError(I.getLoc(), "New value register consumer has no producer");
+
+ auto Producer = registerProducer(Op.getReg(), ConsumerPredInfo);
+ const MCInst *const ProducerInst = std::get<0>(Producer);
+ const HexagonMCInstrInfo::PredicateInfo ProducerPredInfo =
+ std::get<2>(Producer);
+
+ if (ProducerInst == nullptr) {
+ reportError(ConsumerInst.getLoc(),
+ "New value register consumer has no producer");
return false;
}
if (!RelaxNVChecks) {
// Checks that statically prove correct new value consumption
- if (std::get<2>(Producer).isPredicated() &&
- (!Consumer.isPredicated() ||
- llvm::HexagonMCInstrInfo::getType(MCII, I) == HexagonII::TypeNCJ)) {
+ if (ProducerPredInfo.isPredicated() &&
+ (!ConsumerPredInfo.isPredicated() ||
+ llvm::HexagonMCInstrInfo::getType(MCII, ConsumerInst) ==
+ HexagonII::TypeNCJ)) {
reportNote(
- std::get<0>(Producer)->getLoc(),
+ ProducerInst->getLoc(),
"Register producer is predicated and consumer is unconditional");
- reportError(I.getLoc(),
+ reportError(ConsumerInst.getLoc(),
"Instruction does not have a valid new register producer");
return false;
}
- if (std::get<2>(Producer).Register != Hexagon::NoRegister &&
- std::get<2>(Producer).Register != Consumer.Register) {
- reportNote(std::get<0>(Producer)->getLoc(),
+ if (ProducerPredInfo.Register != Hexagon::NoRegister &&
+ ProducerPredInfo.Register != ConsumerPredInfo.Register) {
+ reportNote(ProducerInst->getLoc(),
"Register producer does not use the same predicate "
"register as the consumer");
- reportError(I.getLoc(),
+ reportError(ConsumerInst.getLoc(),
"Instruction does not have a valid new register producer");
return false;
}
}
- if (std::get<2>(Producer).Register == Consumer.Register &&
- Consumer.PredicatedTrue != std::get<2>(Producer).PredicatedTrue) {
+ if (ProducerPredInfo.Register == ConsumerPredInfo.Register &&
+ ConsumerPredInfo.PredicatedTrue != ProducerPredInfo.PredicatedTrue) {
reportNote(
- std::get<0>(Producer)->getLoc(),
+ ProducerInst->getLoc(),
"Register producer has the opposite predicate sense as consumer");
- reportError(I.getLoc(),
+ reportError(ConsumerInst.getLoc(),
"Instruction does not have a valid new register producer");
return false;
}
- MCInstrDesc const &Desc =
- HexagonMCInstrInfo::getDesc(MCII, *std::get<0>(Producer));
- if (Desc.OpInfo[std::get<1>(Producer)].RegClass ==
+
+ MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, *ProducerInst);
+ const unsigned ProducerOpIndex = std::get<1>(Producer);
+
+ if (Desc.OpInfo[ProducerOpIndex].RegClass ==
Hexagon::DoubleRegsRegClassID) {
- reportNote(std::get<0>(Producer)->getLoc(),
+ reportNote(ProducerInst->getLoc(),
"Double registers cannot be new-value producers");
- reportError(I.getLoc(),
+ reportError(ConsumerInst.getLoc(),
"Instruction does not have a valid new register producer");
return false;
}
- if ((Desc.mayLoad() && std::get<1>(Producer) == 1) ||
- (Desc.mayStore() && std::get<1>(Producer) == 0)) {
- unsigned Mode =
- HexagonMCInstrInfo::getAddrMode(MCII, *std::get<0>(Producer));
+
+ // The ProducerOpIsMemIndex logic checks the index of the producer
+ // register operand. Z-reg load instructions have an implicit operand
+ // that's not encoded, so the producer won't appear as the def at
+ // operand index 1; it will be at index 0.
+ const unsigned ProducerOpSearchIndex =
+ (HexagonMCInstrInfo::getType(MCII, *ProducerInst) ==
+ HexagonII::TypeCVI_ZW)
+ ? 0
+ : 1;
+
+ const bool ProducerOpIsMemIndex =
+ ((Desc.mayLoad() && ProducerOpIndex == ProducerOpSearchIndex) ||
+ (Desc.mayStore() && ProducerOpIndex == 0));
+
+ if (ProducerOpIsMemIndex) {
+ unsigned Mode = HexagonMCInstrInfo::getAddrMode(MCII, *ProducerInst);
+
StringRef ModeError;
if (Mode == HexagonII::AbsoluteSet)
ModeError = "Absolute-set";
if (Mode == HexagonII::PostInc)
ModeError = "Auto-increment";
if (!ModeError.empty()) {
- reportNote(std::get<0>(Producer)->getLoc(),
+ reportNote(ProducerInst->getLoc(),
ModeError + " registers cannot be a new-value "
"producer");
- reportError(I.getLoc(),
+ reportError(ConsumerInst.getLoc(),
"Instruction does not have a valid new register producer");
return false;
}
}
- if (Branch && HexagonMCInstrInfo::isFloat(MCII, *std::get<0>(Producer))) {
- reportNote(std::get<0>(Producer)->getLoc(),
+ if (Branch && HexagonMCInstrInfo::isFloat(MCII, *ProducerInst)) {
+ reportNote(ProducerInst->getLoc(),
"FPU instructions cannot be new-value producers for jumps");
- reportError(I.getLoc(),
+ reportError(ConsumerInst.getLoc(),
"Instruction does not have a valid new register producer");
return false;
}
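The ProducerOpSearchIndex comments above boil down to a small selection rule: ordinary loads produce the new value at operand index 1, Z-register (TypeCVI_ZW) loads produce it at index 0, and stores produce it at index 0. A toy sketch of that rule, with illustrative enum and function names that are not part of the MC API:

#include <cstdio>

enum class ToyKind { Load, ZRegLoad, Store };

// Mirrors the ProducerOpIsMemIndex selection in checkNewValues(): loads
// normally produce the new value at operand index 1, Z-reg loads at index 0,
// and stores at index 0.
static bool producerOpIsMemIndex(ToyKind Kind, unsigned ProducerOpIndex) {
  const bool MayLoad = Kind == ToyKind::Load || Kind == ToyKind::ZRegLoad;
  const bool MayStore = Kind == ToyKind::Store;
  const unsigned SearchIndex = (Kind == ToyKind::ZRegLoad) ? 0 : 1;
  return (MayLoad && ProducerOpIndex == SearchIndex) ||
         (MayStore && ProducerOpIndex == 0);
}

int main() {
  std::printf("ordinary load, op 1: %d\n",
              producerOpIsMemIndex(ToyKind::Load, 1));
  std::printf("Z-reg load, op 0:    %d\n",
              producerOpIsMemIndex(ToyKind::ZRegLoad, 0));
  std::printf("store, op 0:         %d\n",
              producerOpIsMemIndex(ToyKind::Store, 0));
}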
@@ -541,9 +565,11 @@ HexagonMCChecker::registerProducer(
unsigned Register, HexagonMCInstrInfo::PredicateInfo ConsumerPredicate) {
std::tuple<MCInst const *, unsigned, HexagonMCInstrInfo::PredicateInfo>
WrongSense;
+
for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I);
auto ProducerPredicate = HexagonMCInstrInfo::predicateInfo(MCII, I);
+
for (unsigned J = 0, N = Desc.getNumDefs(); J < N; ++J)
for (auto K = MCRegAliasIterator(I.getOperand(J).getReg(), &RI, true);
K.isValid(); ++K)
@@ -568,9 +594,15 @@ void HexagonMCChecker::checkRegisterCurDefs() {
for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
if (HexagonMCInstrInfo::isCVINew(MCII, I) &&
HexagonMCInstrInfo::getDesc(MCII, I).mayLoad()) {
- unsigned Register = I.getOperand(0).getReg();
- if (!registerUsed(Register))
- reportWarning("Register `" + Twine(RI.getName(Register)) +
+ const unsigned RegDef = I.getOperand(0).getReg();
+
+ bool HasRegDefUse = false;
+ for (MCRegAliasIterator Alias(RegDef, &RI, true); Alias.isValid();
+ ++Alias)
+ HasRegDefUse = HasRegDefUse || registerUsed(*Alias);
+
+ if (!HasRegDefUse)
+ reportWarning("Register `" + Twine(RI.getName(RegDef)) +
"' used with `.cur' "
"but not used in the same packet");
}
@@ -599,7 +631,7 @@ bool HexagonMCChecker::checkRegisters() {
reportErrorRegisters(BadR);
return false;
}
- if (!isPredicateRegister(R) && Defs[R].size() > 1) {
+ if (!HexagonMCInstrInfo::isPredReg(RI, R) && Defs[R].size() > 1) {
// Check for multiple register definitions.
PredSet &PM = Defs[R];
@@ -784,3 +816,22 @@ bool HexagonMCChecker::checkLegalVecRegPair() {
}
return true;
}
+
+// Vd.tmp can't be accumulated
+bool HexagonMCChecker::checkHVXAccum() {
+ for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCII, MCB)) {
+ bool IsTarget =
+ HexagonMCInstrInfo::isAccumulator(MCII, I) && I.getOperand(0).isReg();
+ if (!IsTarget)
+ continue;
+ unsigned int R = I.getOperand(0).getReg();
+ TmpDefsIterator It = TmpDefs.find(R);
+ if (It != TmpDefs.end()) {
+ reportError("register `" + Twine(RI.getName(R)) + ".tmp" +
+ "' is accumulated in this packet");
+ return false;
+ }
+ }
+ return true;
+}
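A minimal standalone model of the new check: a packet is rejected when an accumulator instruction targets a register that also carries a `.tmp' definition in the same packet. The container and field names below are illustrative stand-ins for TmpDefs and the MC queries:

#include <cstdio>
#include <set>
#include <vector>

struct ToyInst {
  bool IsAccumulator; // e.g. Vx += ...
  unsigned DstReg;    // register of operand 0
};

// Returns false (and reports) if any accumulator targets a .tmp def.
static bool checkHVXAccum(const std::vector<ToyInst> &Packet,
                          const std::set<unsigned> &TmpDefs) {
  for (const ToyInst &I : Packet) {
    if (!I.IsAccumulator)
      continue;
    if (TmpDefs.count(I.DstReg)) {
      std::printf("register `V%u.tmp' is accumulated in this packet\n",
                  I.DstReg);
      return false;
    }
  }
  return true;
}

int main() {
  std::set<unsigned> TmpDefs = {3};                    // V3.tmp defined
  std::vector<ToyInst> Packet = {{true, 3}, {false, 5}};
  const bool Ok = checkHVXAccum(Packet, TmpDefs);
  std::printf("packet %s\n", Ok ? "ok" : "rejected");
}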
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
index dbd3d8ae45e6..b83931eb88ac 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -81,6 +81,10 @@ class HexagonMCChecker {
void initReg(MCInst const &, unsigned, unsigned &PredReg, bool &isTrue);
bool registerUsed(unsigned Register);
+
+ /// \return a tuple of: pointer to the producer instruction or nullptr if
+ /// none was found, the operand index, and the PredicateInfo for the
+ /// producer.
std::tuple<MCInst const *, unsigned, HexagonMCInstrInfo::PredicateInfo>
registerProducer(unsigned Register,
HexagonMCInstrInfo::PredicateInfo Predicated);
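Given the documented (producer, operand index, predicate info) shape, a caller can pull the three pieces out with std::get, as checkNewValues does. A small self-contained sketch with a stand-in predicate-info type (nothing here is the real MC API):

#include <cstdio>
#include <tuple>

struct ToyPredicateInfo {
  unsigned Register;
  bool PredicatedTrue;
};

// Stand-in for the documented tuple; a null producer pointer means
// "no producer found".
using ProducerResult = std::tuple<const int *, unsigned, ToyPredicateInfo>;

int main() {
  static const int ToyProducerInst = 42;
  ProducerResult R =
      std::make_tuple(&ToyProducerInst, 1u, ToyPredicateInfo{7, true});

  const int *Producer = std::get<0>(R);
  const unsigned OpIndex = std::get<1>(R);
  const ToyPredicateInfo Pred = std::get<2>(R);

  if (Producer == nullptr)
    std::puts("no producer found");
  else
    std::printf("producer op %u, predicate reg %u, sense %d\n", OpIndex,
                Pred.Register, Pred.PredicatedTrue);
}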
@@ -100,14 +104,10 @@ class HexagonMCChecker {
bool checkCOFMax1();
bool checkLegalVecRegPair();
bool checkValidTmpDst();
+ bool checkHVXAccum();
static void compoundRegisterMap(unsigned &);
- bool isPredicateRegister(unsigned R) const {
- return (Hexagon::P0 == R || Hexagon::P1 == R || Hexagon::P2 == R ||
- Hexagon::P3 == R);
- }
-
bool isLoopRegister(unsigned R) const {
return (Hexagon::SA0 == R || Hexagon::LC0 == R || Hexagon::SA1 == R ||
Hexagon::LC1 == R);
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index 33b2e9a9e302..f8ac35aed7c0 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -712,7 +712,6 @@ unsigned
HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
SmallVectorImpl<MCFixup> &Fixups,
MCSubtargetInfo const &STI) const {
-#ifndef NDEBUG
size_t OperandNumber = ~0U;
for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i)
if (&MI.getOperand(i) == &MO) {
@@ -720,7 +719,6 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
break;
}
assert((OperandNumber != ~0U) && "Operand not found");
-#endif
if (HexagonMCInstrInfo::isNewValue(MCII, MI) &&
&MO == &HexagonMCInstrInfo::getNewValueOperand(MCII, MI)) {
@@ -777,9 +775,13 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
assert(!MO.isImm());
if (MO.isReg()) {
unsigned Reg = MO.getReg();
- if (HexagonMCInstrInfo::isSubInstruction(MI) ||
- HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCJ)
+ switch (HexagonMCInstrInfo::getDesc(MCII, MI).OpInfo[OperandNumber].RegClass) {
+ case GeneralSubRegsRegClassID:
+ case GeneralDoubleLow8RegsRegClassID:
return HexagonMCInstrInfo::getDuplexRegisterNumbering(Reg);
+ default:
+ break;
+ }
return MCT.getRegisterInfo()->getEncodingValue(Reg);
}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
index e7ade7834a9f..3deef95df324 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -365,8 +365,10 @@ static bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context,
MCI.begin() + HexagonMCInstrInfo::bundleInstructionsOffset;
B != MCI.end(); ++B) {
MCInst const *Inst = B->getInst();
- if (JumpInst == Inst)
+ if (JumpInst == Inst) {
+ BExtended = false;
continue;
+ }
if (HexagonMCInstrInfo::isImmext(*Inst)) {
BExtended = true;
continue;
@@ -405,24 +407,27 @@ void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo co
if (MCI.size() < 2)
return;
- bool StartedValid = llvm::HexagonMCShuffle(Context, false, MCII, STI, MCI);
-
// Create a vector, needed to keep the order of jump instructions.
MCInst CheckList(MCI);
+ // Keep the last known good bundle around in case the shuffle fails.
+ MCInst LastValidBundle(MCI);
+
+ bool PreviouslyValid = llvm::HexagonMCShuffle(Context, false, MCII, STI, MCI);
+
// Look for compounds until none are found, only update the bundle when
// a compound is found.
while (lookForCompound(MCII, Context, CheckList)) {
- // Keep the original bundle around in case the shuffle fails.
- MCInst OriginalBundle(MCI);
-
// Need to update the bundle.
MCI = CheckList;
- if (StartedValid &&
- !llvm::HexagonMCShuffle(Context, false, MCII, STI, MCI)) {
+ const bool IsValid = llvm::HexagonMCShuffle(Context, false, MCII, STI, MCI);
+ if (PreviouslyValid && !IsValid) {
LLVM_DEBUG(dbgs() << "Found ERROR\n");
- MCI = OriginalBundle;
+ MCI = LastValidBundle;
+ } else if (IsValid) {
+ LastValidBundle = MCI;
+ PreviouslyValid = true;
}
}
}
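The reworked loop keeps the last bundle that shuffled successfully and rolls back to it when a later compounding step breaks the shuffle. The same keep-last-valid pattern in a standalone sketch (the transform and the validity rule below are toys, not HexagonMCShuffle):

#include <cstdio>
#include <string>

static bool isValid(const std::string &Bundle) {
  // Toy validity rule: bundles longer than 6 "instructions" fail to shuffle.
  return Bundle.size() <= 6;
}

int main() {
  std::string Bundle = "abcd";
  std::string LastValidBundle = Bundle;
  bool PreviouslyValid = isValid(Bundle);

  // Repeatedly apply a toy "compounding" transform that grows the bundle.
  for (int Step = 0; Step < 4; ++Step) {
    Bundle += "x";
    const bool Valid = isValid(Bundle);
    if (PreviouslyValid && !Valid) {
      Bundle = LastValidBundle; // roll back to the last good state
      break;
    }
    if (Valid) {
      LastValidBundle = Bundle;
      PreviouslyValid = true;
    }
  }
  std::printf("final bundle: %s\n", Bundle.c_str());
}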
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index e1c95f1cc920..36d6c8c9f84b 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -284,8 +284,6 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
case Hexagon::J2_jumprf:
case Hexagon::J2_jumprtnew:
case Hexagon::J2_jumprfnew:
- case Hexagon::J2_jumprtnewpt:
- case Hexagon::J2_jumprfnewpt:
case Hexagon::PS_jmprett:
case Hexagon::PS_jmpretf:
case Hexagon::PS_jmprettnew:
@@ -303,8 +301,6 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
case Hexagon::L4_return_f:
case Hexagon::L4_return_tnew_pnt:
case Hexagon::L4_return_fnew_pnt:
- case Hexagon::L4_return_tnew_pt:
- case Hexagon::L4_return_fnew_pt:
// [if ([!]p0[.new])] dealloc_return
SrcReg = MCI.getOperand(1).getReg();
if (Hexagon::P0 == SrcReg) {
@@ -699,6 +695,7 @@ inline static void addOps(MCInst &subInstPtr, MCInst const &Inst,
MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
MCInst Result;
+ Result.setLoc(Inst.getLoc());
bool Absolute;
int64_t Value;
switch (Inst.getOpcode()) {
@@ -830,7 +827,6 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
Result.setOpcode(Hexagon::SL2_jumpr31_f);
break; // none SUBInst if (!p0) jumpr r31
case Hexagon::J2_jumprfnew:
- case Hexagon::J2_jumprfnewpt:
case Hexagon::PS_jmpretfnewpt:
case Hexagon::PS_jmpretfnew:
Result.setOpcode(Hexagon::SL2_jumpr31_fnew);
@@ -840,7 +836,6 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
Result.setOpcode(Hexagon::SL2_jumpr31_t);
break; // none SUBInst if (p0) jumpr r31
case Hexagon::J2_jumprtnew:
- case Hexagon::J2_jumprtnewpt:
case Hexagon::PS_jmprettnewpt:
case Hexagon::PS_jmprettnew:
Result.setOpcode(Hexagon::SL2_jumpr31_tnew);
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index 68ccb20f4f15..494b0e6cbac6 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -128,23 +128,28 @@ bool canonicalizePacketImpl(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
bool CheckOk = Check ? Check->check(false) : true;
if (!CheckOk)
return false;
+
+ MCInst OrigMCB = MCB;
+
// Examine the packet and convert pairs of instructions to compound
// instructions when possible.
if (!HexagonDisableCompound)
HexagonMCInstrInfo::tryCompound(MCII, STI, Context, MCB);
HexagonMCShuffle(Context, false, MCII, STI, MCB);
+ const SmallVector<DuplexCandidate, 8> possibleDuplexes =
+ (STI.getFeatureBits()[Hexagon::FeatureDuplex])
+ ? HexagonMCInstrInfo::getDuplexPossibilties(MCII, STI, MCB)
+ : SmallVector<DuplexCandidate, 8>();
+
// Examine the packet and convert pairs of instructions to duplex
// instructions when possible.
- if (STI.getFeatureBits() [Hexagon::FeatureDuplex]) {
- SmallVector<DuplexCandidate, 8> possibleDuplexes;
- possibleDuplexes =
- HexagonMCInstrInfo::getDuplexPossibilties(MCII, STI, MCB);
- HexagonMCShuffle(Context, MCII, STI, MCB, possibleDuplexes);
- }
+ HexagonMCShuffle(Context, MCII, STI, MCB, possibleDuplexes);
+
// Examine the packet and pad it, if needed, when an
// end-loop is in the bundle.
HexagonMCInstrInfo::padEndloop(MCB, Context);
+
// If compounding and duplexing didn't reduce the size to 4 slots or fewer,
// we have a packet that is too big.
if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) {
@@ -156,7 +161,9 @@ bool canonicalizePacketImpl(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
CheckOk = Check ? Check->check(true) : true;
if (!CheckOk)
return false;
+
HexagonMCShuffle(Context, true, MCII, STI, MCB);
+
return true;
}
} // namespace
@@ -857,16 +864,16 @@ bool HexagonMCInstrInfo::isVector(MCInstrInfo const &MCII, MCInst const &MCI) {
}
int64_t HexagonMCInstrInfo::minConstant(MCInst const &MCI, size_t Index) {
- auto Sentinal = static_cast<int64_t>(std::numeric_limits<uint32_t>::max())
+ auto Sentinel = static_cast<int64_t>(std::numeric_limits<uint32_t>::max())
<< 8;
if (MCI.size() <= Index)
- return Sentinal;
+ return Sentinel;
MCOperand const &MCO = MCI.getOperand(Index);
if (!MCO.isExpr())
- return Sentinal;
+ return Sentinel;
int64_t Value;
if (!MCO.getExpr()->evaluateAsAbsolute(Value))
- return Sentinal;
+ return Sentinel;
return Value;
}
@@ -915,10 +922,7 @@ void HexagonMCInstrInfo::padEndloop(MCInst &MCB, MCContext &Context) {
MCInst Nop;
Nop.setOpcode(Hexagon::A2_nop);
assert(isBundle(MCB));
- while ((HexagonMCInstrInfo::isInnerLoop(MCB) &&
- (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_INNER_SIZE)) ||
- ((HexagonMCInstrInfo::isOuterLoop(MCB) &&
- (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_OUTER_SIZE))))
+ while (LoopNeedsPadding(MCB))
MCB.addOperand(MCOperand::createInst(new (Context) MCInst(Nop)));
}
@@ -1030,3 +1034,19 @@ unsigned HexagonMCInstrInfo::SubregisterBit(unsigned Consumer,
return Consumer == Producer;
return 0;
}
+
+bool HexagonMCInstrInfo::LoopNeedsPadding(MCInst const &MCB) {
+ return (
+ (HexagonMCInstrInfo::isInnerLoop(MCB) &&
+ (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_INNER_SIZE)) ||
+ ((HexagonMCInstrInfo::isOuterLoop(MCB) &&
+ (HexagonMCInstrInfo::bundleSize(MCB) < HEXAGON_PACKET_OUTER_SIZE))));
+}
+
+bool HexagonMCInstrInfo::IsABranchingInst(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI,
+ MCInst const &I) {
+ assert(!HexagonMCInstrInfo::isBundle(I));
+ MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, I);
+ return (Desc.isBranch() || Desc.isCall() || Desc.isReturn());
+}
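padEndloop now delegates its termination test to LoopNeedsPadding, so the padding loop is simply "append nops while the predicate holds". A standalone sketch with toy sizes standing in for HEXAGON_PACKET_INNER_SIZE and HEXAGON_PACKET_OUTER_SIZE:

#include <cstdio>
#include <vector>

// Toy stand-ins for the inner/outer endloop minimum packet sizes.
constexpr unsigned PacketInnerSize = 2;
constexpr unsigned PacketOuterSize = 3;

struct ToyBundle {
  bool IsInnerLoop;
  bool IsOuterLoop;
  std::vector<int> Insts; // 0 acts as the toy nop
};

static bool loopNeedsPadding(const ToyBundle &B) {
  return (B.IsInnerLoop && B.Insts.size() < PacketInnerSize) ||
         (B.IsOuterLoop && B.Insts.size() < PacketOuterSize);
}

static void padEndloop(ToyBundle &B) {
  while (loopNeedsPadding(B))
    B.Insts.push_back(0); // append a nop
}

int main() {
  ToyBundle B{/*inner=*/false, /*outer=*/true, {1}};
  padEndloop(B);
  std::printf("padded to %zu instructions\n", B.Insts.size());
}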
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
index 5c56db14798f..f0c4a86fde78 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -65,18 +65,24 @@ public:
namespace HexagonMCInstrInfo {
-size_t const innerLoopOffset = 0;
-int64_t const innerLoopMask = 1 << innerLoopOffset;
+constexpr size_t innerLoopOffset = 0;
+constexpr int64_t innerLoopMask = 1 << innerLoopOffset;
-size_t const outerLoopOffset = 1;
-int64_t const outerLoopMask = 1 << outerLoopOffset;
+constexpr size_t outerLoopOffset = 1;
+constexpr int64_t outerLoopMask = 1 << outerLoopOffset;
// Disable reordering of memory loads/stores within the packet; by default,
// loads and stores may be re-ordered.
-size_t const memReorderDisabledOffset = 2;
-int64_t const memReorderDisabledMask = 1 << memReorderDisabledOffset;
+constexpr size_t memReorderDisabledOffset = 2;
+constexpr int64_t memReorderDisabledMask = 1 << memReorderDisabledOffset;
-size_t const bundleInstructionsOffset = 1;
+constexpr size_t splitNoMemOrderOffset = 3;
+constexpr int64_t splitNoMemorderMask = 1 << splitNoMemOrderOffset;
+
+constexpr size_t noShuffleOffset = 4;
+constexpr int64_t noShuffleMask = 1 << noShuffleOffset;
+
+constexpr size_t bundleInstructionsOffset = 1;
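The header now spells out the bundle-flag bit layout as constexprs: bit 0 inner loop, bit 1 outer loop, bit 2 memory-reorder disabled, bit 3 split-no-memorder, bit 4 no-shuffle. A self-contained sketch composing and testing those bits (the offsets and masks are copied from the definitions above; everything else is illustrative):

#include <cstdint>
#include <cstdio>

// Values mirror the constexpr offsets/masks in HexagonMCInstrInfo.h.
constexpr std::size_t innerLoopOffset = 0;
constexpr std::int64_t innerLoopMask = 1 << innerLoopOffset;
constexpr std::size_t outerLoopOffset = 1;
constexpr std::int64_t outerLoopMask = 1 << outerLoopOffset;
constexpr std::size_t memReorderDisabledOffset = 2;
constexpr std::int64_t memReorderDisabledMask = 1 << memReorderDisabledOffset;
constexpr std::size_t noShuffleOffset = 4;
constexpr std::int64_t noShuffleMask = 1 << noShuffleOffset;

int main() {
  std::int64_t BundleFlags = 0;
  BundleFlags |= innerLoopMask;          // mark an inner endloop
  BundleFlags |= memReorderDisabledMask; // forbid load/store reordering

  std::printf("inner loop: %d, outer loop: %d, no-shuffle: %d\n",
              (BundleFlags & innerLoopMask) != 0,
              (BundleFlags & outerLoopMask) != 0,
              (BundleFlags & noShuffleMask) != 0);
}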
void addConstant(MCInst &MI, uint64_t Value, MCContext &Context);
void addConstExtender(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
@@ -95,6 +101,8 @@ bool canonicalizePacket(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCContext &Context, MCInst &MCB,
HexagonMCChecker *Checker,
bool AttemptCompatibility = false);
+bool IsABranchingInst(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCInst const &I);
// Create a duplex instruction given the two subinsts
MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0,
@@ -307,6 +315,10 @@ bool mustNotExtend(MCExpr const &Expr);
// Returns true if this instruction requires a slot to execute.
bool requiresSlot(MCSubtargetInfo const &STI, MCInst const &MCI);
+
+// Returns true if \a MCB would require endloop padding.
+bool LoopNeedsPadding(MCInst const &MCB);
+
unsigned packetSize(StringRef CPU);
// Returns the maximum number of slots available in the given
@@ -318,8 +330,7 @@ unsigned packetSizeSlots(MCSubtargetInfo const &STI);
unsigned slotsConsumed(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCInst const &MCI);
-
-// Pad the bundle with nops to satisfy endloop requirements
+// Pad the bundle with nops to satisfy endloop requirements.
void padEndloop(MCInst &MCI, MCContext &Context);
class PredicateInfo {
public:
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
index d38b77b42fbc..d96fade71a84 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
@@ -81,10 +81,9 @@ void HexagonMCShuffler::copyTo(MCInst &MCB) {
MCB.addOperand(MCOperand::createImm(BundleFlags));
MCB.setLoc(Loc);
// Copy the results into the bundle.
- for (HexagonShuffler::iterator I = begin(); I != end(); ++I) {
-
- MCInst const &MI = I->getDesc();
- MCInst const *Extender = I->getExtender();
+ for (auto &I : *this) {
+ MCInst const &MI = I.getDesc();
+ MCInst const *Extender = I.getExtender();
if (Extender)
MCB.addOperand(MCOperand::createInst(Extender));
MCB.addOperand(MCOperand::createInst(&MI));
@@ -101,10 +100,10 @@ bool HexagonMCShuffler::reshuffleTo(MCInst &MCB) {
return false;
}
-bool llvm::HexagonMCShuffle(MCContext &Context, bool Fatal,
+bool llvm::HexagonMCShuffle(MCContext &Context, bool ReportErrors,
MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCInst &MCB) {
- HexagonMCShuffler MCS(Context, Fatal, MCII, STI, MCB);
+ HexagonMCShuffler MCS(Context, ReportErrors, MCII, STI, MCB);
if (DisableShuffle)
// Ignore if the user chose to disable shuffling.
@@ -128,11 +127,11 @@ bool llvm::HexagonMCShuffle(MCContext &Context, bool Fatal,
return MCS.reshuffleTo(MCB);
}
-bool
-llvm::HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII,
- MCSubtargetInfo const &STI, MCInst &MCB,
- SmallVector<DuplexCandidate, 8> possibleDuplexes) {
- if (DisableShuffle)
+bool llvm::HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst &MCB,
+ SmallVector<DuplexCandidate, 8> possibleDuplexes) {
+
+ if (DisableShuffle || possibleDuplexes.size() == 0)
return false;
if (!HexagonMCInstrInfo::bundleSize(MCB)) {
@@ -173,10 +172,8 @@ llvm::HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII,
HexagonMCShuffler MCS(Context, false, MCII, STI, MCB);
doneShuffling = MCS.reshuffleTo(MCB); // shuffle
}
- if (!doneShuffling)
- return true;
- return false;
+ return doneShuffling;
}
bool llvm::HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII,
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
index 3410c0ddbd84..4fc8addb27bc 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
@@ -28,16 +28,17 @@ class MCSubtargetInfo;
// Insn bundle shuffler.
class HexagonMCShuffler : public HexagonShuffler {
public:
- HexagonMCShuffler(MCContext &Context, bool Fatal, MCInstrInfo const &MCII,
- MCSubtargetInfo const &STI, MCInst &MCB)
- : HexagonShuffler(Context, Fatal, MCII, STI) {
+ HexagonMCShuffler(MCContext &Context, bool ReportErrors,
+ MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCInst &MCB)
+ : HexagonShuffler(Context, ReportErrors, MCII, STI) {
init(MCB);
}
- HexagonMCShuffler(MCContext &Context, bool Fatal, MCInstrInfo const &MCII,
- MCSubtargetInfo const &STI, MCInst &MCB,
- MCInst const &AddMI, bool InsertAtFront)
- : HexagonShuffler(Context, Fatal, MCII, STI) {
+ HexagonMCShuffler(MCContext &Context, bool ReportErrors,
+ MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCInst &MCB, MCInst const &AddMI, bool InsertAtFront)
+ : HexagonShuffler(Context, ReportErrors, MCII, STI) {
init(MCB, AddMI, InsertAtFront);
}
@@ -52,9 +53,11 @@ private:
void init(MCInst &MCB, MCInst const &AddMI, bool InsertAtFront);
};
-// Invocation of the shuffler.
-bool HexagonMCShuffle(MCContext &Context, bool Fatal, MCInstrInfo const &MCII,
- MCSubtargetInfo const &STI, MCInst &MCB);
+// Invocation of the shuffler. Returns true if the shuffle succeeded. If
+// true, MCB will contain the newly-shuffled packet.
+bool HexagonMCShuffle(MCContext &Context, bool ReportErrors,
+ MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCInst &MCB);
bool HexagonMCShuffle(MCContext &Context, MCInstrInfo const &MCII,
MCSubtargetInfo const &STI, MCInst &MCB,
MCInst const &AddMI, int fixupCount);
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index dfdddb50657c..6a08d7503bac 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -517,6 +517,14 @@ MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT,
return nullptr;
}
+ // Add the qfloat subtarget feature by default to v68 and above,
+ // unless explicitly disabled.
+ if (checkFeature(X, Hexagon::ExtensionHVXV68) &&
+ ArchFS.find("-hvx-qfloat", 0) == std::string::npos) {
+ llvm::FeatureBitset Features = X->getFeatureBits();
+ X->setFeatureBits(Features.set(Hexagon::ExtensionHVXQFloat));
+ }
+
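The default is only applied when the arch feature string does not already carry an explicit "-hvx-qfloat". A standalone sketch of that opt-out test (the feature-string handling here is simplified and not the real SubtargetFeatures parser):

#include <cstdio>
#include <string>

int main() {
  const bool HasHVXV68 = true; // pretend checkFeature(X, ExtensionHVXV68)
  const std::string ArchFS = "+hvxv68,-hvx-qfloat";

  bool EnableQFloat = false;
  if (HasHVXV68 && ArchFS.find("-hvx-qfloat") == std::string::npos)
    EnableQFloat = true; // default qfloat on for v68+ unless disabled

  std::printf("qfloat %s\n", EnableQFloat ? "enabled" : "disabled");
}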
if (HexagonDisableDuplex) {
llvm::FeatureBitset Features = X->getFeatureBits();
X->setFeatureBits(Features.reset(Hexagon::FeatureDuplex));
@@ -551,21 +559,11 @@ void Hexagon_MC::addArchSubtarget(MCSubtargetInfo const *STI,
}
unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
- static std::map<StringRef,unsigned> ElfFlags = {
- {"hexagonv5", ELF::EF_HEXAGON_MACH_V5},
- {"hexagonv55", ELF::EF_HEXAGON_MACH_V55},
- {"hexagonv60", ELF::EF_HEXAGON_MACH_V60},
- {"hexagonv62", ELF::EF_HEXAGON_MACH_V62},
- {"hexagonv65", ELF::EF_HEXAGON_MACH_V65},
- {"hexagonv66", ELF::EF_HEXAGON_MACH_V66},
- {"hexagonv67", ELF::EF_HEXAGON_MACH_V67},
- {"hexagonv67t", ELF::EF_HEXAGON_MACH_V67T},
- {"hexagonv68", ELF::EF_HEXAGON_MACH_V68},
- {"hexagonv69", ELF::EF_HEXAGON_MACH_V69},
- };
+ using llvm::Hexagon::ElfFlagsByCpuStr;
- auto F = ElfFlags.find(STI.getCPU());
- assert(F != ElfFlags.end() && "Unrecognized Architecture");
+ const std::string CPU(STI.getCPU().str());
+ auto F = ElfFlagsByCpuStr.find(CPU);
+ assert(F != ElfFlagsByCpuStr.end() && "Unrecognized Architecture");
return F->second;
}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
index 1fce90b82864..d82731e153fe 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -167,7 +167,8 @@ static bool checkHVXPipes(const HVXInstsT &hvxInsts, unsigned startIdx,
HexagonShuffler::HexagonShuffler(MCContext &Context, bool ReportErrors,
MCInstrInfo const &MCII,
MCSubtargetInfo const &STI)
- : Context(Context), MCII(MCII), STI(STI), ReportErrors(ReportErrors) {
+ : Context(Context), BundleFlags(), MCII(MCII), STI(STI),
+ ReportErrors(ReportErrors), CheckFailure() {
reset();
}
@@ -244,8 +245,8 @@ void HexagonShuffler::restrictNoSlot1Store(
"Instruction does not allow a store in slot 1"));
}
-bool HexagonShuffler::applySlotRestrictions(
- HexagonPacketSummary const &Summary) {
+bool HexagonShuffler::applySlotRestrictions(HexagonPacketSummary const &Summary,
+ const bool DoShuffle) {
// These restrictions can modify the slot masks in the instructions
// in the Packet member. They should run unconditionally and their
// order does not matter.
@@ -262,7 +263,7 @@ bool HexagonShuffler::applySlotRestrictions(
if (!CheckFailure)
restrictBranchOrder(Summary);
if (!CheckFailure)
- restrictPreferSlot3(Summary);
+ restrictPreferSlot3(Summary, DoShuffle);
return !CheckFailure;
}
@@ -303,10 +304,9 @@ void HexagonShuffler::restrictBranchOrder(HexagonPacketSummary const &Summary) {
Packet = PacketSave;
}
- reportError("invalid instruction packet: out of slots");
+ reportResourceError(Summary, "out of slots");
}
-
void HexagonShuffler::permitNonSlot() {
for (HexagonInstr &ISJ : insts()) {
const bool RequiresSlot = HexagonMCInstrInfo::requiresSlot(STI, *ISJ.ID);
@@ -319,21 +319,19 @@ bool HexagonShuffler::ValidResourceUsage(HexagonPacketSummary const &Summary) {
Optional<HexagonPacket> ShuffledPacket = tryAuction(Summary);
if (!ShuffledPacket) {
- reportError("invalid instruction packet: slot error");
+ reportResourceError(Summary, "slot error");
return false;
- } else {
- Packet = *ShuffledPacket;
}
// Verify the CVI slot subscriptions.
- llvm::stable_sort(*this, HexagonInstr::lessCVI);
+ llvm::stable_sort(*ShuffledPacket, HexagonInstr::lessCVI);
// create vector of hvx instructions to check
HVXInstsT hvxInsts;
hvxInsts.clear();
- for (const_iterator I = cbegin(); I != cend(); ++I) {
+ for (const auto &I : *ShuffledPacket) {
struct CVIUnits inst;
- inst.Units = I->CVI.getUnits();
- inst.Lanes = I->CVI.getLanes();
+ inst.Units = I.CVI.getUnits();
+ inst.Lanes = I.CVI.getLanes();
if (inst.Units == 0)
continue; // not an HVX inst, or an HVX inst that doesn't use any pipes
hvxInsts.push_back(inst);
@@ -349,6 +347,9 @@ bool HexagonShuffler::ValidResourceUsage(HexagonPacketSummary const &Summary) {
return false;
}
}
+
+ Packet = *ShuffledPacket;
+
return true;
}
@@ -438,6 +439,15 @@ bool HexagonShuffler::restrictStoreLoadOrder(
return true;
}
+static std::string SlotMaskToText(unsigned SlotMask) {
+ SmallVector<std::string, HEXAGON_PRESHUFFLE_PACKET_SIZE> Slots;
+ for (unsigned SlotNum = 0; SlotNum < HEXAGON_PACKET_SIZE; SlotNum++)
+ if ((SlotMask & (1 << SlotNum)) != 0)
+ Slots.push_back(utostr(SlotNum));
+
+ return llvm::join(Slots, StringRef(", "));
+}
+
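SlotMaskToText renders a slot bitmask as a readable list such as "0, 2, 3" for the new diagnostics. An equivalent self-contained version using only the standard library, with HEXAGON_PACKET_SIZE assumed to be 4 for illustration:

#include <cstdio>
#include <string>
#include <vector>

static std::string slotMaskToText(unsigned SlotMask, unsigned PacketSize = 4) {
  std::vector<std::string> Slots;
  for (unsigned SlotNum = 0; SlotNum < PacketSize; ++SlotNum)
    if ((SlotMask & (1u << SlotNum)) != 0)
      Slots.push_back(std::to_string(SlotNum));

  std::string Out;
  for (std::size_t I = 0; I < Slots.size(); ++I) {
    if (I)
      Out += ", ";
    Out += Slots[I];
  }
  return Out;
}

int main() {
  // Bits 0, 2 and 3 set -> "0, 2, 3".
  std::printf("%s\n", slotMaskToText(0b1101).c_str());
}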
HexagonShuffler::HexagonPacketSummary HexagonShuffler::GetPacketSummary() {
HexagonPacketSummary Summary = HexagonPacketSummary();
@@ -454,8 +464,13 @@ HexagonShuffler::HexagonPacketSummary HexagonShuffler::GetPacketSummary() {
++Summary.pSlot3Cnt;
Summary.PrefSlot3Inst = ISJ;
}
- Summary.ReservedSlotMask |=
+ const unsigned ReservedSlots =
HexagonMCInstrInfo::getOtherReservedSlots(MCII, STI, ID);
+ Summary.ReservedSlotMask |= ReservedSlots;
+ if (ReservedSlots != 0)
+ AppliedRestrictions.push_back(std::make_pair(ID.getLoc(),
+ (Twine("Instruction has reserved slots: ") +
+ SlotMaskToText(ReservedSlots)).str()));
switch (HexagonMCInstrInfo::getType(MCII, ID)) {
case HexagonII::TypeS_2op:
@@ -463,7 +478,8 @@ HexagonShuffler::HexagonPacketSummary HexagonShuffler::GetPacketSummary() {
case HexagonII::TypeALU64:
break;
case HexagonII::TypeJ:
- Summary.branchInsts.push_back(ISJ);
+ if (HexagonMCInstrInfo::IsABranchingInst(MCII, STI, *ISJ->ID))
+ Summary.branchInsts.push_back(ISJ);
break;
case HexagonII::TypeCVI_VM_VP_LDU:
case HexagonII::TypeCVI_VM_LD:
@@ -565,14 +581,15 @@ bool HexagonShuffler::ValidPacketMemoryOps(
return !InvalidPacket;
}
-void HexagonShuffler::restrictPreferSlot3(HexagonPacketSummary const &Summary) {
+void HexagonShuffler::restrictPreferSlot3(HexagonPacketSummary const &Summary,
+ const bool DoShuffle) {
// Flag if an instruction is required to be in slot 3.
const bool HasOnlySlot3 = llvm::any_of(insts(), [&](HexagonInstr const &I) {
return (I.Core.getUnits() == Slot3Mask);
});
- const bool NeedsPrefSlot3Shuffle =
- (Summary.branchInsts.size() <= 1 && !HasOnlySlot3 &&
- Summary.pSlot3Cnt == 1 && Summary.PrefSlot3Inst);
+ const bool NeedsPrefSlot3Shuffle = Summary.branchInsts.size() <= 1 &&
+ !HasOnlySlot3 && Summary.pSlot3Cnt == 1 &&
+ Summary.PrefSlot3Inst && DoShuffle;
if (!NeedsPrefSlot3Shuffle)
return;
@@ -590,9 +607,9 @@ void HexagonShuffler::restrictPreferSlot3(HexagonPacketSummary const &Summary) {
}
/// Check that the packet is legal and enforce relative insn order.
-bool HexagonShuffler::check() {
+bool HexagonShuffler::check(const bool RequireShuffle) {
const HexagonPacketSummary Summary = GetPacketSummary();
- if (!applySlotRestrictions(Summary))
+ if (!applySlotRestrictions(Summary, RequireShuffle))
return false;
if (!ValidPacketMemoryOps(Summary)) {
@@ -600,13 +617,14 @@ bool HexagonShuffler::check() {
return false;
}
- ValidResourceUsage(Summary);
+ if (RequireShuffle)
+ ValidResourceUsage(Summary);
return !CheckFailure;
}
llvm::Optional<HexagonShuffler::HexagonPacket>
-HexagonShuffler::tryAuction(HexagonPacketSummary const &Summary) const {
+HexagonShuffler::tryAuction(HexagonPacketSummary const &Summary) {
HexagonPacket PacketResult = Packet;
HexagonUnitAuction AuctionCore(Summary.ReservedSlotMask);
llvm::stable_sort(PacketResult, HexagonInstr::lessCore);
@@ -635,13 +653,13 @@ bool HexagonShuffler::shuffle() {
if (size() > HEXAGON_PACKET_SIZE) {
// Ignore a packet with more instructions than a packet can hold
// or with compound or duplex insns for now.
- reportError(Twine("invalid instruction packet"));
+ reportError("invalid instruction packet");
return false;
}
// Check and prepare packet.
- bool Ok = true;
- if (size() > 1 && (Ok = check()))
+ bool Ok = check();
+ if (size() > 1 && Ok)
// Reorder the handles for each slot.
for (unsigned nSlot = 0, emptySlots = 0; nSlot < HEXAGON_PACKET_SIZE;
++nSlot) {
@@ -684,6 +702,32 @@ bool HexagonShuffler::shuffle() {
return Ok;
}
+void HexagonShuffler::reportResourceError(HexagonPacketSummary const &Summary, StringRef Err) {
+ if (ReportErrors)
+ reportResourceUsage(Summary);
+ reportError(Twine("invalid instruction packet: ") + Err);
+}
+
+
+void HexagonShuffler::reportResourceUsage(HexagonPacketSummary const &Summary) {
+ auto SM = Context.getSourceManager();
+ if (SM) {
+ for (HexagonInstr const &I : insts()) {
+ const unsigned Units = I.Core.getUnits();
+
+ if (HexagonMCInstrInfo::requiresSlot(STI, *I.ID)) {
+ const std::string UnitsText = Units ? SlotMaskToText(Units) : "<None>";
+ SM->PrintMessage(I.ID->getLoc(), SourceMgr::DK_Note,
+ Twine("Instruction can utilize slots: ") +
+ UnitsText);
+ }
+ else if (!HexagonMCInstrInfo::isImmext(*I.ID))
+ SM->PrintMessage(I.ID->getLoc(), SourceMgr::DK_Note,
+ "Instruction does not require a slot");
+ }
+ }
+}
+
void HexagonShuffler::reportError(Twine const &Msg) {
CheckFailure = true;
if (ReportErrors) {
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
index 1b4ebc5111db..70992e4c7e81 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
@@ -72,16 +72,6 @@ public:
using UnitsAndLanes = std::pair<unsigned, unsigned>;
private:
- // Available HVX slots.
- enum {
- CVI_NONE = 0,
- CVI_XLANE = 1 << 0,
- CVI_SHIFT = 1 << 1,
- CVI_MPY0 = 1 << 2,
- CVI_MPY1 = 1 << 3,
- CVI_ZW = 1 << 4
- };
-
// Count of adjacent slots that the insn requires to be executed.
unsigned Lanes;
// Flag whether the insn is a load or a store.
@@ -177,21 +167,23 @@ protected:
bool ReportErrors;
bool CheckFailure;
std::vector<std::pair<SMLoc, std::string>> AppliedRestrictions;
- bool applySlotRestrictions(HexagonPacketSummary const &Summary);
+
+ bool applySlotRestrictions(HexagonPacketSummary const &Summary,
+ const bool DoShuffle);
void restrictSlot1AOK(HexagonPacketSummary const &Summary);
void restrictNoSlot1Store(HexagonPacketSummary const &Summary);
void restrictNoSlot1();
bool restrictStoreLoadOrder(HexagonPacketSummary const &Summary);
void restrictBranchOrder(HexagonPacketSummary const &Summary);
- void restrictPreferSlot3(HexagonPacketSummary const &Summary);
+ void restrictPreferSlot3(HexagonPacketSummary const &Summary,
+ const bool DoShuffle);
void permitNonSlot();
- Optional<HexagonPacket> tryAuction(HexagonPacketSummary const &Summary) const;
+ Optional<HexagonPacket> tryAuction(HexagonPacketSummary const &Summary);
HexagonPacketSummary GetPacketSummary();
bool ValidPacketMemoryOps(HexagonPacketSummary const &Summary) const;
bool ValidResourceUsage(HexagonPacketSummary const &Summary);
- bool validPacketInsts() const;
public:
using iterator = HexagonPacket::iterator;
@@ -205,7 +197,7 @@ public:
// Reset to initial state.
void reset();
// Check if the bundle may be validly shuffled.
- bool check();
+ bool check(const bool RequireShuffle = true);
// Reorder the insn handles in the bundle.
bool shuffle();
@@ -242,6 +234,8 @@ public:
// Return the error code for the last check or shuffling of the bundle.
void reportError(Twine const &Msg);
+ void reportResourceError(HexagonPacketSummary const &Summary, StringRef Err);
+ void reportResourceUsage(HexagonPacketSummary const &Summary);
};
} // end namespace llvm
diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index a994bd7e57a4..660215ca7435 100644
--- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -141,7 +141,7 @@ struct LanaiOperand : public MCParsedAsmOperand {
struct MemOp Mem;
};
- explicit LanaiOperand(KindTy Kind) : MCParsedAsmOperand(), Kind(Kind) {}
+ explicit LanaiOperand(KindTy Kind) : Kind(Kind) {}
public:
// The functions below are used by the autogenerated ASM matcher and hence to
diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index 0d9e63c112fb..010ff80ad42a 100644
--- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -284,7 +284,7 @@ LanaiTargetLowering::getSingleConstraintMatchWeight(
void LanaiTargetLowering::LowerAsmOperandForConstraint(
SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
- SDValue Result(nullptr, 0);
+ SDValue Result;
// Only support length 1 constraints for now.
if (Constraint.length() > 1)
@@ -511,7 +511,7 @@ SDValue LanaiTargetLowering::LowerCCCArguments(
// the sret argument into rv for the return. Save the argument into
// a virtual register so that we can access it from the return points.
if (MF.getFunction().hasStructRetAttr()) {
- unsigned Reg = LanaiMFI->getSRetReturnReg();
+ Register Reg = LanaiMFI->getSRetReturnReg();
if (!Reg) {
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32));
LanaiMFI->setSRetReturnReg(Reg);
@@ -577,7 +577,7 @@ LanaiTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
if (DAG.getMachineFunction().getFunction().hasStructRetAttr()) {
MachineFunction &MF = DAG.getMachineFunction();
LanaiMachineFunctionInfo *LanaiMFI = MF.getInfo<LanaiMachineFunctionInfo>();
- unsigned Reg = LanaiMFI->getSRetReturnReg();
+ Register Reg = LanaiMFI->getSRetReturnReg();
assert(Reg &&
"SRetReturnReg should have been set in LowerFormalArguments().");
SDValue Val =
@@ -1077,7 +1077,7 @@ SDValue LanaiTargetLowering::LowerRETURNADDR(SDValue Op,
// Return the link register, which contains the return address.
// Mark it an implicit live-in.
- unsigned Reg = MF.addLiveIn(TRI->getRARegister(), getRegClassFor(MVT::i32));
+ Register Reg = MF.addLiveIn(TRI->getRARegister(), getRegClassFor(MVT::i32));
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
diff --git a/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp b/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
index 67443b771d3d..ce79bdafc425 100644
--- a/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
+++ b/llvm/lib/Target/Lanai/LanaiMemAluCombiner.cpp
@@ -412,9 +412,8 @@ bool LanaiMemAluCombiner::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget<LanaiSubtarget>().getInstrInfo();
bool Modified = false;
- for (MfIterator MFI = MF.begin(); MFI != MF.end(); ++MFI) {
- Modified |= combineMemAluInBasicBlock(&*MFI);
- }
+ for (MachineBasicBlock &MBB : MF)
+ Modified |= combineMemAluInBasicBlock(&MBB);
return Modified;
}
} // namespace
diff --git a/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp b/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
index abe20c8e18cf..03cf10205173 100644
--- a/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
@@ -165,7 +165,7 @@ void LanaiRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
if ((isSPLSOpcode(MI.getOpcode()) && !isInt<10>(Offset)) ||
!isInt<16>(Offset)) {
assert(RS && "Register scavenging must be on");
- unsigned Reg = RS->FindUnusedReg(&Lanai::GPRRegClass);
+ Register Reg = RS->FindUnusedReg(&Lanai::GPRRegClass);
if (!Reg)
Reg = RS->scavengeRegister(&Lanai::GPRRegClass, II, SPAdj);
assert(Reg && "Register scavenger failed");
diff --git a/llvm/lib/Target/Lanai/LanaiSubtarget.cpp b/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
index d9d7847a0c5a..37a4843e1bc4 100644
--- a/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
+++ b/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
@@ -43,4 +43,4 @@ LanaiSubtarget::LanaiSubtarget(const Triple &TargetTriple, StringRef Cpu,
CodeGenOpt::Level /*OptLevel*/)
: LanaiGenSubtargetInfo(TargetTriple, Cpu, /*TuneCPU*/ Cpu, FeatureString),
FrameLowering(initializeSubtargetDependencies(Cpu, FeatureString)),
- InstrInfo(), TLInfo(TM, *this), TSInfo() {}
+ TLInfo(TM, *this) {}
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
index 7027d18126bb..d8a66bc8a0da 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
@@ -148,7 +148,7 @@ void LanaiInstPrinter::printInst(const MCInst *MI, uint64_t Address,
void LanaiInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &OS, const char *Modifier) {
- assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+ assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported");
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg())
OS << "%" << getRegisterName(Op.getReg());
diff --git a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
index 4db879c34ad9..dcd581875f60 100644
--- a/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
+++ b/llvm/lib/Target/M68k/AsmParser/M68kAsmParser.cpp
@@ -1,4 +1,4 @@
-//===---- M68kAsmParser.cpp - Parse M68k assembly to MCInst instructions --===//
+//===-- M68kAsmParser.cpp - Parse M68k assembly to MCInst instructions ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
index a08ffa787095..a565ff4e004d 100644
--- a/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
+++ b/llvm/lib/Target/M68k/Disassembler/M68kDisassembler.cpp
@@ -1,4 +1,4 @@
-//===- M68kDisassembler.cpp - Disassembler for M68k -------------*- C++ -*-===//
+//===-- M68kDisassembler.cpp - Disassembler for M68k ------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp b/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp
index 9cd959012e6f..b3d17184f1fe 100644
--- a/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp
+++ b/llvm/lib/Target/M68k/GISel/M68kCallLowering.cpp
@@ -1,4 +1,4 @@
-//===-- M68kCallLowering.cpp - Call lowering -------------------*- C++ -*-===//
+//===-- M68kCallLowering.cpp - Call lowering --------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/GISel/M68kCallLowering.h b/llvm/lib/Target/M68k/GISel/M68kCallLowering.h
index 47cdefdba100..24212e6dd9c6 100644
--- a/llvm/lib/Target/M68k/GISel/M68kCallLowering.h
+++ b/llvm/lib/Target/M68k/GISel/M68kCallLowering.h
@@ -1,4 +1,4 @@
-//===-- M68kCallLowering.h - Call lowering -------------------*- C++ -*-===//
+//===-- M68kCallLowering.h - Call lowering ----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/GISel/M68kInstructionSelector.cpp b/llvm/lib/Target/M68k/GISel/M68kInstructionSelector.cpp
index 9ac4ab9a5ba1..a627eccd110d 100644
--- a/llvm/lib/Target/M68k/GISel/M68kInstructionSelector.cpp
+++ b/llvm/lib/Target/M68k/GISel/M68kInstructionSelector.cpp
@@ -1,4 +1,4 @@
-//===- M68kInstructionSelector.cpp ------------------------------*- C++ -*-===//
+//===-- M68kInstructionSelector.cpp -----------------------------*- C++ -*-===//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
diff --git a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp
index bcbe62816beb..860c0ce29326 100644
--- a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp
+++ b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.cpp
@@ -1,4 +1,4 @@
-//===-- M68kLegalizerInfo.cpp ----------------------------------*- C++ -*-===//
+//===-- M68kLegalizerInfo.cpp -----------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h
index 205aa81aedcc..a10401ed1a9a 100644
--- a/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h
+++ b/llvm/lib/Target/M68k/GISel/M68kLegalizerInfo.h
@@ -1,4 +1,4 @@
-//===- M68kLegalizerInfo --------------------------------------*- C++ -*-==//
+//===-- M68kLegalizerInfo ---------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp
index 5c0f5dae8e37..b6ed6ab28a5d 100644
--- a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp
+++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.cpp
@@ -1,4 +1,4 @@
-//===-- M68kRegisterBankInfo.cpp -------------------------------*- C++ -*-===//
+//===-- M68kRegisterBankInfo.cpp --------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h
index 853c75df2bb3..6c0b8ca7ba5a 100644
--- a/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h
+++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBankInfo.h
@@ -1,4 +1,4 @@
-//===-- M68kRegisterBankInfo.h ---------------------------------*- C++ -*-===//
+//===-- M68kRegisterBankInfo.h ----------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -42,4 +42,4 @@ public:
getInstrMapping(const MachineInstr &MI) const override;
};
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_GLSEL_M68KREGISTERBANKINFO_H
diff --git a/llvm/lib/Target/M68k/GISel/M68kRegisterBanks.td b/llvm/lib/Target/M68k/GISel/M68kRegisterBanks.td
index 942677a60e6c..2a00ec065cd4 100644
--- a/llvm/lib/Target/M68k/GISel/M68kRegisterBanks.td
+++ b/llvm/lib/Target/M68k/GISel/M68kRegisterBanks.td
@@ -1,4 +1,4 @@
-//===-- M68kRegisterBanks.td - Describe the M68k Banks -------*- tablegen -*-===//
+//===-- M68kRegisterBanks.td - Describe the M68k Banks -----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68k.h b/llvm/lib/Target/M68k/M68k.h
index cef40bee7d93..b6069d736deb 100644
--- a/llvm/lib/Target/M68k/M68k.h
+++ b/llvm/lib/Target/M68k/M68k.h
@@ -1,4 +1,4 @@
-//===- M68k.h - Top-level interface for M68k representation -*- C++ -*-===//
+//===-- M68k.h - Top-level interface for M68k representation ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -54,4 +54,4 @@ createM68kInstructionSelector(const M68kTargetMachine &, const M68kSubtarget &,
} // namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_M68K_H
diff --git a/llvm/lib/Target/M68k/M68k.td b/llvm/lib/Target/M68k/M68k.td
index fde491e1b6d5..de7a6c82d110 100644
--- a/llvm/lib/Target/M68k/M68k.td
+++ b/llvm/lib/Target/M68k/M68k.td
@@ -1,4 +1,4 @@
-//===-- M68k.td - Motorola 680x0 target definitions ------*- tablegen -*-===//
+//===-- M68k.td - Motorola 680x0 target definitions --------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
index 08b7153632b4..3bcce9e3ba3b 100644
--- a/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
+++ b/llvm/lib/Target/M68k/M68kAsmPrinter.cpp
@@ -1,4 +1,4 @@
-//===----- M68kAsmPrinter.cpp - M68k LLVM Assembly Printer -----*- C++ -*-===//
+//===-- M68kAsmPrinter.cpp - M68k LLVM Assembly Printer ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kAsmPrinter.h b/llvm/lib/Target/M68k/M68kAsmPrinter.h
index dff3bb876336..1a76e3bf4e27 100644
--- a/llvm/lib/Target/M68k/M68kAsmPrinter.h
+++ b/llvm/lib/Target/M68k/M68kAsmPrinter.h
@@ -1,4 +1,4 @@
-//===----- M68kAsmPrinter.h - M68k LLVM Assembly Printer -------- C++ -*--===//
+//===-- M68kAsmPrinter.h - M68k LLVM Assembly Printer -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -66,4 +66,4 @@ public:
};
} // namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_M68KASMPRINTER_H
diff --git a/llvm/lib/Target/M68k/M68kCallingConv.h b/llvm/lib/Target/M68k/M68kCallingConv.h
index 20ffa993897f..6823df5472df 100644
--- a/llvm/lib/Target/M68k/M68kCallingConv.h
+++ b/llvm/lib/Target/M68k/M68kCallingConv.h
@@ -1,4 +1,4 @@
-//===-- M68kCallingConv.h - M68k Custom CC Routines ---------*- C++ -*-===//
+//===-- M68kCallingConv.h - M68k Custom CC Routines -------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -74,4 +74,4 @@ inline bool CC_M68k_Any_AssignToReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
} // namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_M68KCALLINGCONV_H
diff --git a/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp b/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp
index 4149ae92ffe9..7f0c0dd92dbb 100644
--- a/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp
+++ b/llvm/lib/Target/M68k/M68kCollapseMOVEMPass.cpp
@@ -1,4 +1,4 @@
-//===----- M68kCollapseMOVEMPass.cpp - Expand MOVEM pass --------*- C++ -*-===//
+//===-- M68kCollapseMOVEMPass.cpp - Expand MOVEM pass -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kExpandPseudo.cpp b/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
index 6a4aeaab518a..acfa30f28c2b 100644
--- a/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
+++ b/llvm/lib/Target/M68k/M68kExpandPseudo.cpp
@@ -1,4 +1,4 @@
-//===--M68kExpandPseudo.cpp - Expand pseudo instructions ------*- C++ -*-===//
+//===-- M68kExpandPseudo.cpp - Expand pseudo instructions -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kFrameLowering.cpp b/llvm/lib/Target/M68k/M68kFrameLowering.cpp
index 66ea6ae38f43..643e156f9446 100644
--- a/llvm/lib/Target/M68k/M68kFrameLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kFrameLowering.cpp
@@ -1,4 +1,4 @@
-//===-- M68kFrameLowering.cpp - M68k Frame Information ------*- C++ -*-===//
+//===-- M68kFrameLowering.cpp - M68k Frame Information ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -157,7 +157,7 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
MachineOperand &MO = MBBI->getOperand(i);
if (!MO.isReg() || MO.isDef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
@@ -463,7 +463,7 @@ void M68kFrameLowering::emitPrologueCalleeSavedFrameMoves(
// Calculate offsets.
for (const auto &I : CSI) {
int64_t Offset = MFI.getObjectOffset(I.getFrameIdx());
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
BuildCFI(MBB, MBBI, DL,
@@ -485,7 +485,7 @@ void M68kFrameLowering::emitPrologue(MachineFunction &MF,
uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
bool HasFP = hasFP(MF);
bool NeedsDwarfCFI = MMI.hasDebugInfo() || Fn.needsUnwindTableEntry();
- unsigned FramePtr = TRI->getFrameRegister(MF);
+ Register FramePtr = TRI->getFrameRegister(MF);
const unsigned MachineFramePtr = FramePtr;
unsigned BasePtr = TRI->getBaseRegister();
@@ -683,7 +683,7 @@ void M68kFrameLowering::emitEpilogue(MachineFunction &MF,
DebugLoc DL;
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
- unsigned FramePtr = TRI->getFrameRegister(MF);
+ Register FramePtr = TRI->getFrameRegister(MF);
unsigned MachineFramePtr = FramePtr;
// Get the number of bytes to allocate from the FrameInfo.
@@ -819,7 +819,7 @@ bool M68kFrameLowering::assignCalleeSavedSpillSlots(
// Since emitPrologue and emitEpilogue will handle spilling and restoring of
// the frame register, we can delete it from CSI list and not have to worry
// about avoiding it later.
- unsigned FPReg = TRI->getFrameRegister(MF);
+ Register FPReg = TRI->getFrameRegister(MF);
for (unsigned i = 0, e = CSI.size(); i < e; ++i) {
if (TRI->regsOverlap(CSI[i].getReg(), FPReg)) {
CSI.erase(CSI.begin() + i);
@@ -842,7 +842,7 @@ bool M68kFrameLowering::spillCalleeSavedRegisters(
unsigned Mask = 0;
for (const auto &Info : CSI) {
FI = std::max(FI, Info.getFrameIdx());
- unsigned Reg = Info.getReg();
+ Register Reg = Info.getReg();
unsigned Shift = MRI.getSpillRegisterOrder(Reg);
Mask |= 1 << Shift;
}
@@ -856,7 +856,7 @@ bool M68kFrameLowering::spillCalleeSavedRegisters(
const MachineFunction &MF = *MBB.getParent();
const MachineRegisterInfo &RI = MF.getRegInfo();
for (const auto &Info : CSI) {
- unsigned Reg = Info.getReg();
+ Register Reg = Info.getReg();
bool IsLiveIn = RI.isLiveIn(Reg);
if (!IsLiveIn)
MBB.addLiveIn(Reg);
@@ -877,7 +877,7 @@ bool M68kFrameLowering::restoreCalleeSavedRegisters(
unsigned Mask = 0;
for (const auto &Info : CSI) {
FI = std::max(FI, Info.getFrameIdx());
- unsigned Reg = Info.getReg();
+ Register Reg = Info.getReg();
unsigned Shift = MRI.getSpillRegisterOrder(Reg);
Mask |= 1 << Shift;
}
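The `unsigned Reg` to `Register Reg` changes in this file (and in the other backends further down) move raw register numbers onto LLVM's `Register` wrapper, which still converts to `unsigned` but makes the virtual-versus-physical distinction explicit at the type level. A minimal sketch, assuming the LLVM headers are on the include path; the helper name is made up:

  #include "llvm/CodeGen/Register.h"
  #include <cstdio>

  // Hypothetical helper: classify a register id the way backend code now can.
  static const char *classify(llvm::Register Reg) {
    if (!Reg)                      // converts to unsigned; 0 means "no register"
      return "none";
    return Reg.isVirtual() ? "virtual" : "physical";
  }

  int main() {
    llvm::Register None;                                    // default: no register
    llvm::Register Phys(1);                                 // a physical register number
    llvm::Register Virt = llvm::Register::index2VirtReg(0); // first virtual register
    std::printf("%s %s %s\n", classify(None), classify(Phys), classify(Virt));
  }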
diff --git a/llvm/lib/Target/M68k/M68kFrameLowering.h b/llvm/lib/Target/M68k/M68kFrameLowering.h
index 0eba9e08d858..a5349377232e 100644
--- a/llvm/lib/Target/M68k/M68kFrameLowering.h
+++ b/llvm/lib/Target/M68k/M68kFrameLowering.h
@@ -1,4 +1,4 @@
-//===- M68kFrameLowering.h - Define frame lowering for M68k -*- C++ -*-===//
+//===-- M68kFrameLowering.h - Define frame lowering for M68k ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -169,4 +169,4 @@ public:
};
} // namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_M68KFRAMELOWERING_H
diff --git a/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp b/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp
index 0076c2647df3..9ef97b96ea9a 100644
--- a/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp
+++ b/llvm/lib/Target/M68k/M68kISelDAGToDAG.cpp
@@ -1,4 +1,4 @@
-//===- M68kISelDAGToDAG.cpp - M68k Dag to Dag Inst Selector -*- C++ -*-===//
+//===-- M68kISelDAGToDAG.cpp - M68k Dag to Dag Inst Selector ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp
index 79b395f8f984..dba190a2ebc0 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.cpp
+++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp
@@ -1,4 +1,4 @@
-//===-- M68kISelLowering.cpp - M68k DAG Lowering Impl ------*- C++ -*--===//
+//===-- M68kISelLowering.cpp - M68k DAG Lowering Impl -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -268,7 +268,7 @@ static bool MatchingStackOffset(SDValue Arg, unsigned Offset,
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
- unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+ Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
if (!Register::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
@@ -900,7 +900,7 @@ SDValue M68kTargetLowering::LowerFormalArguments(
else
llvm_unreachable("Unknown argument type!");
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
// If this is an 8 or 16-bit value, it is really passed promoted to 32
@@ -1276,7 +1276,7 @@ bool M68kTargetLowering::IsEligibleForTailCallOptimization(
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc())
continue;
- unsigned Reg = VA.getLocReg();
+ Register Reg = VA.getLocReg();
switch (Reg) {
default:
break;
@@ -1409,32 +1409,32 @@ SDValue M68kTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Arith, SetCC);
}
-/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
-/// according to equal/not-equal condition code \p CC.
+/// Create a BTST (Bit Test) node - Test bit \p BitNo in \p Src and set
+/// condition according to equal/not-equal condition code \p CC.
static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
const SDLoc &DL, SelectionDAG &DAG) {
- // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
+ // If Src is i8, promote it to i32 with any_extend. There is no i8 BTST
// instruction. Since the shift amount is in-range-or-undefined, we know
// that doing a bittest on the i32 value is ok.
if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
// If the operand types disagree, extend the shift amount to match. Since
- // BT ignores high bits (like shifts) we can use anyextend.
+ // BTST ignores high bits (like shifts) we can use anyextend.
if (Src.getValueType() != BitNo.getValueType())
BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
- SDValue BT = DAG.getNode(M68kISD::BT, DL, MVT::i32, Src, BitNo);
+ SDValue BTST = DAG.getNode(M68kISD::BTST, DL, MVT::i32, Src, BitNo);
// NOTE BTST sets CCR.Z flag
M68k::CondCode Cond = CC == ISD::SETEQ ? M68k::COND_NE : M68k::COND_EQ;
return DAG.getNode(M68kISD::SETCC, DL, MVT::i8,
- DAG.getConstant(Cond, DL, MVT::i8), BT);
+ DAG.getConstant(Cond, DL, MVT::i8), BTST);
}
-/// Result of 'and' is compared against zero. Change to a BT node if possible.
-static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &DL,
- SelectionDAG &DAG) {
+/// Result of 'and' is compared against zero. Change to a BTST node if possible.
+static SDValue LowerAndToBTST(SDValue And, ISD::CondCode CC, const SDLoc &DL,
+ SelectionDAG &DAG) {
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
if (Op0.getOpcode() == ISD::TRUNCATE)
@@ -1468,7 +1468,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &DL,
RHS = AndLHS.getOperand(1);
}
- // Use BT if the immediate can't be encoded in a TEST instruction.
+ // Use BTST if the immediate can't be encoded in a TEST instruction.
if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
LHS = AndLHS;
RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), DL, LHS.getValueType());
@@ -1592,8 +1592,8 @@ static unsigned TranslateM68kCC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
}
// Convert (truncate (srl X, N) to i1) to (bt X, N)
-static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC, const SDLoc &DL,
- SelectionDAG &DAG) {
+static SDValue LowerTruncateToBTST(SDValue Op, ISD::CondCode CC,
+ const SDLoc &DL, SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
"Expected TRUNCATE to i1 node");
@@ -1889,14 +1889,14 @@ SDValue M68kTargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned M68kCC,
}
/// Result of 'and' or 'trunc to i1' is compared against zero.
-/// Change to a BT node if possible.
-SDValue M68kTargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
- const SDLoc &DL,
- SelectionDAG &DAG) const {
+/// Change to a BTST node if possible.
+SDValue M68kTargetLowering::LowerToBTST(SDValue Op, ISD::CondCode CC,
+ const SDLoc &DL,
+ SelectionDAG &DAG) const {
if (Op.getOpcode() == ISD::AND)
- return LowerAndToBT(Op, CC, DL, DAG);
+ return LowerAndToBTST(Op, CC, DL, DAG);
if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
- return LowerTruncateToBT(Op, CC, DL, DAG);
+ return LowerTruncateToBTST(Op, CC, DL, DAG);
return SDValue();
}
@@ -1909,14 +1909,14 @@ SDValue M68kTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
- // Optimize to BT if possible.
- // Lower (X & (1 << N)) == 0 to BT(X, N).
- // Lower ((X >>u N) & 1) != 0 to BT(X, N).
- // Lower ((X >>s N) & 1) != 0 to BT(X, N).
- // Lower (trunc (X >> N) to i1) to BT(X, N).
+ // Optimize to BTST if possible.
+ // Lower (X & (1 << N)) == 0 to BTST(X, N).
+ // Lower ((X >>u N) & 1) != 0 to BTST(X, N).
+ // Lower ((X >>s N) & 1) != 0 to BTST(X, N).
+ // Lower (trunc (X >> N) to i1) to BTST(X, N).
if (Op0.hasOneUse() && isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
- if (SDValue NewSetCC = LowerToBT(Op0, CC, DL, DAG)) {
+ if (SDValue NewSetCC = LowerToBTST(Op0, CC, DL, DAG)) {
if (VT == MVT::i1)
return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, NewSetCC);
return NewSetCC;
@@ -2099,7 +2099,7 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
bool IllegalFPCMov = false;
- if ((isM68kLogicalCmp(Cmp) && !IllegalFPCMov) || Opc == M68kISD::BT) {
+ if ((isM68kLogicalCmp(Cmp) && !IllegalFPCMov) || Opc == M68kISD::BTST) {
Cond = Cmp;
addTest = false;
}
@@ -2163,7 +2163,7 @@ SDValue M68kTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
- if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
+ if (SDValue NewSetCC = LowerToBTST(Cond, ISD::SETNE, DL, DAG)) {
CC = NewSetCC.getOperand(0);
Cond = NewSetCC.getOperand(1);
addTest = false;
@@ -2282,7 +2282,7 @@ SDValue M68kTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = Cond.getOperand(1);
unsigned Opc = Cmp.getOpcode();
- if (isM68kLogicalCmp(Cmp) || Opc == M68kISD::BT) {
+ if (isM68kLogicalCmp(Cmp) || Opc == M68kISD::BTST) {
Cond = Cmp;
AddTest = false;
} else {
@@ -2427,7 +2427,7 @@ SDValue M68kTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
// We know the result is compared against zero. Try to match it to BT.
if (Cond.hasOneUse()) {
- if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
+ if (SDValue NewSetCC = LowerToBTST(Cond, ISD::SETNE, DL, DAG)) {
CC = NewSetCC.getOperand(0);
Cond = NewSetCC.getOperand(1);
AddTest = false;
@@ -3101,9 +3101,9 @@ M68kTargetLowering::EmitLoweredSelect(MachineInstr &MI,
// destination registers, and the registers that went into the PHI.
for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
- unsigned DestReg = MIIt->getOperand(0).getReg();
- unsigned Op1Reg = MIIt->getOperand(1).getReg();
- unsigned Op2Reg = MIIt->getOperand(2).getReg();
+ Register DestReg = MIIt->getOperand(0).getReg();
+ Register Op1Reg = MIIt->getOperand(1).getReg();
+ Register Op2Reg = MIIt->getOperand(2).getReg();
// If this CMOV we are generating is the opposite condition from
// the jump we generated, then we have to swap the operands for the
@@ -3211,13 +3211,13 @@ SDValue M68kTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
auto &MRI = MF.getRegInfo();
auto SPTy = getPointerTy(DAG.getDataLayout());
auto *ARClass = getRegClassFor(SPTy);
- unsigned Vreg = MRI.createVirtualRegister(ARClass);
+ Register Vreg = MRI.createVirtualRegister(ARClass);
Chain = DAG.getCopyToReg(Chain, DL, Vreg, Size);
Result = DAG.getNode(M68kISD::SEG_ALLOCA, DL, SPTy, Chain,
DAG.getRegister(Vreg, SPTy));
} else {
auto &TLI = DAG.getTargetLoweringInfo();
- unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
@@ -3391,8 +3391,8 @@ const char *M68kTargetLowering::getTargetNodeName(unsigned Opcode) const {
return "M68kISD::AND";
case M68kISD::CMP:
return "M68kISD::CMP";
- case M68kISD::BT:
- return "M68kISD::BT";
+ case M68kISD::BTST:
+ return "M68kISD::BTST";
case M68kISD::SELECT:
return "M68kISD::SELECT";
case M68kISD::CMOV:
diff --git a/llvm/lib/Target/M68k/M68kISelLowering.h b/llvm/lib/Target/M68k/M68kISelLowering.h
index 6a5a40a8815b..9375a99962eb 100644
--- a/llvm/lib/Target/M68k/M68kISelLowering.h
+++ b/llvm/lib/Target/M68k/M68kISelLowering.h
@@ -1,4 +1,4 @@
-//===-- M68kISelLowering.h - M68k DAG Lowering Interface ----*- C++ -*-===//
+//===-- M68kISelLowering.h - M68k DAG Lowering Interface --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -43,7 +43,7 @@ enum NodeType {
CMP,
/// M68k bit-test instructions.
- BT,
+ BTST,
/// M68k Select
SELECT,
@@ -204,8 +204,8 @@ private:
const CCValAssign &VA, ISD::ArgFlagsTy Flags) const;
SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &DL,
- SelectionDAG &DAG) const;
+ SDValue LowerToBTST(SDValue And, ISD::CondCode CC, const SDLoc &DL,
+ SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -276,4 +276,4 @@ private:
};
} // namespace llvm
-#endif // M68kISELLOWERING_H
+#endif // LLVM_LIB_TARGET_M68K_M68KISELLOWERING_H
diff --git a/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/llvm/lib/Target/M68k/M68kInstrArithmetic.td
index b2c05365d30b..ef50de576641 100644
--- a/llvm/lib/Target/M68k/M68kInstrArithmetic.td
+++ b/llvm/lib/Target/M68k/M68kInstrArithmetic.td
@@ -150,8 +150,7 @@ let mayLoad = 1, mayStore = 1 in {
// FIXME MxBiArOp_FMR/FMI cannot consume CCR from MxAdd/MxSub which leads for
// MxAdd to survive the match and subsequent mismatch.
-class MxBiArOp_FMR<string MN, SDNode NODE, MxType TYPE,
- MxOperand MEMOpd, ComplexPattern MEMPat,
+class MxBiArOp_FMR<string MN, MxType TYPE, MxOperand MEMOpd,
bits<4> CMD, MxEncEA EA, MxEncExt EXT>
: MxInst<(outs), (ins MEMOpd:$dst, TYPE.ROp:$opd),
MN#"."#TYPE.Prefix#"\t$opd, $dst",
@@ -160,8 +159,7 @@ class MxBiArOp_FMR<string MN, SDNode NODE, MxType TYPE,
!cast<MxEncOpMode>("MxOpMode"#TYPE.Size#"EA"#TYPE.RLet),
MxBeadDReg<1>, EA, EXT>>;
-class MxBiArOp_FMI<string MN, SDNode NODE, MxType TYPE,
- MxOperand MEMOpd, ComplexPattern MEMPat,
+class MxBiArOp_FMI<string MN, MxType TYPE, MxOperand MEMOpd,
bits<4> CMD, MxEncEA MEMEA, MxEncExt MEMExt>
: MxInst<(outs), (ins MEMOpd:$dst, TYPE.IOp:$opd),
MN#"."#TYPE.Prefix#"\t$opd, $dst",
@@ -218,47 +216,47 @@ multiclass MxBiArOp_DF<string MN, SDNode NODE, bit isComm,
def NAME#"32di" : MxBiArOp_RFRI_xEA<MN, NODE, MxType32d, CMD>;
// op $reg, $mem
- def NAME#"8pd" : MxBiArOp_FMR<MN, NODE, MxType8d, MxType8.POp, MxType8.PPat,
+ def NAME#"8pd" : MxBiArOp_FMR<MN, MxType8d, MxType8.POp,
CMD, MxEncEAp_0, MxExtI16_0>;
- def NAME#"16pd" : MxBiArOp_FMR<MN, NODE, MxType16d, MxType16.POp, MxType16.PPat,
+ def NAME#"16pd" : MxBiArOp_FMR<MN, MxType16d, MxType16.POp,
CMD, MxEncEAp_0, MxExtI16_0>;
- def NAME#"32pd" : MxBiArOp_FMR<MN, NODE, MxType32d, MxType32.POp, MxType32.PPat,
+ def NAME#"32pd" : MxBiArOp_FMR<MN, MxType32d, MxType32.POp,
CMD, MxEncEAp_0, MxExtI16_0>;
- def NAME#"8fd" : MxBiArOp_FMR<MN, NODE, MxType8d, MxType8.FOp, MxType8.FPat,
+ def NAME#"8fd" : MxBiArOp_FMR<MN, MxType8d, MxType8.FOp,
CMD, MxEncEAf_0, MxExtBrief_0>;
- def NAME#"16fd" : MxBiArOp_FMR<MN, NODE, MxType16d, MxType16.FOp, MxType16.FPat,
+ def NAME#"16fd" : MxBiArOp_FMR<MN, MxType16d, MxType16.FOp,
CMD, MxEncEAf_0, MxExtBrief_0>;
- def NAME#"32fd" : MxBiArOp_FMR<MN, NODE, MxType32d, MxType32.FOp, MxType32.FPat,
+ def NAME#"32fd" : MxBiArOp_FMR<MN, MxType32d, MxType32.FOp,
CMD, MxEncEAf_0, MxExtBrief_0>;
- def NAME#"8jd" : MxBiArOp_FMR<MN, NODE, MxType8d, MxType8.JOp, MxType8.JPat,
+ def NAME#"8jd" : MxBiArOp_FMR<MN, MxType8d, MxType8.JOp,
CMD, MxEncEAj_0, MxExtEmpty>;
- def NAME#"16jd" : MxBiArOp_FMR<MN, NODE, MxType16d, MxType16.JOp, MxType16.JPat,
+ def NAME#"16jd" : MxBiArOp_FMR<MN, MxType16d, MxType16.JOp,
CMD, MxEncEAj_0, MxExtEmpty>;
- def NAME#"32jd" : MxBiArOp_FMR<MN, NODE, MxType32d, MxType32.JOp, MxType32.JPat,
+ def NAME#"32jd" : MxBiArOp_FMR<MN, MxType32d, MxType32.JOp,
CMD, MxEncEAj_0, MxExtEmpty>;
// op $imm, $mem
- def NAME#"8pi" : MxBiArOp_FMI<MN, NODE, MxType8, MxType8.POp, MxType8.PPat,
+ def NAME#"8pi" : MxBiArOp_FMI<MN, MxType8, MxType8.POp,
CMDI, MxEncEAp_0, MxExtI16_0>;
- def NAME#"16pi" : MxBiArOp_FMI<MN, NODE, MxType16, MxType16.POp, MxType16.PPat,
+ def NAME#"16pi" : MxBiArOp_FMI<MN, MxType16, MxType16.POp,
CMDI, MxEncEAp_0, MxExtI16_0>;
- def NAME#"32pi" : MxBiArOp_FMI<MN, NODE, MxType32, MxType32.POp, MxType32.PPat,
+ def NAME#"32pi" : MxBiArOp_FMI<MN, MxType32, MxType32.POp,
CMDI, MxEncEAp_0, MxExtI16_0>;
- def NAME#"8fi" : MxBiArOp_FMI<MN, NODE, MxType8, MxType8.FOp, MxType8.FPat,
+ def NAME#"8fi" : MxBiArOp_FMI<MN, MxType8, MxType8.FOp,
CMDI, MxEncEAf_0, MxExtBrief_0>;
- def NAME#"16fi" : MxBiArOp_FMI<MN, NODE, MxType16, MxType16.FOp, MxType16.FPat,
+ def NAME#"16fi" : MxBiArOp_FMI<MN, MxType16, MxType16.FOp,
CMDI, MxEncEAf_0, MxExtBrief_0>;
- def NAME#"32fi" : MxBiArOp_FMI<MN, NODE, MxType32, MxType32.FOp, MxType32.FPat,
+ def NAME#"32fi" : MxBiArOp_FMI<MN, MxType32, MxType32.FOp,
CMDI, MxEncEAf_0, MxExtBrief_0>;
- def NAME#"8ji" : MxBiArOp_FMI<MN, NODE, MxType8, MxType8.JOp, MxType8.JPat,
+ def NAME#"8ji" : MxBiArOp_FMI<MN, MxType8, MxType8.JOp,
CMDI, MxEncEAj_0, MxExtEmpty>;
- def NAME#"16ji" : MxBiArOp_FMI<MN, NODE, MxType16, MxType16.JOp, MxType16.JPat,
+ def NAME#"16ji" : MxBiArOp_FMI<MN, MxType16, MxType16.JOp,
CMDI, MxEncEAj_0, MxExtEmpty>;
- def NAME#"32ji" : MxBiArOp_FMI<MN, NODE, MxType32, MxType32.JOp, MxType32.JPat,
+ def NAME#"32ji" : MxBiArOp_FMI<MN, MxType32, MxType32.JOp,
CMDI, MxEncEAj_0, MxExtEmpty>;
def NAME#"16dr" : MxBiArOp_RFRR_xEA<MN, NODE, MxType16d, MxType16r,
@@ -284,8 +282,7 @@ multiclass MxBiArOp_DF<string MN, SDNode NODE, bit isComm,
// operations do not produce CCR we should not match them against Mx nodes that
// produce it.
let Pattern = [(null_frag)] in
-multiclass MxBiArOp_AF<string MN, SDNode NODE, bit isComm,
- bits<4> CMD, bits<4> CMDI> {
+multiclass MxBiArOp_AF<string MN, SDNode NODE, bits<4> CMD> {
def NAME#"32ak" : MxBiArOp_RFRM<MN, NODE, MxType32a, MxType32.KOp, MxType32.KPat,
CMD, MxEncEAk, MxExtBrief_2>;
@@ -307,9 +304,9 @@ multiclass MxBiArOp_AF<string MN, SDNode NODE, bit isComm,
// NOTE These naturally produce CCR
defm ADD : MxBiArOp_DF<"add", MxAdd, 1, 0xD, 0x6>;
-defm ADD : MxBiArOp_AF<"adda", MxAdd, 1, 0xD, 0x6>;
+defm ADD : MxBiArOp_AF<"adda", MxAdd, 0xD>;
defm SUB : MxBiArOp_DF<"sub", MxSub, 0, 0x9, 0x4>;
-defm SUB : MxBiArOp_AF<"suba", MxSub, 0, 0x9, 0x4>;
+defm SUB : MxBiArOp_AF<"suba", MxSub, 0x9>;
let Uses = [CCR], Defs = [CCR] in {
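MxBiArOp_FMR and MxBiArOp_FMI lose their NODE and MEMPat parameters (and MxBiArOp_AF its isComm and CMDI) because nothing left in those class bodies references them, so every def site below shrinks to match. A loose C++ analogy of the same cleanup, with made-up names:

  #include <cstdio>

  // Before (sketch): callers had to supply a Node argument nothing used.
  //   template <typename Node, unsigned Cmd> struct BiArOpFMR { ... };
  // After: only the parameters the body actually consumes remain.
  template <unsigned Cmd>
  struct BiArOpFMR {
    static void emit() { std::printf("cmd=0x%X\n", Cmd); }
  };

  int main() {
    BiArOpFMR<0xD>::emit(); // instantiation sites drop the dead argument too
    BiArOpFMR<0x9>::emit();
  }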
diff --git a/llvm/lib/Target/M68k/M68kInstrBits.td b/llvm/lib/Target/M68k/M68kInstrBits.td
index d97ca50f74a9..d610bce5c277 100644
--- a/llvm/lib/Target/M68k/M68kInstrBits.td
+++ b/llvm/lib/Target/M68k/M68kInstrBits.td
@@ -1,4 +1,4 @@
-//===------- M68kInstrBits.td - Bit Manipulation Instrs --*- tablegen -*-===//
+//===-- M68kInstrBits.td - Bit Manipulation Instrs ---------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -12,7 +12,7 @@
///
/// Machine:
///
-/// BCNG [ ] BCLR [ ] BSET [ ] BTST [~]
+/// BCHG [ ] BCLR [ ] BSET [ ] BTST [~]
///
/// Map:
///
@@ -51,24 +51,24 @@ class MxBTSTEnc_I<MxBead8Imm IMM, MxEncEA EA, MxEncExt EXT>
let Defs = [CCR] in {
class MxBTST_RR<MxType TYPE>
: MxInst<(outs), (ins TYPE.ROp:$dst, TYPE.ROp:$bitno), "btst\t$bitno, $dst",
- [(set CCR, (MxBt TYPE.VT:$dst, TYPE.VT:$bitno))],
+ [(set CCR, (MxBtst TYPE.VT:$dst, TYPE.VT:$bitno))],
MxBTSTEnc_R<MxBeadDReg<1>, MxEncEAd_0, MxExtEmpty>>;
class MxBTST_RI<MxType TYPE>
: MxInst<(outs), (ins TYPE.ROp:$dst, TYPE.IOp:$bitno), "btst\t$bitno, $dst",
- [(set CCR, (MxBt TYPE.VT:$dst, TYPE.IPat:$bitno))],
+ [(set CCR, (MxBtst TYPE.VT:$dst, TYPE.IPat:$bitno))],
MxBTSTEnc_I<MxBead8Imm<1>, MxEncEAd_0, MxExtEmpty>>;
class MxBTST_MR<MxType TYPE, MxOperand MEMOpd, ComplexPattern MEMPat,
MxEncEA EA, MxEncExt EXT>
: MxInst<(outs), (ins MEMOpd:$dst, TYPE.ROp:$bitno), "btst\t$bitno, $dst",
- [(set CCR, (MxBt (TYPE.Load MEMPat:$dst), TYPE.VT:$bitno))],
+ [(set CCR, (MxBtst (TYPE.Load MEMPat:$dst), TYPE.VT:$bitno))],
MxBTSTEnc_R<MxBeadDReg<1>, EA, EXT>>;
class MxBTST_MI<MxType TYPE, MxOperand MEMOpd, ComplexPattern MEMPat,
MxEncEA EA, MxEncExt EXT>
: MxInst<(outs), (ins MEMOpd:$dst, TYPE.IOp:$bitno), "btst\t$bitno, $dst",
- [(set CCR, (MxBt (TYPE.Load MEMPat:$dst), TYPE.IPat:$bitno))],
+ [(set CCR, (MxBtst (TYPE.Load MEMPat:$dst), TYPE.IPat:$bitno))],
MxBTSTEnc_I<MxBead8Imm<1>, EA, EXT>>;
} // Defs = [CCR]
diff --git a/llvm/lib/Target/M68k/M68kInstrBuilder.h b/llvm/lib/Target/M68k/M68kInstrBuilder.h
index e32b1b047a2b..e85bd270287c 100644
--- a/llvm/lib/Target/M68k/M68kInstrBuilder.h
+++ b/llvm/lib/Target/M68k/M68kInstrBuilder.h
@@ -1,4 +1,4 @@
-//===-- M68kInstrBuilder.h - Functions to build M68k insts --*- C++ -*-===//
+//===-- M68kInstrBuilder.h - Functions to build M68k insts ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -91,4 +91,4 @@ addMemOperand(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
} // end namespace M68k
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_M6800_M6800INSTRBUILDER_H
+#endif // LLVM_LIB_TARGET_M68K_M68KINSTRBUILDER_H
diff --git a/llvm/lib/Target/M68k/M68kInstrCompiler.td b/llvm/lib/Target/M68k/M68kInstrCompiler.td
index 8fb331dec0e9..2ecf5ca0e6d0 100644
--- a/llvm/lib/Target/M68k/M68kInstrCompiler.td
+++ b/llvm/lib/Target/M68k/M68kInstrCompiler.td
@@ -1,4 +1,4 @@
-//===-- M68kInstrCompiler.td - Pseudos and Patterns ------*- tablegen -*-===//
+//===-- M68kInstrCompiler.td - Pseudos and Patterns --------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kInstrControl.td b/llvm/lib/Target/M68k/M68kInstrControl.td
index 9f87833ab0e2..be9045b6e0d2 100644
--- a/llvm/lib/Target/M68k/M68kInstrControl.td
+++ b/llvm/lib/Target/M68k/M68kInstrControl.td
@@ -1,4 +1,4 @@
-//===-- M68kInstrControl.td - Control Flow Instructions --*- tablegen -*-===//
+//===-- M68kInstrControl.td - Control Flow Instructions ----*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kInstrData.td b/llvm/lib/Target/M68k/M68kInstrData.td
index 40b9e4a2a7fa..3dd5d9f8c7ac 100644
--- a/llvm/lib/Target/M68k/M68kInstrData.td
+++ b/llvm/lib/Target/M68k/M68kInstrData.td
@@ -1,4 +1,4 @@
-//== M68kInstrData.td - M68k Data Movement Instructions -*- tablegen --===//
+//===-- M68kInstrData.td - M68k Data Movement Instructions -*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kInstrFormats.td b/llvm/lib/Target/M68k/M68kInstrFormats.td
index 99b7ffd17971..7e0c96a5b1f6 100644
--- a/llvm/lib/Target/M68k/M68kInstrFormats.td
+++ b/llvm/lib/Target/M68k/M68kInstrFormats.td
@@ -1,4 +1,4 @@
-//=== M68kInstrFormats.td - M68k Instruction Formats ---*- tablegen -*-===//
+//===-- M68kInstrFormats.td - M68k Instruction Formats -----*- tablegen -*-===//
// The LLVM Compiler Infrastructure
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.cpp b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
index 639bcd455687..105c816f9885 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.cpp
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.cpp
@@ -1,4 +1,4 @@
-//===-- M68kInstrInfo.cpp - M68k Instruction Information ----*- C++ -*-===//
+//===-- M68kInstrInfo.cpp - M68k Instruction Information --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -348,8 +348,8 @@ void M68kInstrInfo::AddZExt(MachineBasicBlock &MBB,
bool M68kInstrInfo::ExpandMOVX_RR(MachineInstrBuilder &MIB, MVT MVTDst,
MVT MVTSrc) const {
unsigned Move = MVTDst == MVT::i16 ? M68k::MOV16rr : M68k::MOV32rr;
- unsigned Dst = MIB->getOperand(0).getReg();
- unsigned Src = MIB->getOperand(1).getReg();
+ Register Dst = MIB->getOperand(0).getReg();
+ Register Src = MIB->getOperand(1).getReg();
assert(Dst != Src && "You cannot use the same Regs with MOVX_RR");
@@ -394,8 +394,8 @@ bool M68kInstrInfo::ExpandMOVSZX_RR(MachineInstrBuilder &MIB, bool IsSigned,
else // i32
Move = M68k::MOV32rr;
- unsigned Dst = MIB->getOperand(0).getReg();
- unsigned Src = MIB->getOperand(1).getReg();
+ Register Dst = MIB->getOperand(0).getReg();
+ Register Src = MIB->getOperand(1).getReg();
assert(Dst != Src && "You cannot use the same Regs with MOVSX_RR");
@@ -437,7 +437,7 @@ bool M68kInstrInfo::ExpandMOVSZX_RM(MachineInstrBuilder &MIB, bool IsSigned,
MVT MVTSrc) const {
LLVM_DEBUG(dbgs() << "Expand " << *MIB.getInstr() << " to LOAD and ");
- unsigned Dst = MIB->getOperand(0).getReg();
+ Register Dst = MIB->getOperand(0).getReg();
// We need the subreg of Dst to make instruction verifier happy because the
// real machine instruction consumes and produces values of the same size and
@@ -559,7 +559,7 @@ bool M68kInstrInfo::ExpandMOVEM(MachineInstrBuilder &MIB,
static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
const MCInstrDesc &Desc) {
assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
- unsigned Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB->getOperand(0).getReg();
MIB->setDesc(Desc);
// MachineInstr::addOperand() will insert explicit operands before any
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.h b/llvm/lib/Target/M68k/M68kInstrInfo.h
index 6aced1487365..84d50c181ead 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.h
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.h
@@ -1,4 +1,4 @@
-//===-- M68kInstrInfo.h - M68k Instruction Information ------*- C++ -*-===//
+//===-- M68kInstrInfo.h - M68k Instruction Information ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -336,4 +336,4 @@ public:
} // namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_M68KINSTRINFO_H
diff --git a/llvm/lib/Target/M68k/M68kInstrInfo.td b/llvm/lib/Target/M68k/M68kInstrInfo.td
index ed6cd9ecf442..c581dd91eaaa 100644
--- a/llvm/lib/Target/M68k/M68kInstrInfo.td
+++ b/llvm/lib/Target/M68k/M68kInstrInfo.td
@@ -1,4 +1,4 @@
-//== M68kInstrInfo.td - Main M68k Instruction Definition -*- tablegen -*-=//
+//===-- M68kInstrInfo.td - Main M68k Instruction Definition -*- tablegen -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -137,7 +137,7 @@ def MxSMul : SDNode<"M68kISD::SMUL", MxSDT_BiArithCCROut, [SDNPCommutative]>;
def MxUMul : SDNode<"M68kISD::UMUL", MxSDT_2BiArithCCROut, [SDNPCommutative]>;
def MxCmp : SDNode<"M68kISD::CMP", MxSDT_CmpTest>;
-def MxBt : SDNode<"M68kISD::BT", MxSDT_CmpTest>;
+def MxBtst : SDNode<"M68kISD::BTST", MxSDT_CmpTest>;
def MxCmov : SDNode<"M68kISD::CMOV", MxSDT_Cmov>;
def MxBrCond : SDNode<"M68kISD::BRCOND", MxSDT_BrCond, [SDNPHasChain]>;
@@ -587,8 +587,8 @@ class MxType<ValueType vt, string prefix, string postfix,
// qOp: Supported PCD operand
// qPat: What PCD pattern is used
MxOperand qOp, ComplexPattern qPat,
- // kOp: Supported PCD operand
- // kPat: What PCD pattern is used
+ // kOp: Supported PCI operand
+ // kPat: What PCI pattern is used
MxOperand kOp, ComplexPattern kPat,
// iOp: Supported immediate operand
// iPat: What immediate pattern is used
diff --git a/llvm/lib/Target/M68k/M68kInstrShiftRotate.td b/llvm/lib/Target/M68k/M68kInstrShiftRotate.td
index cab687638076..f1967ec11928 100644
--- a/llvm/lib/Target/M68k/M68kInstrShiftRotate.td
+++ b/llvm/lib/Target/M68k/M68kInstrShiftRotate.td
@@ -1,4 +1,4 @@
-//===------ M68kInstrShiftRotate.td - Logical Instrs -----*- tablegen -*-===//
+//===-- M68kInstrShiftRotate.td - Logical Instrs -----------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kMCInstLower.cpp b/llvm/lib/Target/M68k/M68kMCInstLower.cpp
index f14361559b13..a0b1452ee663 100644
--- a/llvm/lib/Target/M68k/M68kMCInstLower.cpp
+++ b/llvm/lib/Target/M68k/M68kMCInstLower.cpp
@@ -1,4 +1,4 @@
-//===-- M68kMCInstLower.cpp - M68k MachineInstr to MCInst ---*- C++ -*-===//
+//===-- M68kMCInstLower.cpp - M68k MachineInstr to MCInst -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kMCInstLower.h b/llvm/lib/Target/M68k/M68kMCInstLower.h
index d6160629545e..76d9a36f70ef 100644
--- a/llvm/lib/Target/M68k/M68kMCInstLower.h
+++ b/llvm/lib/Target/M68k/M68kMCInstLower.h
@@ -1,4 +1,4 @@
-//===-- M68kMCInstLower.h - Lower MachineInstr to MCInst -----*- C++ -*--===//
+//===-- M68kMCInstLower.h - Lower MachineInstr to MCInst --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -51,4 +51,4 @@ public:
};
} // namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_M68KMCINSTLOWER_H
diff --git a/llvm/lib/Target/M68k/M68kMachineFunction.cpp b/llvm/lib/Target/M68k/M68kMachineFunction.cpp
index 3d048df7ba49..b1e7369116d7 100644
--- a/llvm/lib/Target/M68k/M68kMachineFunction.cpp
+++ b/llvm/lib/Target/M68k/M68kMachineFunction.cpp
@@ -1,4 +1,4 @@
-//===-- M68kMachineFunctionInfo.cpp - M68k private data ----*- C++ -*--===//
+//===-- M68kMachineFunctionInfo.cpp - M68k private data ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kMachineFunction.h b/llvm/lib/Target/M68k/M68kMachineFunction.h
index 5760bdd4b9e3..93c5255199d4 100644
--- a/llvm/lib/Target/M68k/M68kMachineFunction.h
+++ b/llvm/lib/Target/M68k/M68kMachineFunction.h
@@ -1,4 +1,4 @@
-//===-- M68kMachineFunctionInfo.h - M68k private data ---------*- C++ -*-=//
+//===-- M68kMachineFunctionInfo.h - M68k private data -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -111,4 +111,4 @@ private:
} // end of namespace llvm
-#endif // M68K_MACHINE_FUNCTION_INFO_H
+#endif // LLVM_LIB_TARGET_M68K_M68KMACHINEFUNCTION_H
diff --git a/llvm/lib/Target/M68k/M68kRegisterInfo.cpp b/llvm/lib/Target/M68k/M68kRegisterInfo.cpp
index 69d16035b1d9..0cae7ac4e312 100644
--- a/llvm/lib/Target/M68k/M68kRegisterInfo.cpp
+++ b/llvm/lib/Target/M68k/M68kRegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===-- M68kRegisterInfo.cpp - CPU0 Register Information -----*- C++ -*--===//
+//===-- M68kRegisterInfo.cpp - CPU0 Register Information --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kRegisterInfo.h b/llvm/lib/Target/M68k/M68kRegisterInfo.h
index 51b94294772c..7f822e1cb34f 100644
--- a/llvm/lib/Target/M68k/M68kRegisterInfo.h
+++ b/llvm/lib/Target/M68k/M68kRegisterInfo.h
@@ -1,4 +1,4 @@
-//===-- M68kRegisterInfo.h - M68k Register Information Impl --*- C++ --===//
+//===-- M68kRegisterInfo.h - M68k Register Information Impl -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -106,4 +106,4 @@ public:
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_M68KREGISTERINFO_H
diff --git a/llvm/lib/Target/M68k/M68kRegisterInfo.td b/llvm/lib/Target/M68k/M68kRegisterInfo.td
index e2ea2967f75b..49874a2b1099 100644
--- a/llvm/lib/Target/M68k/M68kRegisterInfo.td
+++ b/llvm/lib/Target/M68k/M68kRegisterInfo.td
@@ -1,4 +1,4 @@
-//== M68kRegisterInfo.td - M68k register definitions ----*- tablegen -*-==//
+//==-- M68kRegisterInfo.td - M68k register definitions ------*- tablegen -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kSchedule.td b/llvm/lib/Target/M68k/M68kSchedule.td
index a94cd8f31e2e..6a1bf0c6a020 100644
--- a/llvm/lib/Target/M68k/M68kSchedule.td
+++ b/llvm/lib/Target/M68k/M68kSchedule.td
@@ -1,4 +1,4 @@
-//===-- M68kSchedule.td - M68k Scheduling Definitions --*- tablegen -*-===//
+//===-- M68kSchedule.td - M68k Scheduling Definitions ------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kSubtarget.cpp b/llvm/lib/Target/M68k/M68kSubtarget.cpp
index 991889706e67..ec3830243daf 100644
--- a/llvm/lib/Target/M68k/M68kSubtarget.cpp
+++ b/llvm/lib/Target/M68k/M68kSubtarget.cpp
@@ -1,4 +1,4 @@
-//===-- M68kSubtarget.cpp - M68k Subtarget Information ------*- C++ -*-===//
+//===-- M68kSubtarget.cpp - M68k Subtarget Information ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kSubtarget.h b/llvm/lib/Target/M68k/M68kSubtarget.h
index f45cb7edca1f..9bf2984983a1 100644
--- a/llvm/lib/Target/M68k/M68kSubtarget.h
+++ b/llvm/lib/Target/M68k/M68kSubtarget.h
@@ -1,4 +1,4 @@
-//===-- M68kSubtarget.h - Define Subtarget for the M68k -----*- C++ -*-===//
+//===-- M68kSubtarget.h - Define Subtarget for the M68k ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -11,8 +11,8 @@
///
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_CPU0_M68KSUBTARGET_H
-#define LLVM_LIB_TARGET_CPU0_M68KSUBTARGET_H
+#ifndef LLVM_LIB_TARGET_M68K_M68KSUBTARGET_H
+#define LLVM_LIB_TARGET_M68K_M68KSUBTARGET_H
#include "M68kFrameLowering.h"
#include "M68kISelLowering.h"
@@ -179,4 +179,4 @@ public:
};
} // namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_M68KSUBTARGET_H
diff --git a/llvm/lib/Target/M68k/M68kTargetMachine.cpp b/llvm/lib/Target/M68k/M68kTargetMachine.cpp
index e8126c6219e8..fd21fe6bcea8 100644
--- a/llvm/lib/Target/M68k/M68kTargetMachine.cpp
+++ b/llvm/lib/Target/M68k/M68kTargetMachine.cpp
@@ -1,4 +1,4 @@
-//===-- M68kTargetMachine.cpp - M68k target machine ---------*- C++ -*-===//
+//===-- M68kTargetMachine.cpp - M68k Target Machine -------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kTargetMachine.h b/llvm/lib/Target/M68k/M68kTargetMachine.h
index 34fae8e45504..8dda720774e7 100644
--- a/llvm/lib/Target/M68k/M68kTargetMachine.h
+++ b/llvm/lib/Target/M68k/M68kTargetMachine.h
@@ -1,4 +1,4 @@
-//===-- M68kTargetMachine.h - Define TargetMachine for M68k ----- C++ -===//
+//===-- M68kTargetMachine.h - Define TargetMachine for M68k -----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -53,4 +53,4 @@ public:
};
} // namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_M68KTARGETMACHINE_H
diff --git a/llvm/lib/Target/M68k/M68kTargetObjectFile.cpp b/llvm/lib/Target/M68k/M68kTargetObjectFile.cpp
index 3e26b37e7760..4986d5dbebb9 100644
--- a/llvm/lib/Target/M68k/M68kTargetObjectFile.cpp
+++ b/llvm/lib/Target/M68k/M68kTargetObjectFile.cpp
@@ -1,4 +1,4 @@
-//===-- M68kELFTargetObjectFile.cpp - M68k Object Files -----*- C++ -*-===//
+//===-- M68kELFTargetObjectFile.cpp - M68k Object Files ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/M68kTargetObjectFile.h b/llvm/lib/Target/M68k/M68kTargetObjectFile.h
index dbc5375d5423..80a7d0d6e120 100644
--- a/llvm/lib/Target/M68k/M68kTargetObjectFile.h
+++ b/llvm/lib/Target/M68k/M68kTargetObjectFile.h
@@ -1,4 +1,4 @@
-//===-- M68kELFTargetObjectFile.h - M68k Object Info ---------*- C++ -====//
+//===-- M68kELFTargetObjectFile.h - M68k Object Info ------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -28,4 +28,4 @@ public:
};
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_M68KTARGETOBJECTFILE_H
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
index c1f88fb78ee1..b66557ec6c3a 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
@@ -1,4 +1,4 @@
-//===-- M68kAsmBackend.cpp - M68k Assembler Backend ---------*- C++ -*-===//
+//===-- M68kAsmBackend.cpp - M68k Assembler Backend -------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h
index 7c56cfdf3123..4883f647e214 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h
@@ -1,4 +1,4 @@
-//===-- M68kBaseInfo.h - Top level definitions for M68k MC --*- C++ -*-----===//
+//===-- M68kBaseInfo.h - Top level definitions for M68k MC ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -244,4 +244,4 @@ static inline unsigned getMaskedSpillRegister(unsigned order) {
} // namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KBASEINFO_H
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
index 4c9a3297424d..27f1b3a3fac8 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kELFObjectWriter.cpp
@@ -1,4 +1,4 @@
-//===---------- M68kELFObjectWriter.cpp - M68k ELF Writer ---*- C++ -*-===//
+//===-- M68kELFObjectWriter.cpp - M68k ELF Writer ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kFixupKinds.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kFixupKinds.h
index 2b760dec9e41..54a0e98fea6e 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kFixupKinds.h
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kFixupKinds.h
@@ -1,4 +1,4 @@
-//===-- M68kFixupKinds.h - M68k Specific Fixup Entries ------*- C++ -*-===//
+//===-- M68kFixupKinds.h - M68k Specific Fixup Entries ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -51,4 +51,4 @@ static inline MCFixupKind getFixupForSize(unsigned Size, bool isPCRel) {
} // namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68k_MCTARGETDESC_M68kFIXUPKINDS_H
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
index a2e41437ee21..9ba28622b5b5 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- M68kInstPrinter.cpp - Convert M68k MCInst to asm ----*- C++ -*-===//
+//===-- M68kInstPrinter.cpp - Convert M68k MCInst to asm --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.h
index ec26bc4ddbfd..239268dd7159 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.h
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kInstPrinter.h
@@ -1,4 +1,4 @@
-//===-- M68kInstPrinter.h - Convert M68k MCInst to asm ------*- C++ -*-===//
+//===-- M68kInstPrinter.h - Convert M68k MCInst to asm ----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -166,4 +166,4 @@ private:
};
} // end namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_INSTPRINTER_M68KINSTPRINTER_H
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
index ee2041012bb9..005d2d38f53d 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.cpp
@@ -1,4 +1,4 @@
-//===-- M68kMCAsmInfo.cpp - M68k Asm Properties -------------*- C++ -*-===//
+//===-- M68kMCAsmInfo.cpp - M68k Asm Properties -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h
index b3a58cc61223..873264d88674 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCAsmInfo.h
@@ -1,4 +1,4 @@
-//===-- M68kMCAsmInfo.h - M68k Asm Info --------------------*- C++ -*--===//
+//===-- M68kMCAsmInfo.h - M68k Asm Info -------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -28,4 +28,4 @@ public:
} // namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCASMINFO_H
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp
index 9708abaadf98..9227bd6c3a78 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- M68kMCCodeEmitter.cpp - Convert M68k code emitter ---*- C++ -*-===//
+//===-- M68kMCCodeEmitter.cpp - Convert M68k code emitter -------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h
index 242a1297206a..aba705aa54b6 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.h
@@ -1,4 +1,4 @@
-//===-- M68kMCCodeEmitter.h - M68k Code Emitter ----------------*- C++ -*--===//
+//===-- M68kMCCodeEmitter.h - M68k Code Emitter -----------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -25,4 +25,4 @@ const uint8_t *getMCInstrBeads(unsigned);
} // namespace M68k
} // namespace llvm
-#endif
+#endif // LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCCODEEMITTER_H
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp
index 9f4db895a821..2606e22410fc 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.cpp
@@ -1,4 +1,4 @@
-//===-- M68kMCTargetDesc.cpp - M68k Target Descriptions -----*- C++ -*-===//
+//===-- M68kMCTargetDesc.cpp - M68k Target Descriptions ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h
index a0ebca0ce36c..aa53e13af4fc 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCTargetDesc.h
@@ -1,4 +1,4 @@
-//===-- M68kMCTargetDesc.h - M68k Target Descriptions -------*- C++ -*-===//
+//===-- M68kMCTargetDesc.h - M68k Target Descriptions -----------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -58,4 +58,4 @@ std::unique_ptr<MCObjectTargetWriter> createM68kELFObjectWriter(uint8_t OSABI);
#define GET_SUBTARGETINFO_ENUM
#include "M68kGenSubtargetInfo.inc"
-#endif
+#endif // LLVM_LIB_TARGET_M68K_MCTARGETDESC_M68KMCTARGETDESC_H
diff --git a/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp b/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp
index 2a225b8a43cd..4701f46b0298 100644
--- a/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp
+++ b/llvm/lib/Target/M68k/TargetInfo/M68kTargetInfo.cpp
@@ -1,4 +1,4 @@
-//===-- M68kTargetInfo.cpp - M68k Target Implementation -----*- C++ -*-===//
+//===-- M68kTargetInfo.cpp - M68k Target Implementation ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index c1677baf52a7..13cba8b079a9 100644
--- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -114,13 +114,14 @@ class MSP430Operand : public MCParsedAsmOperand {
public:
MSP430Operand(StringRef Tok, SMLoc const &S)
- : Base(), Kind(k_Tok), Tok(Tok), Start(S), End(S) {}
+ : Kind(k_Tok), Tok(Tok), Start(S), End(S) {}
MSP430Operand(KindTy Kind, unsigned Reg, SMLoc const &S, SMLoc const &E)
- : Base(), Kind(Kind), Reg(Reg), Start(S), End(E) {}
+ : Kind(Kind), Reg(Reg), Start(S), End(E) {}
MSP430Operand(MCExpr const *Imm, SMLoc const &S, SMLoc const &E)
- : Base(), Kind(k_Imm), Imm(Imm), Start(S), End(E) {}
- MSP430Operand(unsigned Reg, MCExpr const *Expr, SMLoc const &S, SMLoc const &E)
- : Base(), Kind(k_Mem), Mem({Reg, Expr}), Start(S), End(E) {}
+ : Kind(k_Imm), Imm(Imm), Start(S), End(E) {}
+ MSP430Operand(unsigned Reg, MCExpr const *Expr, SMLoc const &S,
+ SMLoc const &E)
+ : Kind(k_Mem), Mem({Reg, Expr}), Start(S), End(E) {}
void addRegOperands(MCInst &Inst, unsigned N) const {
assert((Kind == k_Reg || Kind == k_IndReg || Kind == k_PostIndReg) &&
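The MSP430Operand constructors drop the explicit `Base()` entry from their member-initializer lists: a base whose default constructor does all the work is constructed the same way whether or not the entry is spelled out, so it was pure noise. A small illustration with made-up types:

  #include <string>

  struct Base {
    int Tag;
    Base() : Tag(0) {}
  };

  struct Operand : Base {
    std::string Tok;
    // Writing Base() explicitly ...
    explicit Operand(std::string T) : Base(), Tok(std::move(T)) {}
    // ... constructs this base to the same state as omitting it.
    Operand() : Tok("none") {}
  };

  int main() {
    Operand A("add"), B;
    return (A.Tag == 0 && B.Tag == 0) ? 0 : 1;
  }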
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
index 0cdd1f4f701f..bb5351af6523 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFObjectWriter.cpp
@@ -9,7 +9,6 @@
#include "MCTargetDesc/MSP430FixupKinds.h"
#include "MCTargetDesc/MSP430MCTargetDesc.h"
-#include "MCTargetDesc/MSP430MCTargetDesc.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCObjectWriter.h"
diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
index 4ef9a567d453..6a8dc3502496 100644
--- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -190,7 +190,7 @@ bool MSP430FrameLowering::spillCalleeSavedRegisters(
MFI->setCalleeSavedFrameSize(CSI.size() * 2);
for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
// Add the callee-saved register as live-in. It's killed at the spill.
MBB.addLiveIn(Reg);
BuildMI(MBB, MI, DL, TII.get(MSP430::PUSH16r))
diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index c64a44a0ef95..aebfc6b0ae2e 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -705,7 +705,7 @@ SDValue MSP430TargetLowering::LowerCCCArguments(
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
if (Ins[i].Flags.isSRet()) {
- unsigned Reg = FuncInfo->getSRetReturnReg();
+ Register Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
Reg = MF.getRegInfo().createVirtualRegister(
getRegClassFor(MVT::i16));
@@ -772,7 +772,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
if (MF.getFunction().hasStructRetAttr()) {
MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
- unsigned Reg = FuncInfo->getSRetReturnReg();
+ Register Reg = FuncInfo->getSRetReturnReg();
if (!Reg)
llvm_unreachable("sret virtual register not created in entry block");
@@ -1402,12 +1402,12 @@ bool MSP430TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
bool MSP430TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
// MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
- return 0 && Ty1->isIntegerTy(8) && Ty2->isIntegerTy(16);
+ return false && Ty1->isIntegerTy(8) && Ty2->isIntegerTy(16);
}
bool MSP430TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
// MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
- return 0 && VT1 == MVT::i8 && VT2 == MVT::i16;
+ return false && VT1 == MVT::i8 && VT2 == MVT::i16;
}
bool MSP430TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
diff --git a/llvm/lib/Target/MSP430/MSP430Subtarget.cpp b/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
index 2fd58717c4db..0604d47597e2 100644
--- a/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -57,5 +57,5 @@ MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
MSP430Subtarget::MSP430Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
- : MSP430GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), FrameLowering(),
+ : MSP430GenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {}
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 01b5dff2e448..736c41f8ac03 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -827,8 +827,7 @@ private:
} Kind;
public:
- MipsOperand(KindTy K, MipsAsmParser &Parser)
- : MCParsedAsmOperand(), Kind(K), AsmParser(Parser) {}
+ MipsOperand(KindTy K, MipsAsmParser &Parser) : Kind(K), AsmParser(Parser) {}
~MipsOperand() override {
switch (Kind) {
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index bfe413a152b6..a3dbe6f84a1e 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -197,7 +197,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
Value = (int64_t)Value / 2;
// We now check if Value can be encoded as a 26-bit signed immediate.
if (!isInt<26>(Value)) {
- Ctx.reportFatalError(Fixup.getLoc(), "out of range PC26 fixup");
+ Ctx.reportError(Fixup.getLoc(), "out of range PC26 fixup");
return 0;
}
break;
diff --git a/llvm/lib/Target/Mips/Mips.h b/llvm/lib/Target/Mips/Mips.h
index b3faaab436f0..faf58545db62 100644
--- a/llvm/lib/Target/Mips/Mips.h
+++ b/llvm/lib/Target/Mips/Mips.h
@@ -38,6 +38,7 @@ namespace llvm {
FunctionPass *createMicroMipsSizeReducePass();
FunctionPass *createMipsExpandPseudoPass();
FunctionPass *createMipsPreLegalizeCombiner();
+ FunctionPass *createMipsMulMulBugPass();
InstructionSelector *createMipsInstructionSelector(const MipsTargetMachine &,
MipsSubtarget &,
@@ -47,6 +48,7 @@ namespace llvm {
void initializeMipsBranchExpansionPass(PassRegistry &);
void initializeMicroMipsSizeReducePass(PassRegistry &);
void initializeMipsPreLegalizerCombinerPass(PassRegistry&);
+ void initializeMipsMulMulBugFixPass(PassRegistry&);
} // end namespace llvm;
#endif
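The new createMipsMulMulBugPass and initializeMipsMulMulBugFixPass declarations follow the usual pattern for wiring a backend MachineFunctionPass into a target: a create function returns a fresh instance for the target machine's pass pipeline, and an initialize function registers it with the PassRegistry. A generic skeleton of that pattern, not the actual MipsMulMulBugFix code; the names are placeholders and the INITIALIZE_PASS registration boilerplate is omitted:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineFunctionPass.h"

  using namespace llvm;

  namespace {
  // Placeholder pass: visits each machine function and reports no change.
  struct DemoMachinePass : MachineFunctionPass {
    static char ID;
    DemoMachinePass() : MachineFunctionPass(ID) {}
    StringRef getPassName() const override { return "Demo machine pass"; }
    bool runOnMachineFunction(MachineFunction &MF) override {
      (void)MF;
      return false; // nothing modified
    }
  };
  } // end anonymous namespace

  char DemoMachinePass::ID = 0;

  // The create function a target's Mips.h-style header would declare and the
  // target machine's pass setup would call.
  FunctionPass *createDemoMachinePass() { return new DemoMachinePass(); }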
diff --git a/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
index 622f2039f9e4..4f4e3f3f2ed7 100644
--- a/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -74,7 +74,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
for (const CalleeSavedInfo &I : CSI) {
int64_t Offset = MFI.getObjectOffset(I.getFrameIdx());
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
unsigned DReg = MRI->getDwarfRegNum(Reg, true);
unsigned CFIIndex = MF.addFrameInst(
MCCFIInstruction::createOffset(nullptr, DReg, Offset));
@@ -124,7 +124,7 @@ bool Mips16FrameLowering::spillCalleeSavedRegisters(
// method MipsTargetLowering::lowerRETURNADDR.
// It's killed at the spill, unless the register is RA and return address
// is taken.
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA)
&& MF->getFrameInfo().isReturnAddressTaken();
if (!IsRAAndRetAddrIsTaken)
diff --git a/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index ddd28d095e51..50147c019bfd 100644
--- a/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -121,7 +121,7 @@ bool Mips16DAGToDAGISel::selectAddr(bool SPAllowed, SDValue Addr, SDValue &Base,
}
// Addresses of the form FI+const or FI|const
if (CurDAG->isBaseWithConstantOffset(Addr)) {
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+ auto *CN = cast<ConstantSDNode>(Addr.getOperand(1));
if (isInt<16>(CN->getSExtValue())) {
// If the first operand is a FI, get the TargetFI Node
if (SPAllowed) {
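The Mips16 address-selection change swaps `dyn_cast<ConstantSDNode>` for `cast<ConstantSDNode>`: `isBaseWithConstantOffset` already guarantees operand 1 is a constant, and `cast` asserts that instead of returning a null pointer the old code never checked. The difference between the two casts, shown on a toy hierarchy wired into LLVM's casting machinery (the Node/Leaf types are illustrative only):

  #include "llvm/Support/Casting.h"
  #include <cassert>

  struct Node {
    enum NodeKind { NK_Leaf, NK_Other };
    NodeKind Kind;
    Node(NodeKind K) : Kind(K) {}
  };

  struct Leaf : Node {
    Leaf() : Node(NK_Leaf) {}
    static bool classof(const Node *N) { return N->Kind == NK_Leaf; }
  };

  int main() {
    Leaf L;
    Node *N = &L;
    // dyn_cast is a checked cast: it yields nullptr on a kind mismatch,
    // so callers are expected to test the result.
    if (Leaf *P = llvm::dyn_cast<Leaf>(N))
      assert(P == &L);
    // cast assumes the type is already known to match (asserting in
    // +Asserts builds), which is the situation in selectAddr above.
    Leaf *Q = llvm::cast<Leaf>(N);
    assert(Q == &L);
    return 0;
  }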
diff --git a/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
index 136612c59d96..78ffe00c020c 100644
--- a/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -451,7 +451,7 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
// So for now we always save S2. The optimization will be done
// in a follow-on patch.
//
- if (1 || (Signature->RetSig != Mips16HardFloatInfo::NoFPRet))
+ if (true || (Signature->RetSig != Mips16HardFloatInfo::NoFPRet))
FuncInfo->setSaveS2();
}
// one more look at list of intrinsics
diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
index 3403ec01aef2..02d0e770ba66 100644
--- a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -190,7 +190,7 @@ static void addSaveRestoreRegs(MachineInstrBuilder &MIB,
// method MipsTargetLowering::lowerRETURNADDR.
// It's killed at the spill, unless the register is RA and return address
// is taken.
- unsigned Reg = CSI[e-i-1].getReg();
+ Register Reg = CSI[e-i-1].getReg();
switch (Reg) {
case Mips::RA:
case Mips::S0:
diff --git a/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp b/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
index f6f43da9abf8..563118dfe627 100644
--- a/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -37,7 +37,7 @@ using namespace llvm;
#define DEBUG_TYPE "mips16-registerinfo"
-Mips16RegisterInfo::Mips16RegisterInfo() : MipsRegisterInfo() {}
+Mips16RegisterInfo::Mips16RegisterInfo() {}
bool Mips16RegisterInfo::requiresRegisterScavenging
(const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp
index ae2b83c414db..33da0ff31be8 100644
--- a/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp
+++ b/llvm/lib/Target/Mips/MipsAnalyzeImmediate.cpp
@@ -25,8 +25,8 @@ void MipsAnalyzeImmediate::AddInstr(InstSeqLs &SeqLs, const Inst &I) {
return;
}
- for (InstSeqLs::iterator Iter = SeqLs.begin(); Iter != SeqLs.end(); ++Iter)
- Iter->push_back(I);
+ for (auto &S : SeqLs)
+ S.push_back(I);
}
void MipsAnalyzeImmediate::GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize,
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 5d026785b921..4bd8845e9cb9 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -80,13 +80,9 @@ bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
MipsFI = MF.getInfo<MipsFunctionInfo>();
if (Subtarget->inMips16Mode())
- for (std::map<
- const char *,
- const Mips16HardFloatInfo::FuncSignature *>::const_iterator
- it = MipsFI->StubsNeeded.begin();
- it != MipsFI->StubsNeeded.end(); ++it) {
- const char *Symbol = it->first;
- const Mips16HardFloatInfo::FuncSignature *Signature = it->second;
+ for (const auto &I : MipsFI->StubsNeeded) {
+ const char *Symbol = I.first;
+ const Mips16HardFloatInfo::FuncSignature *Signature = I.second;
if (StubsNeeded.find(Symbol) == StubsNeeded.end())
StubsNeeded[Symbol] = Signature;
}
@@ -341,7 +337,7 @@ void MipsAsmPrinter::printSavedRegsBitmask() {
unsigned CSFPRegsSize = 0;
for (const auto &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
unsigned RegNum = TRI->getEncodingValue(Reg);
// If it's a floating point register, set the FPU Bitmask.
@@ -1279,11 +1275,11 @@ void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) {
// Align all blocks that are jumped to through jump table.
if (MachineJumpTableInfo *JtInfo = MF.getJumpTableInfo()) {
const std::vector<MachineJumpTableEntry> &JT = JtInfo->getJumpTables();
- for (unsigned I = 0; I < JT.size(); ++I) {
- const std::vector<MachineBasicBlock*> &MBBs = JT[I].MBBs;
+ for (const auto &I : JT) {
+ const std::vector<MachineBasicBlock *> &MBBs = I.MBBs;
- for (unsigned J = 0; J < MBBs.size(); ++J)
- MBBs[J]->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
+ for (MachineBasicBlock *MBB : MBBs)
+ MBB->setAlignment(MIPS_NACL_BUNDLE_ALIGN);
}
}
diff --git a/llvm/lib/Target/Mips/MipsCallLowering.h b/llvm/lib/Target/Mips/MipsCallLowering.h
index 1d1406da3201..9f114d55db4c 100644
--- a/llvm/lib/Target/Mips/MipsCallLowering.h
+++ b/llvm/lib/Target/Mips/MipsCallLowering.h
@@ -18,7 +18,6 @@
namespace llvm {
-class MachineMemOperand;
class MipsTargetLowering;
class MipsCallLowering : public CallLowering {
diff --git a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
index 491d379bfe0b..1efbf5570287 100644
--- a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -604,9 +604,9 @@ MipsConstantIslands::CPEntry
std::vector<CPEntry> &CPEs = CPEntries[CPI];
// Number of entries per constpool index should be small, just do a
// linear search.
- for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
- if (CPEs[i].CPEMI == CPEMI)
- return &CPEs[i];
+ for (CPEntry &CPE : CPEs) {
+ if (CPE.CPEMI == CPEMI)
+ return &CPE;
}
return nullptr;
}
@@ -1052,27 +1052,27 @@ int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
// No. Look for previously created clones of the CPE that are in range.
unsigned CPI = CPEMI->getOperand(1).getIndex();
std::vector<CPEntry> &CPEs = CPEntries[CPI];
- for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+ for (CPEntry &CPE : CPEs) {
// We already tried this one
- if (CPEs[i].CPEMI == CPEMI)
+ if (CPE.CPEMI == CPEMI)
continue;
// Removing CPEs can leave empty entries, skip
- if (CPEs[i].CPEMI == nullptr)
+ if (CPE.CPEMI == nullptr)
continue;
- if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
- U.NegOk)) {
- LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
- << CPEs[i].CPI << "\n");
+ if (isCPEntryInRange(UserMI, UserOffset, CPE.CPEMI, U.getMaxDisp(),
+ U.NegOk)) {
+ LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" << CPE.CPI
+ << "\n");
// Point the CPUser node to the replacement
- U.CPEMI = CPEs[i].CPEMI;
+ U.CPEMI = CPE.CPEMI;
// Change the CPI in the instruction operand to refer to the clone.
for (MachineOperand &MO : UserMI->operands())
if (MO.isCPI()) {
- MO.setIndex(CPEs[i].CPI);
+ MO.setIndex(CPE.CPI);
break;
}
// Adjust the refcount of the clone...
- CPEs[i].RefCount++;
+ CPE.RefCount++;
// ...and the original. If we didn't remove the old entry, none of the
// addresses changed, so we don't need another pass.
return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
@@ -1108,27 +1108,27 @@ int MipsConstantIslands::findLongFormInRangeCPEntry
// No. Look for previously created clones of the CPE that are in range.
unsigned CPI = CPEMI->getOperand(1).getIndex();
std::vector<CPEntry> &CPEs = CPEntries[CPI];
- for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+ for (CPEntry &CPE : CPEs) {
// We already tried this one
- if (CPEs[i].CPEMI == CPEMI)
+ if (CPE.CPEMI == CPEMI)
continue;
// Removing CPEs can leave empty entries, skip
- if (CPEs[i].CPEMI == nullptr)
+ if (CPE.CPEMI == nullptr)
continue;
- if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI,
- U.getLongFormMaxDisp(), U.NegOk)) {
- LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
- << CPEs[i].CPI << "\n");
+ if (isCPEntryInRange(UserMI, UserOffset, CPE.CPEMI, U.getLongFormMaxDisp(),
+ U.NegOk)) {
+ LLVM_DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#" << CPE.CPI
+ << "\n");
// Point the CPUser node to the replacement
- U.CPEMI = CPEs[i].CPEMI;
+ U.CPEMI = CPE.CPEMI;
// Change the CPI in the instruction operand to refer to the clone.
for (MachineOperand &MO : UserMI->operands())
if (MO.isCPI()) {
- MO.setIndex(CPEs[i].CPI);
+ MO.setIndex(CPE.CPI);
break;
}
// Adjust the refcount of the clone...
- CPEs[i].RefCount++;
+ CPE.RefCount++;
// ...and the original. If we didn't remove the old entry, none of the
// addresses changed, so we don't need another pass.
return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
@@ -1435,15 +1435,14 @@ void MipsConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
/// are zero.
bool MipsConstantIslands::removeUnusedCPEntries() {
unsigned MadeChange = false;
- for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
- std::vector<CPEntry> &CPEs = CPEntries[i];
- for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
- if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
- removeDeadCPEMI(CPEs[j].CPEMI);
- CPEs[j].CPEMI = nullptr;
- MadeChange = true;
- }
+ for (std::vector<CPEntry> &CPEs : CPEntries) {
+ for (CPEntry &CPE : CPEs) {
+ if (CPE.RefCount == 0 && CPE.CPEMI) {
+ removeDeadCPEMI(CPE.CPEMI);
+ CPE.CPEMI = nullptr;
+ MadeChange = true;
}
+ }
}
return MadeChange;
}
diff --git a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 2d27d7553de6..cf6cec22308c 100644
--- a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -309,12 +309,12 @@ INITIALIZE_PASS(MipsDelaySlotFiller, DEBUG_TYPE,
static void insertDelayFiller(Iter Filler, const BB2BrMap &BrMap) {
MachineFunction *MF = Filler->getParent()->getParent();
- for (BB2BrMap::const_iterator I = BrMap.begin(); I != BrMap.end(); ++I) {
- if (I->second) {
- MIBundleBuilder(I->second).append(MF->CloneMachineInstr(&*Filler));
+ for (const auto &I : BrMap) {
+ if (I.second) {
+ MIBundleBuilder(I.second).append(MF->CloneMachineInstr(&*Filler));
++UsefulSlots;
} else {
- I->first->insert(I->first->end(), MF->CloneMachineInstr(&*Filler));
+ I.first->push_back(MF->CloneMachineInstr(&*Filler));
}
}
}
diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp
index 05c1c06ffefe..6ddfec5d0f79 100644
--- a/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -313,7 +313,7 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
llvm_unreachable("unexpected opcode");
}
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
if (!LHSReg)
return 0;
@@ -325,7 +325,7 @@ unsigned MipsFastISel::emitLogicalOp(unsigned ISDOpc, MVT RetVT,
if (!RHSReg)
return 0;
- unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+ Register ResultReg = createResultReg(&Mips::GPR32RegClass);
if (!ResultReg)
return 0;
@@ -341,7 +341,7 @@ unsigned MipsFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end()) {
- unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+ Register ResultReg = createResultReg(&Mips::GPR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LEA_ADDiu),
ResultReg)
.addFrameIndex(SI->second)
@@ -362,7 +362,7 @@ unsigned MipsFastISel::materializeInt(const Constant *C, MVT VT) {
unsigned MipsFastISel::materialize32BitInt(int64_t Imm,
const TargetRegisterClass *RC) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
if (isInt<16>(Imm)) {
unsigned Opc = Mips::ADDiu;
@@ -376,7 +376,7 @@ unsigned MipsFastISel::materialize32BitInt(int64_t Imm,
unsigned Hi = (Imm >> 16) & 0xFFFF;
if (Lo) {
// Both Lo and Hi have nonzero bits.
- unsigned TmpReg = createResultReg(RC);
+ Register TmpReg = createResultReg(RC);
emitInst(Mips::LUi, TmpReg).addImm(Hi);
emitInst(Mips::ORi, ResultReg).addReg(TmpReg).addImm(Lo);
} else {
@@ -391,13 +391,13 @@ unsigned MipsFastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
int64_t Imm = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
if (VT == MVT::f32) {
const TargetRegisterClass *RC = &Mips::FGR32RegClass;
- unsigned DestReg = createResultReg(RC);
+ Register DestReg = createResultReg(RC);
unsigned TempReg = materialize32BitInt(Imm, &Mips::GPR32RegClass);
emitInst(Mips::MTC1, DestReg).addReg(TempReg);
return DestReg;
} else if (VT == MVT::f64) {
const TargetRegisterClass *RC = &Mips::AFGR64RegClass;
- unsigned DestReg = createResultReg(RC);
+ Register DestReg = createResultReg(RC);
unsigned TempReg1 = materialize32BitInt(Imm >> 32, &Mips::GPR32RegClass);
unsigned TempReg2 =
materialize32BitInt(Imm & 0xFFFFFFFF, &Mips::GPR32RegClass);
@@ -412,7 +412,7 @@ unsigned MipsFastISel::materializeGV(const GlobalValue *GV, MVT VT) {
if (VT != MVT::i32)
return 0;
const TargetRegisterClass *RC = &Mips::GPR32RegClass;
- unsigned DestReg = createResultReg(RC);
+ Register DestReg = createResultReg(RC);
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
bool IsThreadLocal = GVar && GVar->isThreadLocal();
// TLS not supported at this time.
@@ -423,7 +423,7 @@ unsigned MipsFastISel::materializeGV(const GlobalValue *GV, MVT VT) {
.addGlobalAddress(GV, 0, MipsII::MO_GOT);
if ((GV->hasInternalLinkage() ||
(GV->hasLocalLinkage() && !isa<Function>(GV)))) {
- unsigned TempReg = createResultReg(RC);
+ Register TempReg = createResultReg(RC);
emitInst(Mips::ADDiu, TempReg)
.addReg(DestReg)
.addGlobalAddress(GV, 0, MipsII::MO_ABS_LO);
@@ -434,7 +434,7 @@ unsigned MipsFastISel::materializeGV(const GlobalValue *GV, MVT VT) {
unsigned MipsFastISel::materializeExternalCallSym(MCSymbol *Sym) {
const TargetRegisterClass *RC = &Mips::GPR32RegClass;
- unsigned DestReg = createResultReg(RC);
+ Register DestReg = createResultReg(RC);
emitInst(Mips::LW, DestReg)
.addReg(MFI->getGlobalBaseReg(*MF))
.addSym(Sym, MipsII::MO_GOT);
@@ -649,13 +649,13 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
default:
return false;
case CmpInst::ICMP_EQ: {
- unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ Register TempReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::XOR, TempReg).addReg(LeftReg).addReg(RightReg);
emitInst(Mips::SLTiu, ResultReg).addReg(TempReg).addImm(1);
break;
}
case CmpInst::ICMP_NE: {
- unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ Register TempReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::XOR, TempReg).addReg(LeftReg).addReg(RightReg);
emitInst(Mips::SLTu, ResultReg).addReg(Mips::ZERO).addReg(TempReg);
break;
@@ -667,13 +667,13 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
emitInst(Mips::SLTu, ResultReg).addReg(LeftReg).addReg(RightReg);
break;
case CmpInst::ICMP_UGE: {
- unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ Register TempReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::SLTu, TempReg).addReg(LeftReg).addReg(RightReg);
emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
break;
}
case CmpInst::ICMP_ULE: {
- unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ Register TempReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::SLTu, TempReg).addReg(RightReg).addReg(LeftReg);
emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
break;
@@ -685,13 +685,13 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
emitInst(Mips::SLT, ResultReg).addReg(LeftReg).addReg(RightReg);
break;
case CmpInst::ICMP_SGE: {
- unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ Register TempReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::SLT, TempReg).addReg(LeftReg).addReg(RightReg);
emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
break;
}
case CmpInst::ICMP_SLE: {
- unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ Register TempReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::SLT, TempReg).addReg(RightReg).addReg(LeftReg);
emitInst(Mips::XORi, ResultReg).addReg(TempReg).addImm(1);
break;
@@ -737,8 +737,8 @@ bool MipsFastISel::emitCmp(unsigned ResultReg, const CmpInst *CI) {
default:
llvm_unreachable("Only switching of a subset of CCs.");
}
- unsigned RegWithZero = createResultReg(&Mips::GPR32RegClass);
- unsigned RegWithOne = createResultReg(&Mips::GPR32RegClass);
+ Register RegWithZero = createResultReg(&Mips::GPR32RegClass);
+ Register RegWithOne = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::ADDiu, RegWithZero).addReg(Mips::ZERO).addImm(0);
emitInst(Mips::ADDiu, RegWithOne).addReg(Mips::ZERO).addImm(1);
emitInst(Opc).addReg(Mips::FCC0, RegState::Define).addReg(LeftReg)
@@ -964,7 +964,7 @@ bool MipsFastISel::selectBranch(const Instruction *I) {
// For the general case, we need to mask with 1.
if (ZExtCondReg == 0) {
- unsigned CondReg = getRegForValue(BI->getCondition());
+ Register CondReg = getRegForValue(BI->getCondition());
if (CondReg == 0)
return false;
@@ -982,7 +982,7 @@ bool MipsFastISel::selectBranch(const Instruction *I) {
bool MipsFastISel::selectCmp(const Instruction *I) {
const CmpInst *CI = cast<CmpInst>(I);
- unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+ Register ResultReg = createResultReg(&Mips::GPR32RegClass);
if (!emitCmp(ResultReg, CI))
return false;
updateValueMap(I, ResultReg);
@@ -1000,13 +1000,13 @@ bool MipsFastISel::selectFPExt(const Instruction *I) {
if (SrcVT != MVT::f32 || DestVT != MVT::f64)
return false;
- unsigned SrcReg =
+ Register SrcReg =
getRegForValue(Src); // this must be a 32bit floating point register class
// maybe we should handle this differently
if (!SrcReg)
return false;
- unsigned DestReg = createResultReg(&Mips::AFGR64RegClass);
+ Register DestReg = createResultReg(&Mips::AFGR64RegClass);
emitInst(Mips::CVT_D32_S, DestReg).addReg(SrcReg);
updateValueMap(I, DestReg);
return true;
@@ -1041,22 +1041,22 @@ bool MipsFastISel::selectSelect(const Instruction *I) {
const SelectInst *SI = cast<SelectInst>(I);
const Value *Cond = SI->getCondition();
- unsigned Src1Reg = getRegForValue(SI->getTrueValue());
- unsigned Src2Reg = getRegForValue(SI->getFalseValue());
- unsigned CondReg = getRegForValue(Cond);
+ Register Src1Reg = getRegForValue(SI->getTrueValue());
+ Register Src2Reg = getRegForValue(SI->getFalseValue());
+ Register CondReg = getRegForValue(Cond);
if (!Src1Reg || !Src2Reg || !CondReg)
return false;
- unsigned ZExtCondReg = createResultReg(&Mips::GPR32RegClass);
+ Register ZExtCondReg = createResultReg(&Mips::GPR32RegClass);
if (!ZExtCondReg)
return false;
if (!emitIntExt(MVT::i1, CondReg, MVT::i32, ZExtCondReg, true))
return false;
- unsigned ResultReg = createResultReg(RC);
- unsigned TempReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
+ Register TempReg = createResultReg(RC);
if (!ResultReg || !TempReg)
return false;
@@ -1079,11 +1079,11 @@ bool MipsFastISel::selectFPTrunc(const Instruction *I) {
if (SrcVT != MVT::f64 || DestVT != MVT::f32)
return false;
- unsigned SrcReg = getRegForValue(Src);
+ Register SrcReg = getRegForValue(Src);
if (!SrcReg)
return false;
- unsigned DestReg = createResultReg(&Mips::FGR32RegClass);
+ Register DestReg = createResultReg(&Mips::FGR32RegClass);
if (!DestReg)
return false;
@@ -1115,14 +1115,14 @@ bool MipsFastISel::selectFPToInt(const Instruction *I, bool IsSigned) {
if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
return false;
- unsigned SrcReg = getRegForValue(Src);
+ Register SrcReg = getRegForValue(Src);
if (SrcReg == 0)
return false;
// Determine the opcode for the conversion, which takes place
// entirely within FPRs.
- unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
- unsigned TempReg = createResultReg(&Mips::FGR32RegClass);
+ Register DestReg = createResultReg(&Mips::GPR32RegClass);
+ Register TempReg = createResultReg(&Mips::FGR32RegClass);
unsigned Opc = (SrcVT == MVT::f32) ? Mips::TRUNC_W_S : Mips::TRUNC_W_D32;
// Generate the convert.
@@ -1196,7 +1196,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
break;
}
}
- unsigned ArgReg = getRegForValue(ArgVal);
+ Register ArgReg = getRegForValue(ArgVal);
if (!ArgReg)
return false;
@@ -1294,7 +1294,7 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT,
if (RetVT == MVT::i1 || RetVT == MVT::i8 || RetVT == MVT::i16)
CopyVT = MVT::i32;
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(CopyVT));
if (!ResultReg)
return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -1462,11 +1462,11 @@ bool MipsFastISel::fastLowerArguments() {
for (const auto &FormalArg : F->args()) {
unsigned ArgNo = FormalArg.getArgNo();
unsigned SrcReg = Allocation[ArgNo].Reg;
- unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, Allocation[ArgNo].RC);
+ Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, Allocation[ArgNo].RC);
// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
// Without this, EmitLiveInCopies may eliminate the livein if its only
// use is a bitcast (which isn't turned into an instruction).
- unsigned ResultReg = createResultReg(Allocation[ArgNo].RC);
+ Register ResultReg = createResultReg(Allocation[ArgNo].RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(DstReg, getKillRegState(true));
@@ -1594,10 +1594,10 @@ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
if (!isTypeSupported(RetTy, VT))
return false;
- unsigned SrcReg = getRegForValue(II->getOperand(0));
+ Register SrcReg = getRegForValue(II->getOperand(0));
if (SrcReg == 0)
return false;
- unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
+ Register DestReg = createResultReg(&Mips::GPR32RegClass);
if (DestReg == 0)
return false;
if (VT == MVT::i16) {
@@ -1607,9 +1607,9 @@ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
return true;
} else {
unsigned TempReg[3];
- for (int i = 0; i < 3; i++) {
- TempReg[i] = createResultReg(&Mips::GPR32RegClass);
- if (TempReg[i] == 0)
+ for (unsigned &R : TempReg) {
+ R = createResultReg(&Mips::GPR32RegClass);
+ if (R == 0)
return false;
}
emitInst(Mips::SLL, TempReg[0]).addReg(SrcReg).addImm(8);
@@ -1621,16 +1621,16 @@ bool MipsFastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
}
} else if (VT == MVT::i32) {
if (Subtarget->hasMips32r2()) {
- unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ Register TempReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::WSBH, TempReg).addReg(SrcReg);
emitInst(Mips::ROTR, DestReg).addReg(TempReg).addImm(16);
updateValueMap(II, DestReg);
return true;
} else {
unsigned TempReg[8];
- for (int i = 0; i < 8; i++) {
- TempReg[i] = createResultReg(&Mips::GPR32RegClass);
- if (TempReg[i] == 0)
+ for (unsigned &R : TempReg) {
+ R = createResultReg(&Mips::GPR32RegClass);
+ if (R == 0)
return false;
}
@@ -1720,7 +1720,7 @@ bool MipsFastISel::selectRet(const Instruction *I) {
if (!VA.isRegLoc())
return false;
- unsigned Reg = getRegForValue(RV);
+ Register Reg = getRegForValue(RV);
if (Reg == 0)
return false;
@@ -1788,7 +1788,7 @@ bool MipsFastISel::selectTrunc(const Instruction *I) {
if (DestVT != MVT::i16 && DestVT != MVT::i8 && DestVT != MVT::i1)
return false;
- unsigned SrcReg = getRegForValue(Op);
+ Register SrcReg = getRegForValue(Op);
if (!SrcReg)
return false;
@@ -1804,7 +1804,7 @@ bool MipsFastISel::selectIntExt(const Instruction *I) {
Type *SrcTy = Src->getType();
bool isZExt = isa<ZExtInst>(I);
- unsigned SrcReg = getRegForValue(Src);
+ Register SrcReg = getRegForValue(Src);
if (!SrcReg)
return false;
@@ -1818,7 +1818,7 @@ bool MipsFastISel::selectIntExt(const Instruction *I) {
MVT SrcVT = SrcEVT.getSimpleVT();
MVT DestVT = DestEVT.getSimpleVT();
- unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+ Register ResultReg = createResultReg(&Mips::GPR32RegClass);
if (!emitIntExt(SrcVT, SrcReg, DestVT, ResultReg, isZExt))
return false;
@@ -1839,7 +1839,7 @@ bool MipsFastISel::emitIntSExt32r1(MVT SrcVT, unsigned SrcReg, MVT DestVT,
ShiftAmt = 16;
break;
}
- unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ Register TempReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::SLL, TempReg).addReg(SrcReg).addImm(ShiftAmt);
emitInst(Mips::SRA, DestReg).addReg(TempReg).addImm(ShiftAmt);
return true;
@@ -1935,15 +1935,15 @@ bool MipsFastISel::selectDivRem(const Instruction *I, unsigned ISDOpcode) {
break;
}
- unsigned Src0Reg = getRegForValue(I->getOperand(0));
- unsigned Src1Reg = getRegForValue(I->getOperand(1));
+ Register Src0Reg = getRegForValue(I->getOperand(0));
+ Register Src1Reg = getRegForValue(I->getOperand(1));
if (!Src0Reg || !Src1Reg)
return false;
emitInst(DivOpc).addReg(Src0Reg).addReg(Src1Reg);
emitInst(Mips::TEQ).addReg(Src1Reg).addReg(Mips::ZERO).addImm(7);
- unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+ Register ResultReg = createResultReg(&Mips::GPR32RegClass);
if (!ResultReg)
return false;
@@ -1962,19 +1962,19 @@ bool MipsFastISel::selectShift(const Instruction *I) {
if (!isTypeSupported(I->getType(), RetVT))
return false;
- unsigned ResultReg = createResultReg(&Mips::GPR32RegClass);
+ Register ResultReg = createResultReg(&Mips::GPR32RegClass);
if (!ResultReg)
return false;
unsigned Opcode = I->getOpcode();
const Value *Op0 = I->getOperand(0);
- unsigned Op0Reg = getRegForValue(Op0);
+ Register Op0Reg = getRegForValue(Op0);
if (!Op0Reg)
return false;
// If AShr or LShr, then we need to make sure the operand0 is sign extended.
if (Opcode == Instruction::AShr || Opcode == Instruction::LShr) {
- unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ Register TempReg = createResultReg(&Mips::GPR32RegClass);
if (!TempReg)
return false;
@@ -2008,7 +2008,7 @@ bool MipsFastISel::selectShift(const Instruction *I) {
return true;
}
- unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ Register Op1Reg = getRegForValue(I->getOperand(1));
if (!Op1Reg)
return false;
@@ -2091,7 +2091,7 @@ bool MipsFastISel::fastSelectInstruction(const Instruction *I) {
unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V,
bool IsUnsigned) {
- unsigned VReg = getRegForValue(V);
+ Register VReg = getRegForValue(V);
if (VReg == 0)
return 0;
MVT VMVT = TLI.getValueType(DL, V->getType(), true).getSimpleVT();
@@ -2100,7 +2100,7 @@ unsigned MipsFastISel::getRegEnsuringSimpleIntegerWidening(const Value *V,
return 0;
if ((VMVT == MVT::i8) || (VMVT == MVT::i16)) {
- unsigned TempReg = createResultReg(&Mips::GPR32RegClass);
+ Register TempReg = createResultReg(&Mips::GPR32RegClass);
if (!emitIntExt(VMVT, VReg, MVT::i32, TempReg, IsUnsigned))
return 0;
VReg = TempReg;
@@ -2112,7 +2112,7 @@ void MipsFastISel::simplifyAddress(Address &Addr) {
if (!isInt<16>(Addr.getOffset())) {
unsigned TempReg =
materialize32BitInt(Addr.getOffset(), &Mips::GPR32RegClass);
- unsigned DestReg = createResultReg(&Mips::GPR32RegClass);
+ Register DestReg = createResultReg(&Mips::GPR32RegClass);
emitInst(Mips::ADDu, DestReg).addReg(TempReg).addReg(Addr.getReg());
Addr.setReg(DestReg);
Addr.setOffset(0);
@@ -2129,7 +2129,7 @@ unsigned MipsFastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
// followed by another instruction that defines the same registers too.
// We can fix this by explicitly marking those registers as dead.
if (MachineInstOpcode == Mips::MUL) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 9377e83524e1..0c2e129b8f1f 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -2523,7 +2523,7 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
MFI.setReturnAddressIsTaken(true);
// Return RA, which contains the return address. Mark it an implicit live-in.
- unsigned Reg = MF.addLiveIn(RA, getRegClassFor(VT));
+ Register Reg = MF.addLiveIn(RA, getRegClassFor(VT));
return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), Reg, VT);
}
@@ -3051,17 +3051,15 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
// stuck together.
SDValue InFlag;
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- Chain = CLI.DAG.getCopyToReg(Chain, CLI.DL, RegsToPass[i].first,
- RegsToPass[i].second, InFlag);
+ for (auto &R : RegsToPass) {
+ Chain = CLI.DAG.getCopyToReg(Chain, CLI.DL, R.first, R.second, InFlag);
InFlag = Chain.getValue(1);
}
// Add argument registers to the end of the list so that they are
// known live into the call.
- for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
- Ops.push_back(CLI.DAG.getRegister(RegsToPass[i].first,
- RegsToPass[i].second.getValueType()));
+ for (auto &R : RegsToPass)
+ Ops.push_back(CLI.DAG.getRegister(R.first, R.second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
diff --git a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
index 6d44ce2ab563..59f158688b16 100644
--- a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
+++ b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -80,8 +80,8 @@ private:
MipsInstructionSelector::MipsInstructionSelector(
const MipsTargetMachine &TM, const MipsSubtarget &STI,
const MipsRegisterBankInfo &RBI)
- : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI),
+ : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
+ RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "MipsGenGlobalISel.inc"
diff --git a/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/llvm/lib/Target/Mips/MipsMachineFunction.cpp
index a7a2be30f58a..411a26e42713 100644
--- a/llvm/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/llvm/lib/Target/Mips/MipsMachineFunction.cpp
@@ -148,14 +148,14 @@ void MipsFunctionInfo::initGlobalBaseReg(MachineFunction &MF) {
void MipsFunctionInfo::createEhDataRegsFI(MachineFunction &MF) {
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- for (int I = 0; I < 4; ++I) {
+ for (int &I : EhDataRegFI) {
const TargetRegisterClass &RC =
static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI().IsN64()
? Mips::GPR64RegClass
: Mips::GPR32RegClass;
- EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject(
- TRI.getSpillSize(RC), TRI.getSpillAlign(RC), false);
+ I = MF.getFrameInfo().CreateStackObject(TRI.getSpillSize(RC),
+ TRI.getSpillAlign(RC), false);
}
}
@@ -167,9 +167,9 @@ void MipsFunctionInfo::createISRRegFI(MachineFunction &MF) {
const TargetRegisterClass &RC = Mips::GPR32RegClass;
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
- for (int I = 0; I < 2; ++I)
- ISRDataRegFI[I] = MF.getFrameInfo().CreateStackObject(
- TRI.getSpillSize(RC), TRI.getSpillAlign(RC), false);
+ for (int &I : ISRDataRegFI)
+ I = MF.getFrameInfo().CreateStackObject(TRI.getSpillSize(RC),
+ TRI.getSpillAlign(RC), false);
}
bool MipsFunctionInfo::isEhDataRegFI(int FI) const {
diff --git a/llvm/lib/Target/Mips/MipsMulMulBugPass.cpp b/llvm/lib/Target/Mips/MipsMulMulBugPass.cpp
new file mode 100644
index 000000000000..daaf1135c2b1
--- /dev/null
+++ b/llvm/lib/Target/Mips/MipsMulMulBugPass.cpp
@@ -0,0 +1,136 @@
+//===- MipsMulMulBugPass.cpp - Mips VR4300 mulmul bugfix pass -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Early revisions of the VR4300 have a hardware bug where two consecutive
+// multiplications can produce an incorrect result in the second multiply.
+//
+// This pass scans for mul instructions in each basic block and inserts
+// a nop whenever the following conditions are met:
+//
+// - The current instruction is a single or double-precision floating-point
+// mul instruction.
+// - The next instruction is either a mul instruction (any kind)
+// or a branch instruction.
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsInstrInfo.h"
+#include "MipsSubtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "mips-vr4300-mulmul-fix"
+
+using namespace llvm;
+
+namespace {
+
+class MipsMulMulBugFix : public MachineFunctionPass {
+public:
+ MipsMulMulBugFix() : MachineFunctionPass(ID) {
+ initializeMipsMulMulBugFixPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "Mips VR4300 mulmul bugfix"; }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+
+private:
+ bool fixMulMulBB(MachineBasicBlock &MBB, const MipsInstrInfo &MipsII);
+};
+
+} // namespace
+
+INITIALIZE_PASS(MipsMulMulBugFix, "mips-vr4300-mulmul-fix",
+ "Mips VR4300 mulmul bugfix", false, false)
+
+char MipsMulMulBugFix::ID = 0;
+
+bool MipsMulMulBugFix::runOnMachineFunction(MachineFunction &MF) {
+ const MipsInstrInfo &MipsII =
+ *static_cast<const MipsInstrInfo *>(MF.getSubtarget().getInstrInfo());
+
+ bool Modified = false;
+
+ for (auto &MBB : MF)
+ Modified |= fixMulMulBB(MBB, MipsII);
+
+ return Modified;
+}
+
+static bool isFirstMul(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case Mips::FMUL_S:
+ case Mips::FMUL_D:
+ case Mips::FMUL_D32:
+ case Mips::FMUL_D64:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool isSecondMulOrBranch(const MachineInstr &MI) {
+ if (MI.isBranch() || MI.isIndirectBranch() || MI.isCall())
+ return true;
+
+ switch (MI.getOpcode()) {
+ case Mips::MUL:
+ case Mips::FMUL_S:
+ case Mips::FMUL_D:
+ case Mips::FMUL_D32:
+ case Mips::FMUL_D64:
+ case Mips::MULT:
+ case Mips::MULTu:
+ case Mips::DMULT:
+ case Mips::DMULTu:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool MipsMulMulBugFix::fixMulMulBB(MachineBasicBlock &MBB,
+ const MipsInstrInfo &MipsII) {
+ bool Modified = false;
+
+ MachineBasicBlock::instr_iterator NextMII;
+
+ // Iterate through the instructions in the basic block
+ for (MachineBasicBlock::instr_iterator MII = MBB.instr_begin(),
+ E = MBB.instr_end();
+ MII != E; MII = NextMII) {
+
+ NextMII = next_nodbg(MII, E);
+
+    // Insert a nop when the current instruction is an FP mul and the next
+    // is a mul or a branch, in case the branch target starts with a mul.
+ if (NextMII != E && isFirstMul(*MII) && isSecondMulOrBranch(*NextMII)) {
+ LLVM_DEBUG(dbgs() << "Found mulmul!\n");
+
+ const MCInstrDesc &NewMCID = MipsII.get(Mips::NOP);
+ BuildMI(MBB, NextMII, DebugLoc(), NewMCID);
+ Modified = true;
+ }
+ }
+
+ return Modified;
+}
+
+FunctionPass *llvm::createMipsMulMulBugPass() { return new MipsMulMulBugFix(); }
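
The new pass reduces to a single forward scan over each basic block: whenever a single- or double-precision FP multiply is immediately followed by another multiply or a branch, a nop is slotted in between. Below is a minimal standalone sketch of that scan under stated assumptions: a hypothetical Opcode enum and a plain std::vector stand in for the MachineInstr/MipsInstrInfo machinery used by the real pass.

// Standalone sketch of the VR4300 mulmul workaround: scan a straight-line
// block of opcodes and place a NOP after every floating-point multiply that
// is immediately followed by another multiply or a branch. The Opcode enum
// and helpers are illustrative stand-ins, not the real MipsInstrInfo opcodes.
#include <cstdio>
#include <vector>

enum class Opcode { FMUL_S, FMUL_D, MUL, MULT, BRANCH, ADD, NOP };

static bool isFirstMul(Opcode Op) {
  // Only single/double-precision FP multiplies start a hazardous pair.
  return Op == Opcode::FMUL_S || Op == Opcode::FMUL_D;
}

static bool isSecondMulOrBranch(Opcode Op) {
  // Any multiply, or a branch, counts as the second half of the pair.
  switch (Op) {
  case Opcode::FMUL_S:
  case Opcode::FMUL_D:
  case Opcode::MUL:
  case Opcode::MULT:
  case Opcode::BRANCH:
    return true;
  default:
    return false;
  }
}

// Returns true if at least one NOP was inserted, mirroring the pass's
// Modified flag.
static bool fixMulMul(std::vector<Opcode> &Block) {
  bool Modified = false;
  for (size_t I = 0; I + 1 < Block.size(); ++I) {
    if (isFirstMul(Block[I]) && isSecondMulOrBranch(Block[I + 1])) {
      Block.insert(Block.begin() + I + 1, Opcode::NOP);
      Modified = true;
      ++I; // Skip the NOP so the next check starts at the pair's second instruction.
    }
  }
  return Modified;
}

int main() {
  std::vector<Opcode> Block = {Opcode::FMUL_S, Opcode::MUL, Opcode::ADD,
                               Opcode::FMUL_D, Opcode::BRANCH};
  fixMulMul(Block);
  std::printf("block now has %zu instructions\n", Block.size()); // prints 7
  return 0;
}

Branches are included because the branch target may itself begin with a multiply that this block-local scan cannot see, as the in-pass comment notes.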
diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
index 7cba3118cd62..390ab9d22024 100644
--- a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -159,8 +159,8 @@ getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const MipsSubtarget &Subtarget = MF.getSubtarget<MipsSubtarget>();
- for (unsigned I = 0; I < array_lengthof(ReservedGPR32); ++I)
- Reserved.set(ReservedGPR32[I]);
+ for (MCPhysReg R : ReservedGPR32)
+ Reserved.set(R);
// Reserve registers for the NaCl sandbox.
if (Subtarget.isTargetNaCl()) {
@@ -169,8 +169,8 @@ getReservedRegs(const MachineFunction &MF) const {
Reserved.set(Mips::T8); // Reserved for thread pointer.
}
- for (unsigned I = 0; I < array_lengthof(ReservedGPR64); ++I)
- Reserved.set(ReservedGPR64[I]);
+ for (MCPhysReg R : ReservedGPR64)
+ Reserved.set(R);
// For mno-abicalls, GP is a program invariant!
if (!Subtarget.isABICalls()) {
diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
index 193d071447ff..7ee2ddf3605f 100644
--- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -454,7 +454,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
// directives.
for (const CalleeSavedInfo &I : CSI) {
int64_t Offset = MFI.getObjectOffset(I.getFrameIdx());
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
// If Reg is a double precision register, emit two cfa_offsets,
// one for each of the paired single precision registers.
@@ -801,7 +801,7 @@ bool MipsSEFrameLowering::spillCalleeSavedRegisters(
// method MipsTargetLowering::lowerRETURNADDR.
// It's killed at the spill, unless the register is RA and return address
// is taken.
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA || Reg == Mips::RA_64)
&& MF->getFrameInfo().isReturnAddressTaken();
if (!IsRAAndRetAddrIsTaken)
diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index 40b215a8204c..346ebe9664fc 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -85,18 +85,18 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
if (Subtarget.hasDSP()) {
MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
- for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
- addRegisterClass(VecTys[i], &Mips::DSPRRegClass);
+ for (const auto &VecTy : VecTys) {
+ addRegisterClass(VecTy, &Mips::DSPRRegClass);
// Expand all builtin opcodes.
for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
- setOperationAction(Opc, VecTys[i], Expand);
+ setOperationAction(Opc, VecTy, Expand);
- setOperationAction(ISD::ADD, VecTys[i], Legal);
- setOperationAction(ISD::SUB, VecTys[i], Legal);
- setOperationAction(ISD::LOAD, VecTys[i], Legal);
- setOperationAction(ISD::STORE, VecTys[i], Legal);
- setOperationAction(ISD::BITCAST, VecTys[i], Legal);
+ setOperationAction(ISD::ADD, VecTy, Legal);
+ setOperationAction(ISD::SUB, VecTy, Legal);
+ setOperationAction(ISD::LOAD, VecTy, Legal);
+ setOperationAction(ISD::STORE, VecTy, Legal);
+ setOperationAction(ISD::BITCAST, VecTy, Legal);
}
setTargetDAGCombine(ISD::SHL);
@@ -2931,7 +2931,7 @@ static SDValue lowerVECTOR_SHUFFLE_PCKOD(SDValue Op, EVT ResTy,
// operand is unused and can be replaced with anything. We choose to replace it
// with the used operand since this reduces the number of instructions overall.
static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
- SmallVector<int, 16> Indices,
+ const SmallVector<int, 16> &Indices,
SelectionDAG &DAG) {
SmallVector<SDValue, 16> Ops;
SDValue Op0;
@@ -2953,9 +2953,8 @@ static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
Using2ndVec = true;
}
- for (SmallVector<int, 16>::iterator I = Indices.begin(); I != Indices.end();
- ++I)
- Ops.push_back(DAG.getTargetConstant(*I, DL, MaskEltTy));
+ for (int Idx : Indices)
+ Ops.push_back(DAG.getTargetConstant(Idx, DL, MaskEltTy));
SDValue MaskVec = DAG.getBuildVector(MaskVecTy, DL, Ops);
diff --git a/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp b/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
index b05e9ad827c4..d6481793ef49 100644
--- a/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -38,7 +38,7 @@ using namespace llvm;
#define DEBUG_TYPE "mips-reg-info"
-MipsSERegisterInfo::MipsSERegisterInfo() : MipsRegisterInfo() {}
+MipsSERegisterInfo::MipsSERegisterInfo() {}
bool MipsSERegisterInfo::
requiresRegisterScavenging(const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp
index 8de3c9fd25bd..f9f662a00117 100644
--- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -45,6 +45,10 @@ using namespace llvm;
#define DEBUG_TYPE "mips"
+static cl::opt<bool>
+ EnableMulMulFix("mfix4300", cl::init(false),
+ cl::desc("Enable the VR4300 mulmul bug fix."), cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() {
// Register the target.
RegisterTargetMachine<MipsebTargetMachine> X(getTheMipsTarget());
@@ -58,6 +62,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMipsTarget() {
initializeMipsBranchExpansionPass(*PR);
initializeMicroMipsSizeReducePass(*PR);
initializeMipsPreLegalizerCombinerPass(*PR);
+ initializeMipsMulMulBugFixPass(*PR);
}
static std::string computeDataLayout(const Triple &TT, StringRef CPU,
@@ -292,6 +297,11 @@ void MipsPassConfig::addPreEmitPass() {
// instructions which can be remapped to a 16 bit instruction.
addPass(createMicroMipsSizeReducePass());
+ // This pass inserts a nop instruction between two back-to-back multiplication
+ // instructions when the "mfix4300" flag is passed.
+ if (EnableMulMulFix)
+ addPass(createMipsMulMulBugPass());
+
  // The delay slot filler pass can potentially create forbidden slot hazards
// for MIPSR6 and therefore it should go before MipsBranchExpansion pass.
addPass(createMipsDelaySlotFillerPass());
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index 82d332ab3f08..da0cbb32659c 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -108,6 +108,10 @@ void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
// SAT flag
if (Imm & NVPTX::PTXCvtMode::SAT_FLAG)
O << ".sat";
+ } else if (strcmp(Modifier, "relu") == 0) {
+ // RELU flag
+ if (Imm & NVPTX::PTXCvtMode::RELU_FLAG)
+ O << ".relu";
} else if (strcmp(Modifier, "base") == 0) {
// Default operand
switch (Imm & NVPTX::PTXCvtMode::BASE_MASK) {
@@ -139,6 +143,9 @@ void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
case NVPTX::PTXCvtMode::RP:
O << ".rp";
break;
+ case NVPTX::PTXCvtMode::RNA:
+ O << ".rna";
+ break;
}
} else {
llvm_unreachable("Invalid conversion modifier");
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index c2fd090da084..41e9f375e536 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -137,10 +137,12 @@ enum CvtMode {
RZ,
RM,
RP,
+ RNA,
BASE_MASK = 0x0F,
FTZ_FLAG = 0x10,
- SAT_FLAG = 0x20
+ SAT_FLAG = 0x20,
+ RELU_FLAG = 0x40
};
}
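
The RELU_FLAG bit sits alongside FTZ_FLAG and SAT_FLAG, while RNA extends the rounding modes encoded in the low nibble, so one immediate carries everything the printer needs to emit suffixes such as ".rn.relu". A rough decoder for that packing, mirroring the enum values above but otherwise illustrative, could look like this:

// Rough decoder for a packed CvtMode immediate: the low nibble selects the
// rounding mode (now including RNA) and FTZ/SAT/RELU are orthogonal flag
// bits. Values mirror this enum; decode() itself is illustrative, not the
// in-tree NVPTXInstPrinter logic.
#include <cstdio>
#include <string>

namespace CvtMode {
enum : unsigned {
  NONE = 0, RNI, RZI, RMI, RPI, RN, RZ, RM, RP, RNA,
  BASE_MASK = 0x0F,
  FTZ_FLAG = 0x10,
  SAT_FLAG = 0x20,
  RELU_FLAG = 0x40
};
} // namespace CvtMode

static std::string decode(unsigned Imm) {
  static const char *Base[] = {"",    ".rni", ".rzi", ".rmi", ".rpi",
                               ".rn", ".rz",  ".rm",  ".rp",  ".rna"};
  unsigned B = Imm & CvtMode::BASE_MASK;
  std::string S = B < 10 ? Base[B] : "";
  if (Imm & CvtMode::FTZ_FLAG)
    S += ".ftz";
  if (Imm & CvtMode::SAT_FLAG)
    S += ".sat";
  if (Imm & CvtMode::RELU_FLAG)
    S += ".relu";
  return S;
}

int main() {
  // 0x45 = RN | RELU_FLAG, the encoding behind the CvtRN_RELU PatLeaf added
  // further down in this commit; prints ".rn.relu".
  std::printf("%s\n", decode(0x45).c_str());
  return 0;
}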
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 16add48d4602..3a59306c4998 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1214,9 +1214,9 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) {
std::vector<const GlobalVariable *> &gvars = localDecls[f];
- for (unsigned i = 0, e = gvars.size(); i != e; ++i) {
+ for (const GlobalVariable *GV : gvars) {
O << "\t// demoted variable\n\t";
- printModuleLevelGV(gvars[i], O, true);
+ printModuleLevelGV(GV, O, true);
}
}
@@ -1454,7 +1454,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
if (static_cast<NVPTXTargetMachine &>(TM).getDrvInterface() !=
NVPTX::CUDA) {
- Type *ETy = PTy->getElementType();
+ Type *ETy = PTy->getPointerElementType();
int addrSpace = PTy->getAddressSpace();
switch (addrSpace) {
default:
@@ -1514,7 +1514,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// param has byVal attribute. So should be a pointer
auto *PTy = dyn_cast<PointerType>(Ty);
assert(PTy && "Param with byval attribute should be a pointer type");
- Type *ETy = PTy->getElementType();
+ Type *ETy = PTy->getPointerElementType();
if (isABI || isKernelFunc) {
// Just print .param .align <a> .b8 .param[size];
@@ -1613,7 +1613,7 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
// We use the per class virtual register number in the ptx output.
unsigned int numVRs = MRI->getNumVirtRegs();
for (unsigned i = 0; i < numVRs; i++) {
- unsigned int vr = Register::index2VirtReg(i);
+ Register vr = Register::index2VirtReg(i);
const TargetRegisterClass *RC = MRI->getRegClass(vr);
DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
int n = regmap.size();
diff --git a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index a9a5eae42c1d..888fc8ffac2c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -96,20 +96,18 @@ bool GenericToNVVM::runOnModule(Module &M) {
  // Walk through the instructions in function definitions, and replace any use
// of original global variables in GVMap with a use of the corresponding
// copies in GVMap. If necessary, promote constants to instructions.
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
- if (I->isDeclaration()) {
+ for (Function &F : M) {
+ if (F.isDeclaration()) {
continue;
}
- IRBuilder<> Builder(I->getEntryBlock().getFirstNonPHIOrDbg());
- for (Function::iterator BBI = I->begin(), BBE = I->end(); BBI != BBE;
- ++BBI) {
- for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;
- ++II) {
- for (unsigned i = 0, e = II->getNumOperands(); i < e; ++i) {
- Value *Operand = II->getOperand(i);
+ IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg());
+ for (BasicBlock &BB : F) {
+ for (Instruction &II : BB) {
+ for (unsigned i = 0, e = II.getNumOperands(); i < e; ++i) {
+ Value *Operand = II.getOperand(i);
if (isa<Constant>(Operand)) {
- II->setOperand(
- i, remapConstant(&M, &*I, cast<Constant>(Operand), Builder));
+ II.setOperand(
+ i, remapConstant(&M, &F, cast<Constant>(Operand), Builder));
}
}
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index e2f6b69fc530..eac237bb27bb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -553,17 +553,30 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// These map to corresponding instructions for f32/f64. f16 must be
// promoted to f32. v2f16 is expanded to f16, which is then promoted
// to f32.
- for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
- ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
+ for (const auto &Op :
+ {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS, ISD::FABS}) {
setOperationAction(Op, MVT::f16, Promote);
setOperationAction(Op, MVT::f32, Legal);
setOperationAction(Op, MVT::f64, Legal);
setOperationAction(Op, MVT::v2f16, Expand);
}
- setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
+ // max.f16, max.f16x2 and max.NaN are supported on sm_80+.
+ auto GetMinMaxAction = [&](LegalizeAction NotSm80Action) {
+ bool IsAtLeastSm80 = STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
+ return IsAtLeastSm80 ? Legal : NotSm80Action;
+ };
+ for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
+ setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Promote), Promote);
+ setOperationAction(Op, MVT::f32, Legal);
+ setOperationAction(Op, MVT::f64, Legal);
+ setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
+ }
+ for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
+ setFP16OperationAction(Op, MVT::f16, GetMinMaxAction(Expand), Expand);
+ setOperationAction(Op, MVT::f32, GetMinMaxAction(Expand));
+ setOperationAction(Op, MVT::f64, GetMinMaxAction(Expand));
+ setFP16OperationAction(Op, MVT::v2f16, GetMinMaxAction(Expand), Expand);
+ }
// No FEXP2, FLOG2. The PTX ex2 and log2 functions are always approximate.
// No FPOW or FREM in PTX.
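
The GetMinMaxAction lambda in this hunk gates the min/max legality decisions on the sm_80 / PTX 7.0 cutoff mentioned in the comment. A hedged standalone restatement of that check, with Action standing in for LegalizeAction, follows:

// Standalone restatement of the GetMinMaxAction check: native f16/f16x2
// min/max and the NaN-propagating min.NaN/max.NaN forms need sm_80 and
// PTX 7.0; otherwise the caller's fallback action applies. Action stands in
// for LegalizeAction; the function is illustrative.
#include <cassert>

enum class Action { Legal, Promote, Expand };

static Action getMinMaxAction(unsigned SmVersion, unsigned PtxVersion,
                              Action NotSm80Action) {
  bool IsAtLeastSm80 = SmVersion >= 80 && PtxVersion >= 70;
  return IsAtLeastSm80 ? Action::Legal : NotSm80Action;
}

int main() {
  // sm_80 + PTX 7.0 keeps FMINNUM on f16 Legal; an sm_70 target falls back
  // to Promote, matching the pre-existing behaviour.
  assert(getMinMaxAction(80, 70, Action::Promote) == Action::Legal);
  assert(getMinMaxAction(70, 64, Action::Promote) == Action::Promote);
  return 0;
}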
@@ -1341,7 +1354,7 @@ std::string NVPTXTargetLowering::getPrototype(
}
auto *PTy = dyn_cast<PointerType>(Ty);
assert(PTy && "Param with byval attribute should be a pointer type");
- Type *ETy = PTy->getElementType();
+ Type *ETy = PTy->getPointerElementType();
Align align = Outs[OIdx].Flags.getNonZeroByValAlign();
unsigned sz = DL.getTypeAllocSize(ETy);
@@ -1564,7 +1577,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<uint64_t, 16> Offsets;
auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
assert(PTy && "Type of a byval parameter should be pointer");
- ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
+ ComputePTXValueVTs(*this, DL, PTy->getPointerElementType(), VTs, &Offsets,
+ 0);
// declare .param .align <align> .b8 .param<n>[<size>];
unsigned sz = Outs[OIdx].Flags.getByValSize();
@@ -2434,7 +2448,7 @@ static bool isImageOrSamplerVal(const Value *arg, const Module *context) {
if (!context)
return false;
- auto *STy = dyn_cast<StructType>(PTy->getElementType());
+ auto *STy = dyn_cast<StructType>(PTy->getPointerElementType());
if (!STy || STy->isLiteral())
return false;
diff --git a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index fc0d5cc6fbfa..eeedce2d99cb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -57,12 +57,9 @@ bool NVPTXImageOptimizer::runOnFunction(Function &F) {
InstrToDelete.clear();
// Look for call instructions in the function
- for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;
- ++BI) {
- for (BasicBlock::iterator I = (*BI).begin(), E = (*BI).end();
- I != E; ++I) {
- Instruction &Instr = *I;
- if (CallInst *CI = dyn_cast<CallInst>(I)) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &Instr : BB) {
+ if (CallInst *CI = dyn_cast<CallInst>(&Instr)) {
Function *CalledF = CI->getCalledFunction();
if (CalledF && CalledF->isIntrinsic()) {
        // This is an intrinsic function call, check if it's an istypep
@@ -84,8 +81,8 @@ bool NVPTXImageOptimizer::runOnFunction(Function &F) {
}
// Delete any istypep instances we replaced in the IR
- for (unsigned i = 0, e = InstrToDelete.size(); i != e; ++i)
- InstrToDelete[i]->eraseFromParent();
+ for (Instruction *I : InstrToDelete)
+ I->eraseFromParent();
return Changed;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 96386af569de..22e200e77831 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -48,6 +48,7 @@ def CvtRN : PatLeaf<(i32 0x5)>;
def CvtRZ : PatLeaf<(i32 0x6)>;
def CvtRM : PatLeaf<(i32 0x7)>;
def CvtRP : PatLeaf<(i32 0x8)>;
+def CvtRNA : PatLeaf<(i32 0x9)>;
def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
def CvtRNI_FTZ : PatLeaf<(i32 0x11)>;
@@ -62,6 +63,10 @@ def CvtRP_FTZ : PatLeaf<(i32 0x18)>;
def CvtSAT : PatLeaf<(i32 0x20)>;
def CvtSAT_FTZ : PatLeaf<(i32 0x30)>;
+def CvtNONE_RELU : PatLeaf<(i32 0x40)>;
+def CvtRN_RELU : PatLeaf<(i32 0x45)>;
+def CvtRZ_RELU : PatLeaf<(i32 0x46)>;
+
def CvtMode : Operand<i32> {
let PrintMethod = "printCvtMode";
}
@@ -249,6 +254,32 @@ multiclass F3<string OpcStr, SDNode OpNode> {
(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
+
+ def f16rr_ftz :
+ NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b),
+ !strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
+ [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+ Requires<[useFP16Math, doF32FTZ]>;
+ def f16rr :
+ NVPTXInst<(outs Float16Regs:$dst),
+ (ins Float16Regs:$a, Float16Regs:$b),
+ !strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
+ [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
+ Requires<[useFP16Math]>;
+
+ def f16x2rr_ftz :
+ NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b),
+ !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
+ [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+ Requires<[useFP16Math, doF32FTZ]>;
+ def f16x2rr :
+ NVPTXInst<(outs Float16x2Regs:$dst),
+ (ins Float16x2Regs:$a, Float16x2Regs:$b),
+ !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
+ [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+ Requires<[useFP16Math]>;
}
// Template for instructions which take three FP args. The
@@ -500,6 +531,29 @@ let hasSideEffects = false in {
"cvt.s64.s16 \t$dst, $src;", []>;
def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
"cvt.s64.s32 \t$dst, $src;", []>;
+
+multiclass CVT_FROM_FLOAT_SM80<string FromName, RegisterClass RC> {
+ def _f32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Float32Regs:$src, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:relu}.",
+ FromName, ".f32 \t$dst, $src;"), []>,
+ Requires<[hasPTX70, hasSM80]>;
+ }
+
+ defm CVT_bf16 : CVT_FROM_FLOAT_SM80<"bf16", Int16Regs>;
+
+ multiclass CVT_FROM_FLOAT_V2_SM80<string FromName, RegisterClass RC> {
+ def _f32 :
+ NVPTXInst<(outs RC:$dst),
+ (ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
+ !strconcat("cvt${mode:base}${mode:relu}.",
+ FromName, ".f32 \t$dst, $src1, $src2;"), []>,
+ Requires<[hasPTX70, hasSM80]>;
+ }
+
+ defm CVT_f16x2 : CVT_FROM_FLOAT_V2_SM80<"f16x2", Float16x2Regs>;
+ defm CVT_bf16x2 : CVT_FROM_FLOAT_V2_SM80<"bf16x2", Int32Regs>;
}
//-----------------------------------
@@ -842,6 +896,8 @@ defm FMUL : F3_fma_component<"mul", fmul>;
defm FMIN : F3<"min", fminnum>;
defm FMAX : F3<"max", fmaxnum>;
+defm FMINNAN : F3<"min.NaN", fminimum>;
+defm FMAXNAN : F3<"max.NaN", fmaximum>;
defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 511cd875ac55..ec069a0a02ae 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1046,6 +1046,38 @@ def : Pat<(int_nvvm_ui2f_rm Int32Regs:$a),
def : Pat<(int_nvvm_ui2f_rp Int32Regs:$a),
(CVT_f32_u32 Int32Regs:$a, CvtRP)>;
+def : Pat<(int_nvvm_ff2bf16x2_rn Float32Regs:$a, Float32Regs:$b),
+ (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
+def : Pat<(int_nvvm_ff2bf16x2_rn_relu Float32Regs:$a, Float32Regs:$b),
+ (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
+def : Pat<(int_nvvm_ff2bf16x2_rz Float32Regs:$a, Float32Regs:$b),
+ (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
+def : Pat<(int_nvvm_ff2bf16x2_rz_relu Float32Regs:$a, Float32Regs:$b),
+ (CVT_bf16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
+
+def : Pat<(int_nvvm_ff2f16x2_rn Float32Regs:$a, Float32Regs:$b),
+ (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN)>;
+def : Pat<(int_nvvm_ff2f16x2_rn_relu Float32Regs:$a, Float32Regs:$b),
+ (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRN_RELU)>;
+def : Pat<(int_nvvm_ff2f16x2_rz Float32Regs:$a, Float32Regs:$b),
+ (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ)>;
+def : Pat<(int_nvvm_ff2f16x2_rz_relu Float32Regs:$a, Float32Regs:$b),
+ (CVT_f16x2_f32 Float32Regs:$a, Float32Regs:$b, CvtRZ_RELU)>;
+
+def : Pat<(int_nvvm_f2bf16_rn Float32Regs:$a),
+ (CVT_bf16_f32 Float32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_f2bf16_rn_relu Float32Regs:$a),
+ (CVT_bf16_f32 Float32Regs:$a, CvtRN_RELU)>;
+def : Pat<(int_nvvm_f2bf16_rz Float32Regs:$a),
+ (CVT_bf16_f32 Float32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_f2bf16_rz_relu Float32Regs:$a),
+ (CVT_bf16_f32 Float32Regs:$a, CvtRZ_RELU)>;
+
+def CVT_tf32_f32 :
+ NVPTXInst<(outs Int32Regs:$dest), (ins Float32Regs:$a),
+ "cvt.rna.tf32.f32 \t$dest, $a;",
+ [(set Int32Regs:$dest, (int_nvvm_f2tf32_rna Float32Regs:$a))]>;
+
def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index 6cf59d285e8d..f655f25602bc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -66,10 +66,9 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
// Collect all aggregate loads and mem* calls.
- for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
- for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
- ++II) {
- if (LoadInst *LI = dyn_cast<LoadInst>(II)) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (!LI->hasOneUse())
continue;
@@ -81,7 +80,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
continue;
AggrLoads.push_back(LI);
}
- } else if (MemIntrinsic *IntrCall = dyn_cast<MemIntrinsic>(II)) {
+ } else if (MemIntrinsic *IntrCall = dyn_cast<MemIntrinsic>(&I)) {
// Convert intrinsic calls with variable size or with constant size
// larger than the MaxAggrCopySize threshold.
if (ConstantInt *LenCI = dyn_cast<ConstantInt>(IntrCall->getLength())) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index ddb7f097fe68..67aa49132016 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -233,7 +233,7 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
assert(PType && "Expecting pointer type in handleByValParam");
- Type *StructType = PType->getElementType();
+ Type *StructType = PType->getPointerElementType();
auto IsALoadChain = [&](Value *Start) {
SmallVector<Value *, 16> ValuesToCheck = {Start};
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 05c20369abf4..5a6440c91fca 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -49,8 +49,8 @@ NVPTXSubtarget::NVPTXSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const NVPTXTargetMachine &TM)
: NVPTXGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), PTXVersion(0),
- SmVersion(20), TM(TM), InstrInfo(),
- TLInfo(TM, initializeSubtargetDependencies(CPU, FS)), FrameLowering() {}
+ SmVersion(20), TM(TM),
+ TLInfo(TM, initializeSubtargetDependencies(CPU, FS)) {}
bool NVPTXSubtarget::hasImageHandles() const {
// Enable handles for Kepler+, where CUDA supports indirect surfaces and
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index 366d92a5a805..4645671a0cd8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -17,7 +17,7 @@ namespace llvm {
class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
public:
- NVPTXTargetObjectFile() : TargetLoweringObjectFile() {}
+ NVPTXTargetObjectFile() {}
~NVPTXTargetObjectFile() override;
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 74d129d330f3..2d6d72777db2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -286,8 +286,7 @@ bool getAlign(const Function &F, unsigned index, unsigned &align) {
bool retval = findAllNVVMAnnotation(&F, "align", Vs);
if (!retval)
return false;
- for (int i = 0, e = Vs.size(); i < e; i++) {
- unsigned v = Vs[i];
+ for (unsigned v : Vs) {
if ((v >> 16) == index) {
align = v & 0xFFFF;
return true;
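(Aside: getAlign keeps the existing decoding of NVVM "align" annotation values, where each entry packs a parameter index in the high 16 bits and the alignment in the low 16 bits. A self-contained sketch of that packing scheme with plain integers, no NVVM metadata plumbing, helper names illustrative:)

    #include <cassert>
    #include <cstdint>

    // Mirror of the (v >> 16) == index and v & 0xFFFF checks in getAlign().
    static uint32_t packAlign(uint32_t Index, uint32_t Align) {
      return (Index << 16) | (Align & 0xFFFF);
    }
    static bool unpackAlign(uint32_t V, uint32_t Index, uint32_t &Align) {
      if ((V >> 16) != Index)
        return false;
      Align = V & 0xFFFF;
      return true;
    }

    int main() {
      uint32_t A = 0;
      assert(unpackAlign(packAlign(/*Index=*/2, /*Align=*/8), 2, A) && A == 8);
      return 0;
    }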
diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index ded922329ebf..715cff72dcab 100644
--- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -121,6 +121,7 @@ class PPCAsmParser : public MCTargetAsmParser {
bool ParseDirectiveMachine(SMLoc L);
bool ParseDirectiveAbiVersion(SMLoc L);
bool ParseDirectiveLocalEntry(SMLoc L);
+ bool ParseGNUAttribute(SMLoc L);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
@@ -201,7 +202,8 @@ struct PPCOperand : public MCParsedAsmOperand {
struct TLSRegOp TLSReg;
};
- PPCOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+ PPCOperand(KindTy K) : Kind(K) {}
+
public:
PPCOperand(const PPCOperand &o) : MCParsedAsmOperand() {
Kind = o.Kind;
@@ -1604,6 +1606,8 @@ bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) {
ParseDirectiveAbiVersion(DirectiveID.getLoc());
else if (IDVal == ".localentry")
ParseDirectiveLocalEntry(DirectiveID.getLoc());
+ else if (IDVal.startswith(".gnu_attribute"))
+ ParseGNUAttribute(DirectiveID.getLoc());
else
return true;
return false;
@@ -1719,7 +1723,16 @@ bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) {
return false;
}
+bool PPCAsmParser::ParseGNUAttribute(SMLoc L) {
+ int64_t Tag;
+ int64_t IntegerValue;
+ if (!getParser().parseGNUAttribute(L, Tag, IntegerValue))
+ return false;
+
+ getParser().getStreamer().emitGNUAttribute(Tag, IntegerValue);
+ return true;
+}
/// Force static initialization.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmParser() {
diff --git a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
index 7d64816ed6c7..0cd8350e3fdd 100644
--- a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
+++ b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
@@ -65,8 +65,7 @@ private:
PPCInstructionSelector::PPCInstructionSelector(const PPCTargetMachine &TM,
const PPCSubtarget &STI,
const PPCRegisterBankInfo &RBI)
- : InstructionSelector(), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI),
+ : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "PPCGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
index 0ca8587ba483..b92b0fc342ec 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
@@ -40,9 +40,8 @@ PPCELFStreamer::PPCELFStreamer(MCContext &Context,
std::unique_ptr<MCAsmBackend> MAB,
std::unique_ptr<MCObjectWriter> OW,
std::unique_ptr<MCCodeEmitter> Emitter)
- : MCELFStreamer(Context, std::move(MAB), std::move(OW),
- std::move(Emitter)), LastLabel(NULL) {
-}
+ : MCELFStreamer(Context, std::move(MAB), std::move(OW), std::move(Emitter)),
+ LastLabel(nullptr) {}
void PPCELFStreamer::emitPrefixedInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) {
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index d6e02d0d0862..a651362f703b 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -271,14 +271,14 @@ private:
MCAssembler &MCA = getStreamer().getAssembler();
int64_t Offset;
if (!LocalOffset->evaluateAsAbsolute(Offset, MCA))
- MCA.getContext().reportFatalError(
- LocalOffset->getLoc(), ".localentry expression must be absolute.");
+ MCA.getContext().reportError(LocalOffset->getLoc(),
+ ".localentry expression must be absolute");
switch (Offset) {
default:
- MCA.getContext().reportFatalError(
- LocalOffset->getLoc(),
- ".localentry expression is not a valid power of 2.");
+ MCA.getContext().reportError(
+ LocalOffset->getLoc(), ".localentry expression must be a power of 2");
+ return 0;
case 0:
return 0;
case 1:
diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td
index f3ae0010ad8e..edd3b42d47e1 100644
--- a/llvm/lib/Target/PowerPC/P10InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P10InstrResources.td
@@ -409,8 +409,8 @@ def : InstRW<[P10W_DF_13C, P10W_DISP_ANY, P10DF_Read, P10DF_Read, P10DF_Read],
// 13 Cycles Decimal Floating Point operations, and 3 Cycles Store operations, 2 input operands
def : InstRW<[P10W_DF_13C, P10W_DISP_EVEN, P10W_ST_3C, P10W_DISP_ANY],
(instrs
- HASHST,
- HASHSTP
+ HASHST, HASHST8,
+ HASHSTP, HASHSTP8
)>;
// 24 Cycles Decimal Floating Point operations, 1 input operands
@@ -619,6 +619,8 @@ def : InstRW<[P10W_DX_5C, P10W_DISP_ANY, P10DX_Read, P10DX_Read],
XSCMPEXPQP,
XSCMPOQP,
XSCMPUQP,
+ XSMAXCQP,
+ XSMINCQP,
XSTSTDCQP,
XXGENPCVBM
)>;
@@ -1336,8 +1338,8 @@ def : InstRW<[P10W_LD_6C, P10W_DISP_ANY, P10LD_Read, P10LD_Read],
// 6 Cycles Load operations, and 13 Cycles Decimal Floating Point operations, 2 input operands
def : InstRW<[P10W_LD_6C, P10W_DISP_EVEN, P10W_DF_13C, P10W_DISP_ANY],
(instrs
- HASHCHK,
- HASHCHKP
+ HASHCHK, HASHCHK8,
+ HASHCHKP, HASHCHKP8
)>;
// Single crack instructions
diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td
index f7c049951c54..c088d7847ce4 100644
--- a/llvm/lib/Target/PowerPC/P9InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P9InstrResources.td
@@ -1415,7 +1415,7 @@ def : InstRW<[],
(instregex "NOP_GT_PWR(6|7)$"),
(instregex "TLB(IA|IVAX|SX|SX2|SX2D|LD|LI|RE|RE2|WE|WE2)$"),
(instregex "WRTEE(I)?$"),
- (instregex "HASH(ST|STP|CHK|CHKP)$"),
+ (instregex "HASH(ST|STP|CHK|CHKP)(8)?$"),
ATTN,
CLRBHRB,
MFBHRBE,
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index f26c15667a0b..780981806996 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -109,6 +109,23 @@ struct DenseMapInfo<std::pair<const MCSymbol *, MCSymbolRefExpr::VariantKind>> {
namespace {
+enum {
+ // GNU attribute tags for PowerPC ABI
+ Tag_GNU_Power_ABI_FP = 4,
+ Tag_GNU_Power_ABI_Vector = 8,
+ Tag_GNU_Power_ABI_Struct_Return = 12,
+
+ // GNU attribute values for PowerPC float ABI, as a combination of two parts
+ Val_GNU_Power_ABI_NoFloat = 0b00,
+ Val_GNU_Power_ABI_HardFloat_DP = 0b01,
+ Val_GNU_Power_ABI_SoftFloat_DP = 0b10,
+ Val_GNU_Power_ABI_HardFloat_SP = 0b11,
+
+ Val_GNU_Power_ABI_LDBL_IBM128 = 0b0100,
+ Val_GNU_Power_ABI_LDBL_64 = 0b1000,
+ Val_GNU_Power_ABI_LDBL_IEEE128 = 0b1100,
+};
+
class PPCAsmPrinter : public AsmPrinter {
protected:
// For TLS on AIX, we need to be able to identify TOC entries of specific
@@ -178,6 +195,8 @@ public:
return "Linux PPC Assembly Printer";
}
+ void emitGNUAttributes(Module &M);
+
void emitStartOfAsmFile(Module &M) override;
void emitEndOfAsmFile(Module &) override;
@@ -1388,6 +1407,28 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
}
+void PPCLinuxAsmPrinter::emitGNUAttributes(Module &M) {
+ // Emit float ABI into GNU attribute
+ Metadata *MD = M.getModuleFlag("float-abi");
+ MDString *FloatABI = dyn_cast_or_null<MDString>(MD);
+ if (!FloatABI)
+ return;
+ StringRef flt = FloatABI->getString();
+ // TODO: Support emitting soft-fp and hard double/single attributes.
+ if (flt == "doubledouble")
+ OutStreamer->emitGNUAttribute(Tag_GNU_Power_ABI_FP,
+ Val_GNU_Power_ABI_HardFloat_DP |
+ Val_GNU_Power_ABI_LDBL_IBM128);
+ else if (flt == "ieeequad")
+ OutStreamer->emitGNUAttribute(Tag_GNU_Power_ABI_FP,
+ Val_GNU_Power_ABI_HardFloat_DP |
+ Val_GNU_Power_ABI_LDBL_IEEE128);
+ else if (flt == "ieeedouble")
+ OutStreamer->emitGNUAttribute(Tag_GNU_Power_ABI_FP,
+ Val_GNU_Power_ABI_HardFloat_DP |
+ Val_GNU_Power_ABI_LDBL_64);
+}
+
void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
if (!Subtarget->isPPC64())
return PPCAsmPrinter::emitInstruction(MI);
@@ -1642,6 +1683,8 @@ void PPCLinuxAsmPrinter::emitEndOfAsmFile(Module &M) {
PPCTargetStreamer *TS =
static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
+ emitGNUAttributes(M);
+
if (!TOC.empty()) {
const char *Name = isPPC64 ? ".toc" : ".got2";
MCSectionELF *Section = OutContext.getELFSection(
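(Aside: the new emitGNUAttributes hook reads the "float-abi" module flag and ORs one of the float values with one of the long-double values before emitting a single Tag_GNU_Power_ABI_FP attribute. Below is a minimal sketch of producing and consuming such a flag; the flag name and the doubledouble mapping come from the patch, while the module setup and the standalone main are illustrative only:)

    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/Casting.h"
    using namespace llvm;

    int main() {
      LLVMContext Ctx;
      Module M("demo", Ctx);
      // A frontend records the chosen float ABI as a module flag.
      M.addModuleFlag(Module::Error, "float-abi",
                      MDString::get(Ctx, "doubledouble"));

      // The asm-printer side fetches and interprets the flag.
      if (auto *S = dyn_cast_or_null<MDString>(M.getModuleFlag("float-abi"))) {
        if (S->getString() == "doubledouble") {
          // HardFloat_DP (0b01) | LDBL_IBM128 (0b0100) == 5,
          // i.e. ".gnu_attribute 4, 5" in the emitted assembly.
          unsigned Val = 0b01 | 0b0100;
          (void)Val;
        }
      }
      return 0;
    }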
diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index 856569bc8a73..e7cd107c5046 100644
--- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -150,7 +150,7 @@ class PPCFastISel final : public FastISel {
unsigned copyRegToRegClass(const TargetRegisterClass *ToRC,
unsigned SrcReg, unsigned Flag = 0,
unsigned SubReg = 0) {
- unsigned TmpReg = createResultReg(ToRC);
+ Register TmpReg = createResultReg(ToRC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), TmpReg).addReg(SrcReg, Flag, SubReg);
return TmpReg;
@@ -428,7 +428,7 @@ void PPCFastISel::PPCSimplifyAddress(Address &Addr, bool &UseOffset,
// put the alloca address into a register, set the base type back to
// register and continue. This should almost never happen.
if (!UseOffset && Addr.BaseType == Address::FrameIndexBase) {
- unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+ Register ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8),
ResultReg).addFrameIndex(Addr.Base.FI).addImm(0);
Addr.Base.Reg = ResultReg;
@@ -604,7 +604,7 @@ bool PPCFastISel::SelectLoad(const Instruction *I) {
// Look at the currently assigned register for this instruction
// to determine the required register class. This is necessary
// to constrain RA from using R0/X0 when this is not legal.
- unsigned AssignedReg = FuncInfo.ValueMap[I];
+ Register AssignedReg = FuncInfo.ValueMap[I];
const TargetRegisterClass *RC =
AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
@@ -783,7 +783,7 @@ bool PPCFastISel::SelectBranch(const Instruction *I) {
PPCPred = PPC::InvertPredicate(PPCPred);
}
- unsigned CondReg = createResultReg(&PPC::CRRCRegClass);
+ Register CondReg = createResultReg(&PPC::CRRCRegClass);
if (!PPCEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
CondReg, PPCPred))
@@ -847,7 +847,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
}
}
- unsigned SrcReg1 = getRegForValue(SrcValue1);
+ Register SrcReg1 = getRegForValue(SrcValue1);
if (SrcReg1 == 0)
return false;
@@ -928,13 +928,13 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
}
if (NeedsExt) {
- unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
+ Register ExtReg = createResultReg(&PPC::GPRCRegClass);
if (!PPCEmitIntExt(SrcVT, SrcReg1, MVT::i32, ExtReg, IsZExt))
return false;
SrcReg1 = ExtReg;
if (!UseImm) {
- unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
+ Register ExtReg = createResultReg(&PPC::GPRCRegClass);
if (!PPCEmitIntExt(SrcVT, SrcReg2, MVT::i32, ExtReg, IsZExt))
return false;
SrcReg2 = ExtReg;
@@ -960,7 +960,7 @@ bool PPCFastISel::SelectFPExt(const Instruction *I) {
if (SrcVT != MVT::f32 || DestVT != MVT::f64)
return false;
- unsigned SrcReg = getRegForValue(Src);
+ Register SrcReg = getRegForValue(Src);
if (!SrcReg)
return false;
@@ -978,7 +978,7 @@ bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
if (SrcVT != MVT::f64 || DestVT != MVT::f32)
return false;
- unsigned SrcReg = getRegForValue(Src);
+ Register SrcReg = getRegForValue(Src);
if (!SrcReg)
return false;
@@ -1019,7 +1019,7 @@ unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
// If necessary, extend 32-bit int to 64-bit.
if (SrcVT == MVT::i32) {
- unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+ Register TmpReg = createResultReg(&PPC::G8RCRegClass);
if (!PPCEmitIntExt(MVT::i32, SrcReg, MVT::i64, TmpReg, !IsSigned))
return 0;
SrcReg = TmpReg;
@@ -1079,7 +1079,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
SrcVT != MVT::i32 && SrcVT != MVT::i64)
return false;
- unsigned SrcReg = getRegForValue(Src);
+ Register SrcReg = getRegForValue(Src);
if (SrcReg == 0)
return false;
@@ -1091,7 +1091,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
else
Opc = IsSigned ? PPC::EFDCFSI : PPC::EFDCFUI;
- unsigned DestReg = createResultReg(&PPC::SPERCRegClass);
+ Register DestReg = createResultReg(&PPC::SPERCRegClass);
// Generate the convert.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
.addReg(SrcReg);
@@ -1114,7 +1114,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
// Extend the input if necessary.
if (SrcVT == MVT::i8 || SrcVT == MVT::i16) {
- unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+ Register TmpReg = createResultReg(&PPC::G8RCRegClass);
if (!PPCEmitIntExt(SrcVT, SrcReg, MVT::i64, TmpReg, !IsSigned))
return false;
SrcVT = MVT::i64;
@@ -1128,7 +1128,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
// Determine the opcode for the conversion.
const TargetRegisterClass *RC = &PPC::F8RCRegClass;
- unsigned DestReg = createResultReg(RC);
+ Register DestReg = createResultReg(RC);
unsigned Opc;
if (DstVT == MVT::f32)
@@ -1170,7 +1170,7 @@ unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
// Look at the currently assigned register for this instruction
// to determine the required register class.
- unsigned AssignedReg = FuncInfo.ValueMap[I];
+ Register AssignedReg = FuncInfo.ValueMap[I];
const TargetRegisterClass *RC =
AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
@@ -1206,7 +1206,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
return false;
- unsigned SrcReg = getRegForValue(Src);
+ Register SrcReg = getRegForValue(Src);
if (SrcReg == 0)
return false;
@@ -1276,7 +1276,7 @@ bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
// Look at the currently assigned register for this instruction
// to determine the required register class. If there is no register,
// make a conservative choice (don't assign R0).
- unsigned AssignedReg = FuncInfo.ValueMap[I];
+ Register AssignedReg = FuncInfo.ValueMap[I];
const TargetRegisterClass *RC =
(AssignedReg ? MRI.getRegClass(AssignedReg) :
&PPC::GPRC_and_GPRC_NOR0RegClass);
@@ -1296,8 +1296,8 @@ bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
break;
}
- unsigned ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass);
- unsigned SrcReg1 = getRegForValue(I->getOperand(0));
+ Register ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass);
+ Register SrcReg1 = getRegForValue(I->getOperand(0));
if (SrcReg1 == 0) return false;
// Handle case of small immediate operand.
@@ -1355,7 +1355,7 @@ bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
}
// Reg-reg case.
- unsigned SrcReg2 = getRegForValue(I->getOperand(1));
+ Register SrcReg2 = getRegForValue(I->getOperand(1));
if (SrcReg2 == 0) return false;
// Reverse operands for subtract-from.
@@ -1441,7 +1441,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
MVT DestVT = VA.getLocVT();
const TargetRegisterClass *RC =
(DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
- unsigned TmpReg = createResultReg(RC);
+ Register TmpReg = createResultReg(RC);
if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/false))
llvm_unreachable("Failed to emit a sext!");
ArgVT = DestVT;
@@ -1453,7 +1453,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
MVT DestVT = VA.getLocVT();
const TargetRegisterClass *RC =
(DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
- unsigned TmpReg = createResultReg(RC);
+ Register TmpReg = createResultReg(RC);
if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/true))
llvm_unreachable("Failed to emit a zext!");
ArgVT = DestVT;
@@ -1628,7 +1628,7 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (ArgVT.isVector() || ArgVT == MVT::f128)
return false;
- unsigned Arg = getRegForValue(ArgValue);
+ Register Arg = getRegForValue(ArgValue);
if (Arg == 0)
return false;
@@ -1734,7 +1734,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
RetRegs.push_back(RetReg);
} else {
- unsigned Reg = getRegForValue(RV);
+ Register Reg = getRegForValue(RV);
if (Reg == 0)
return false;
@@ -1767,7 +1767,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
case CCValAssign::ZExt: {
const TargetRegisterClass *RC =
(DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
- unsigned TmpReg = createResultReg(RC);
+ Register TmpReg = createResultReg(RC);
if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, true))
return false;
SrcReg = TmpReg;
@@ -1776,7 +1776,7 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
case CCValAssign::SExt: {
const TargetRegisterClass *RC =
(DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
- unsigned TmpReg = createResultReg(RC);
+ Register TmpReg = createResultReg(RC);
if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, false))
return false;
SrcReg = TmpReg;
@@ -1857,7 +1857,7 @@ bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
// Attempt to fast-select an indirect branch instruction.
bool PPCFastISel::SelectIndirectBr(const Instruction *I) {
- unsigned AddrReg = getRegForValue(I->getOperand(0));
+ Register AddrReg = getRegForValue(I->getOperand(0));
if (AddrReg == 0)
return false;
@@ -1884,7 +1884,7 @@ bool PPCFastISel::SelectTrunc(const Instruction *I) {
if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
return false;
- unsigned SrcReg = getRegForValue(Src);
+ Register SrcReg = getRegForValue(Src);
if (!SrcReg)
return false;
@@ -1903,7 +1903,7 @@ bool PPCFastISel::SelectIntExt(const Instruction *I) {
Type *SrcTy = Src->getType();
bool IsZExt = isa<ZExtInst>(I);
- unsigned SrcReg = getRegForValue(Src);
+ Register SrcReg = getRegForValue(Src);
if (!SrcReg) return false;
EVT SrcEVT, DestEVT;
@@ -1921,12 +1921,12 @@ bool PPCFastISel::SelectIntExt(const Instruction *I) {
// instruction, use it. Otherwise pick the register class of the
// correct size that does not contain X0/R0, since we don't know
// whether downstream uses permit that assignment.
- unsigned AssignedReg = FuncInfo.ValueMap[I];
+ Register AssignedReg = FuncInfo.ValueMap[I];
const TargetRegisterClass *RC =
(AssignedReg ? MRI.getRegClass(AssignedReg) :
(DestVT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
&PPC::GPRC_and_GPRC_NOR0RegClass));
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
if (!PPCEmitIntExt(SrcVT, SrcReg, DestVT, ResultReg, IsZExt))
return false;
@@ -1966,15 +1966,6 @@ bool PPCFastISel::fastSelectInstruction(const Instruction *I) {
return SelectBinaryIntOp(I, ISD::OR);
case Instruction::Sub:
return SelectBinaryIntOp(I, ISD::SUB);
- case Instruction::Call:
- // On AIX, call lowering uses the DAG-ISEL path currently so that the
- // callee of the direct function call instruction will be mapped to the
- // symbol for the function's entry point, which is distinct from the
- // function descriptor symbol. The latter is the symbol whose XCOFF symbol
- // name is the C-linkage name of the source level function.
- if (TM.getTargetTriple().isOSAIX())
- break;
- return selectCall(I);
case Instruction::Ret:
return SelectRet(I);
case Instruction::Trunc:
@@ -2012,7 +2003,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
else
RC = ((VT == MVT::f32) ? &PPC::F4RCRegClass : &PPC::F8RCRegClass);
- unsigned DestReg = createResultReg(RC);
+ Register DestReg = createResultReg(RC);
CodeModel::Model CModel = TM.getCodeModel();
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
@@ -2026,7 +2017,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
else
Opc = ((VT == MVT::f32) ? PPC::LFS : PPC::LFD);
- unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+ Register TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
PPCFuncInfo->setUsesTOCBasePtr();
// For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)).
@@ -2043,7 +2034,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
// But for large code model, we must generate a LDtocL followed
// by the LF[SD].
if (CModel == CodeModel::Large) {
- unsigned TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+ Register TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
TmpReg2).addConstantPoolIndex(Idx).addReg(TmpReg);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg)
@@ -2068,7 +2059,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
assert(VT == MVT::i64 && "Non-address!");
const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass;
- unsigned DestReg = createResultReg(RC);
+ Register DestReg = createResultReg(RC);
// Global values may be plain old object addresses, TLS object
// addresses, constant pool entries, or jump tables. How we generate
@@ -2083,6 +2074,12 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
if (GV->isThreadLocal())
return 0;
+ // If the global has the toc-data attribute then fall back to DAG-ISEL.
+ if (TM.getTargetTriple().isOSAIX())
+ if (const GlobalVariable *Var = dyn_cast_or_null<GlobalVariable>(GV))
+ if (Var->hasAttribute("toc-data"))
+ return false;
+
PPCFuncInfo->setUsesTOCBasePtr();
// For small code model, generate a simple TOC load.
if (CModel == CodeModel::Small)
@@ -2099,7 +2096,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
// Otherwise we generate:
// ADDItocL(ADDIStocHA8(%x2, GV), GV)
// Either way, start with the ADDIStocHA8:
- unsigned HighPartReg = createResultReg(RC);
+ Register HighPartReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA8),
HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
@@ -2123,7 +2120,7 @@ unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm,
unsigned Lo = Imm & 0xFFFF;
unsigned Hi = (Imm >> 16) & 0xFFFF;
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
if (isInt<16>(Imm))
@@ -2132,7 +2129,7 @@ unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm,
.addImm(Imm);
else if (Lo) {
// Both Lo and Hi have nonzero bits.
- unsigned TmpReg = createResultReg(RC);
+ Register TmpReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), TmpReg)
.addImm(Hi);
@@ -2195,7 +2192,7 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm,
TmpReg3 = TmpReg2;
if ((Lo = Remainder & 0xFFFF)) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ORI8),
ResultReg).addReg(TmpReg3).addImm(Lo);
return ResultReg;
@@ -2211,7 +2208,7 @@ unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT,
// If we're using CR bit registers for i1 values, handle that as a special
// case first.
if (VT == MVT::i1 && Subtarget->useCRBits()) {
- unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
+ Register ImmReg = createResultReg(&PPC::CRBITRCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg);
return ImmReg;
@@ -2231,7 +2228,7 @@ unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT,
// a range of 0..0x7fff.
if (isInt<16>(Imm)) {
unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
- unsigned ImmReg = createResultReg(RC);
+ Register ImmReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ImmReg)
.addImm(Imm);
return ImmReg;
@@ -2283,7 +2280,7 @@ unsigned PPCFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end()) {
- unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+ Register ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDI8),
ResultReg).addFrameIndex(SI->second).addImm(0);
return ResultReg;
@@ -2393,7 +2390,7 @@ unsigned PPCFastISel::fastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
// If we're using CR bit registers for i1 values, handle that as a special
// case first.
if (VT == MVT::i1 && Subtarget->useCRBits()) {
- unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
+ Register ImmReg = createResultReg(&PPC::CRBITRCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Imm == 0 ? PPC::CRUNSET : PPC::CRSET), ImmReg);
return ImmReg;
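(Aside: the unsigned-to-Register changes through this file are purely mechanical. llvm::Register constructs implicitly from an unsigned, converts back, and compares against raw register numbers, so checks such as "if (SrcReg == 0)" keep their meaning while the value gains isVirtual()/isPhysical() helpers. A tiny sketch under that assumption, helper name illustrative:)

    #include "llvm/CodeGen/Register.h"
    using llvm::Register;

    // Register behaves like the unsigned it wraps, so the mechanical
    // replacement above does not change any of the == 0 failure checks.
    static bool isAllocatedReg(Register R) {
      if (R == 0)            // still the "no register" sentinel
        return false;
      return R.isVirtual() || R.isPhysical();
    }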
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 3ca563fee970..65c969c196e1 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -674,7 +674,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
: PPC::MFCR);
const MCInstrDesc &StoreWordInst = TII.get(isPPC64 ? PPC::STW8 : PPC::STW);
const MCInstrDesc &HashST =
- TII.get(HasPrivileged ? PPC::HASHSTP : PPC::HASHST);
+ TII.get(isPPC64 ? (HasPrivileged ? PPC::HASHSTP8 : PPC::HASHST8)
+ : (HasPrivileged ? PPC::HASHSTP : PPC::HASHST));
// Regarding this assert: Even though LR is saved in the caller's frame (i.e.,
// LROffset is positive), that slot is callee-owned. Because PPC32 SVR4 has no
@@ -1172,7 +1173,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
// CFA.
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue;
// This is a bit of a hack: CR2LT, CR2GT, CR2EQ and CR2UN are just
@@ -1195,7 +1196,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
// In the ELFv1 ABI, only CR2 is noted in CFI and stands in for
// the whole CR word. In the ELFv2 ABI, every CR that was
// actually saved gets its own CFI record.
- unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2;
+ Register CRReg = isELFv2ABI? Reg : PPC::CR2;
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
nullptr, MRI->getDwarfRegNum(CRReg, true), CRSaveOffset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -1590,7 +1591,8 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
const MCInstrDesc& MoveToCRInst = TII.get( isPPC64 ? PPC::MTOCRF8
: PPC::MTOCRF);
const MCInstrDesc &HashChk =
- TII.get(HasPrivileged ? PPC::HASHCHKP : PPC::HASHCHK);
+ TII.get(isPPC64 ? (HasPrivileged ? PPC::HASHCHKP8 : PPC::HASHCHK8)
+ : (HasPrivileged ? PPC::HASHCHKP : PPC::HASHCHK));
int64_t LROffset = getReturnSaveOffset();
int64_t FPOffset = 0;
@@ -2085,7 +2087,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
SmallVector<CalleeSavedInfo, 18> VRegs;
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
assert((!MF.getInfo<PPCFunctionInfo>()->mustSaveTOC() ||
(Reg != PPC::X2 && Reg != PPC::R2)) &&
"Not expecting to try to spill R2 in a function that must save TOC");
@@ -2337,7 +2339,7 @@ bool PPCFrameLowering::assignCalleeSavedSpillSlots(
if (BVAllocatable.none())
return false;
- unsigned Reg = CS.getReg();
+ Register Reg = CS.getReg();
if (!PPC::G8RCRegClass.contains(Reg)) {
AllSpilledToReg = false;
@@ -2395,7 +2397,7 @@ bool PPCFrameLowering::spillCalleeSavedRegisters(
});
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
// CR2 through CR4 are the nonvolatile CR fields.
bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
@@ -2581,7 +2583,7 @@ bool PPCFrameLowering::restoreCalleeSavedRegisters(
--BeforeI;
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned Reg = CSI[i].getReg();
+ Register Reg = CSI[i].getReg();
if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC)
continue;
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index ba74af5ef5f7..fdcf6e7e80f2 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -1365,8 +1365,7 @@ class BitPermutationSelector {
ValueBit(SDValue V, unsigned I, Kind K = Variable)
: V(V), Idx(I), K(K) {}
- ValueBit(Kind K = Variable)
- : V(SDValue(nullptr, 0)), Idx(UINT32_MAX), K(K) {}
+ ValueBit(Kind K = Variable) : Idx(UINT32_MAX), K(K) {}
bool isZero() const {
return K == ConstZero || K == VariableKnownToBeZero;
@@ -4438,7 +4437,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
// Force the ccreg into CR7.
SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
- SDValue InFlag(nullptr, 0); // Null incoming flag value.
+ SDValue InFlag; // Null incoming flag value.
CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg,
InFlag).getValue(1);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 8d6edf07bc53..25cc34badda0 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2433,7 +2433,7 @@ unsigned PPC::getSplatIdxForPPCMnemonics(SDNode *N, unsigned EltSize,
/// the constant being splatted. The ByteSize field indicates the number of
/// bytes of each element [124] -> [bhw].
SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
- SDValue OpVal(nullptr, 0);
+ SDValue OpVal;
// If ByteSize of the splat is bigger than the element size of the
// build_vector, then we have a case where we are checking for a splat where
@@ -3508,8 +3508,9 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
int ShuffV[] = {1, 0, 3, 2};
SDValue Shuff =
DAG.getVectorShuffle(MVT::v4i32, dl, SetCC32, SetCC32, ShuffV);
- return DAG.getBitcast(
- MVT::v2i64, DAG.getNode(ISD::AND, dl, MVT::v4i32, Shuff, SetCC32));
+ return DAG.getBitcast(MVT::v2i64,
+ DAG.getNode(CC == ISD::SETEQ ? ISD::AND : ISD::OR,
+ dl, MVT::v4i32, Shuff, SetCC32));
}
// We handle most of these in the usual way.
@@ -4078,8 +4079,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
// virtual ones.
if (VA.getLocVT() == MVT::f64 && Subtarget.hasSPE()) {
assert(i + 1 < e && "No second half of double precision argument");
- unsigned RegLo = MF.addLiveIn(VA.getLocReg(), RC);
- unsigned RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC);
+ Register RegLo = MF.addLiveIn(VA.getLocReg(), RC);
+ Register RegHi = MF.addLiveIn(ArgLocs[++i].getLocReg(), RC);
SDValue ArgValueLo = DAG.getCopyFromReg(Chain, dl, RegLo, MVT::i32);
SDValue ArgValueHi = DAG.getCopyFromReg(Chain, dl, RegHi, MVT::i32);
if (!Subtarget.isLittleEndian())
@@ -4087,7 +4088,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
ArgValue = DAG.getNode(PPCISD::BUILD_SPE64, dl, MVT::f64, ArgValueLo,
ArgValueHi);
} else {
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
ValVT == MVT::i1 ? MVT::i32 : ValVT);
if (ValVT == MVT::i1)
@@ -4179,7 +4180,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
// dereferencing the result of va_next.
for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
// Get an existing live-in vreg, or add a new one.
- unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
+ Register VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
if (!VReg)
VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);
@@ -4198,7 +4199,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
// on the stack.
for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
// Get an existing live-in vreg, or add a new one.
- unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
+ Register VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
if (!VReg)
VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);
@@ -4384,7 +4385,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
InVals.push_back(Arg);
if (GPR_idx != Num_GPR_Regs) {
- unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+ Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
EVT ObjType = EVT::getIntegerVT(*DAG.getContext(), ObjSize * 8);
@@ -4408,7 +4409,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
if (GPR_idx == Num_GPR_Regs)
break;
- unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ Register VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Addr = FIN;
@@ -4432,7 +4433,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
case MVT::i64:
if (Flags.isNest()) {
// The 'nest' parameter, if any, is passed in R11.
- unsigned VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
+ Register VReg = MF.addLiveIn(PPC::X11, &PPC::G8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
@@ -4445,7 +4446,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
// passed directly. Clang may use those instead of "byval" aggregate
// types to avoid forcing arguments to memory unnecessarily.
if (GPR_idx != Num_GPR_Regs) {
- unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+ Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
@@ -4491,7 +4492,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
// This can only ever happen in the presence of f32 array types,
// since otherwise we never run out of FPRs before running out
// of GPRs.
- unsigned VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
+ Register VReg = MF.addLiveIn(GPR[GPR_idx++], &PPC::G8RCRegClass);
FuncInfo->addLiveInAttr(VReg, Flags);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
@@ -4532,7 +4533,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
// passed directly. The latter are used to implement ELFv2 homogenous
// vector aggregates.
if (VR_idx != Num_VR_Regs) {
- unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+ Register VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
++VR_idx;
} else {
@@ -4591,7 +4592,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
// the result of va_next.
for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
GPR_idx < Num_GPR_Regs; ++GPR_idx) {
- unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ Register VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
@@ -7059,7 +7060,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg,
unsigned Offset) {
- const unsigned VReg = MF.addLiveIn(PhysReg, RegClass);
+ const Register VReg = MF.addLiveIn(PhysReg, RegClass);
// Since the callers side has left justified the aggregate in the
// register, we can simply store the entire register into the stack
// slot.
@@ -7156,7 +7157,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
(CCInfo.getNextStackOffset() - LinkageSize) / PtrByteSize;
GPRIndex < NumGPArgRegs; ++GPRIndex) {
- const unsigned VReg =
+ const Register VReg =
IsPPC64 ? MF.addLiveIn(GPR_64[GPRIndex], &PPC::G8RCRegClass)
: MF.addLiveIn(GPR_32[GPRIndex], &PPC::GPRCRegClass);
@@ -11178,13 +11179,17 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::STRICT_FP_TO_SINT:
case ISD::STRICT_FP_TO_UINT:
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT:
+ case ISD::FP_TO_UINT: {
// LowerFP_TO_INT() can only handle f32 and f64.
if (N->getOperand(N->isStrictFPOpcode() ? 1 : 0).getValueType() ==
MVT::ppcf128)
return;
- Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
+ SDValue LoweredValue = LowerFP_TO_INT(SDValue(N, 0), DAG, dl);
+ Results.push_back(LoweredValue);
+ if (N->isStrictFPOpcode())
+ Results.push_back(LoweredValue.getValue(1));
return;
+ }
case ISD::TRUNCATE: {
if (!N->getValueType(0).isVector())
return;
@@ -17890,7 +17895,7 @@ Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() &&
"Only support quadword now");
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- Type *ValTy = cast<PointerType>(AlignedAddr->getType())->getElementType();
+ Type *ValTy = AlignedAddr->getType()->getPointerElementType();
assert(ValTy->getPrimitiveSizeInBits() == 128);
Function *RMW = Intrinsic::getDeclaration(
M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation()));
@@ -17915,7 +17920,7 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
assert(EnableQuadwordAtomics && Subtarget.hasQuadwordAtomics() &&
"Only support quadword now");
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
- Type *ValTy = cast<PointerType>(AlignedAddr->getType())->getElementType();
+ Type *ValTy = AlignedAddr->getType()->getPointerElementType();
assert(ValTy->getPrimitiveSizeInBits() == 128);
Function *IntCmpXchg =
Intrinsic::getDeclaration(M, Intrinsic::ppc_cmpxchg_i128);
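(Aside on the LowerSETCC hunk above: it switches the combine of the two shuffled 32-bit halves from AND to OR when the predicate is SETNE. The identity it relies on, shown with scalar integers rather than SelectionDAG nodes; a correctness illustration, not the actual lowering:)

    #include <cassert>
    #include <cstdint>

    // A 64-bit compare built from two 32-bit half compares:
    // equality needs both halves equal (AND), inequality needs either
    // half to differ (OR) -- hence ISD::AND for SETEQ and ISD::OR for SETNE.
    static bool eq64(uint64_t A, uint64_t B) {
      return uint32_t(A >> 32) == uint32_t(B >> 32) && uint32_t(A) == uint32_t(B);
    }
    static bool ne64(uint64_t A, uint64_t B) {
      return uint32_t(A >> 32) != uint32_t(B >> 32) || uint32_t(A) != uint32_t(B);
    }

    int main() {
      assert(eq64(0x1122334455667788ULL, 0x1122334455667788ULL));
      assert(ne64(0x1122334455667788ULL, 0x1122334455667789ULL)); // low half differs
      assert(ne64(0xA122334455667788ULL, 0x1122334455667788ULL)); // high half differs
      return 0;
    }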
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 87b7f96112ec..eb52e4aa6273 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1456,4 +1456,4 @@ namespace llvm {
} // end namespace llvm
-#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+#endif // LLVM_LIB_TARGET_POWERPC_PPCISELLOWERING_H
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 58af8037f59c..eae8e36e475e 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1760,26 +1760,27 @@ defm FCTIWUZ : XForm_26r<63, 143, (outs f8rc:$frD), (ins f8rc:$frB),
// These instructions store a hash computed from the value of the link register
// and the value of the stack pointer.
-let mayStore = 1 in {
-def HASHST : XForm_XD6_RA5_RB5<31, 722, (outs),
- (ins g8rc:$RB, memrihash:$D_RA_XD),
- "hashst $RB, $D_RA_XD", IIC_IntGeneral, []>;
-def HASHSTP : XForm_XD6_RA5_RB5<31, 658, (outs),
+let mayStore = 1, Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+def HASHST8 : XForm_XD6_RA5_RB5<31, 722, (outs),
(ins g8rc:$RB, memrihash:$D_RA_XD),
- "hashstp $RB, $D_RA_XD", IIC_IntGeneral, []>;
+ "hashst $RB, $D_RA_XD", IIC_IntGeneral, []>;
+def HASHSTP8 : XForm_XD6_RA5_RB5<31, 658, (outs),
+ (ins g8rc:$RB, memrihash:$D_RA_XD),
+ "hashstp $RB, $D_RA_XD", IIC_IntGeneral, []>;
}
// These instructions check a hash computed from the value of the link register
// and the value of the stack pointer. The hasSideEffects flag is needed as the
// instruction may TRAP if the hash does not match the hash stored at the
// specified address.
-let mayLoad = 1, hasSideEffects = 1 in {
-def HASHCHK : XForm_XD6_RA5_RB5<31, 754, (outs),
- (ins g8rc:$RB, memrihash:$D_RA_XD),
- "hashchk $RB, $D_RA_XD", IIC_IntGeneral, []>;
-def HASHCHKP : XForm_XD6_RA5_RB5<31, 690, (outs),
+let mayLoad = 1, hasSideEffects = 1,
+ Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+def HASHCHK8 : XForm_XD6_RA5_RB5<31, 754, (outs),
(ins g8rc:$RB, memrihash:$D_RA_XD),
- "hashchkp $RB, $D_RA_XD", IIC_IntGeneral, []>;
+ "hashchk $RB, $D_RA_XD", IIC_IntGeneral, []>;
+def HASHCHKP8 : XForm_XD6_RA5_RB5<31, 690, (outs),
+ (ins g8rc:$RB, memrihash:$D_RA_XD),
+ "hashchkp $RB, $D_RA_XD", IIC_IntGeneral, []>;
}
let Interpretation64Bit = 1, isCodeGenOnly = 1, hasSideEffects = 1 in
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index a0fd2111de11..eada872c2a7d 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -2339,9 +2339,8 @@ bool PPCInstrInfo::ClobbersPredicate(MachineInstr &MI,
Found = true;
}
} else if (MO.isRegMask()) {
- for (TargetRegisterClass::iterator I = RC->begin(),
- IE = RC->end(); I != IE; ++I)
- if (MO.clobbersPhysReg(*I)) {
+ for (MCPhysReg R : *RC)
+ if (MO.clobbersPhysReg(R)) {
Pred.push_back(MO);
Found = true;
}
@@ -3253,7 +3252,7 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
Register Reg = MI.getOperand(i).getReg();
if (!Register::isVirtualRegister(Reg))
continue;
- unsigned TrueReg = TRI->lookThruCopyLike(Reg, MRI);
+ Register TrueReg = TRI->lookThruCopyLike(Reg, MRI);
if (Register::isVirtualRegister(TrueReg)) {
DefMI = MRI->getVRegDef(TrueReg);
if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8 ||
@@ -3502,8 +3501,8 @@ bool PPCInstrInfo::foldFrameOffset(MachineInstr &MI) const {
return false;
assert(ADDIMI && "There should be ADDIMI for valid ToBeChangedReg.");
- unsigned ToBeChangedReg = ADDIMI->getOperand(0).getReg();
- unsigned ScaleReg = ADDMI->getOperand(ScaleRegIdx).getReg();
+ Register ToBeChangedReg = ADDIMI->getOperand(0).getReg();
+ Register ScaleReg = ADDMI->getOperand(ScaleRegIdx).getReg();
auto NewDefFor = [&](unsigned Reg, MachineBasicBlock::iterator Start,
MachineBasicBlock::iterator End) {
for (auto It = ++Start; It != End; It++)
@@ -3720,7 +3719,7 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
bool PPCInstrInfo::combineRLWINM(MachineInstr &MI,
MachineInstr **ToErase) const {
MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
- unsigned FoldingReg = MI.getOperand(1).getReg();
+ Register FoldingReg = MI.getOperand(1).getReg();
if (!Register::isVirtualRegister(FoldingReg))
return false;
MachineInstr *SrcMI = MRI->getVRegDef(FoldingReg);
@@ -5266,7 +5265,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
if (!Register::isVirtualRegister(SrcReg))
return false;
const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
- if (SrcMI != NULL)
+ if (SrcMI != nullptr)
return isSignOrZeroExtended(*SrcMI, SignExt, Depth);
return false;
@@ -5290,7 +5289,7 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
if (!Register::isVirtualRegister(SrcReg))
return false;
const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
- if (SrcMI != NULL)
+ if (SrcMI != nullptr)
return isSignOrZeroExtended(*SrcMI, SignExt, Depth);
return false;
@@ -5319,7 +5318,8 @@ PPCInstrInfo::isSignOrZeroExtended(const MachineInstr &MI, bool SignExt,
if (!Register::isVirtualRegister(SrcReg))
return false;
const MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
- if (SrcMI == NULL || !isSignOrZeroExtended(*SrcMI, SignExt, Depth+1))
+ if (SrcMI == nullptr ||
+ !isSignOrZeroExtended(*SrcMI, SignExt, Depth + 1))
return false;
}
else
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 2340be5b5915..c26b4f6ceb7d 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -5530,6 +5530,30 @@ def DWBytes3210 {
(i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), Word, sub_32));
}
+// These instructions store a hash computed from the value of the link register
+// and the value of the stack pointer.
+let mayStore = 1 in {
+def HASHST : XForm_XD6_RA5_RB5<31, 722, (outs),
+ (ins gprc:$RB, memrihash:$D_RA_XD),
+ "hashst $RB, $D_RA_XD", IIC_IntGeneral, []>;
+def HASHSTP : XForm_XD6_RA5_RB5<31, 658, (outs),
+ (ins gprc:$RB, memrihash:$D_RA_XD),
+ "hashstp $RB, $D_RA_XD", IIC_IntGeneral, []>;
+}
+
+// These instructions check a hash computed from the value of the link register
+// and the value of the stack pointer. The hasSideEffects flag is needed as the
+// instruction may TRAP if the hash does not match the hash stored at the
+// specified address.
+let mayLoad = 1, hasSideEffects = 1 in {
+def HASHCHK : XForm_XD6_RA5_RB5<31, 754, (outs),
+ (ins gprc:$RB, memrihash:$D_RA_XD),
+ "hashchk $RB, $D_RA_XD", IIC_IntGeneral, []>;
+def HASHCHKP : XForm_XD6_RA5_RB5<31, 690, (outs),
+ (ins gprc:$RB, memrihash:$D_RA_XD),
+ "hashchkp $RB, $D_RA_XD", IIC_IntGeneral, []>;
+}
+
// Now both high word and low word are reversed, next
// swap the high word and low word.
def : Pat<(i64 (bitreverse i64:$A)),
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
index a19289e96b3e..fe354208533b 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -2398,6 +2398,8 @@ let Predicates = [IsISA3_1] in {
let Predicates = [IsISA3_1, HasVSX] in {
def XVCVSPBF16 : XX2_XT6_XO5_XB6<60, 17, 475, "xvcvspbf16", vsrc, []>;
def XVCVBF16SPN : XX2_XT6_XO5_XB6<60, 16, 475, "xvcvbf16spn", vsrc, []>;
+ def XSMAXCQP : X_VT5_VA5_VB5<63, 676, "xsmaxcqp", []>;
+ def XSMINCQP : X_VT5_VA5_VB5<63, 740, "xsmincqp", []>;
}
// Multiclass defining patterns for Set Boolean Extension Reverse Instructions.
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index d12a9b806fd0..e5fa02bc8ccf 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -107,10 +107,10 @@ private:
void initialize(MachineFunction &MFParm);
// Perform peepholes.
- bool simplifyCode(void);
+ bool simplifyCode();
// Perform peepholes.
- bool eliminateRedundantCompare(void);
+ bool eliminateRedundantCompare();
bool eliminateRedundantTOCSaves(std::map<MachineInstr *, bool> &TOCSaves);
bool combineSEXTAndSHL(MachineInstr &MI, MachineInstr *&ToErase);
bool emitRLDICWhenLoweringJumpTables(MachineInstr &MI);
@@ -258,12 +258,12 @@ void PPCMIPeephole::UpdateTOCSaves(
}
bool Keep = true;
- for (auto It = TOCSaves.begin(); It != TOCSaves.end(); It++ ) {
- MachineInstr *CurrInst = It->first;
+ for (auto &I : TOCSaves) {
+ MachineInstr *CurrInst = I.first;
// If new instruction dominates an existing one, mark existing one as
// redundant.
- if (It->second && MDT->dominates(MI, CurrInst))
- It->second = false;
+ if (I.second && MDT->dominates(MI, CurrInst))
+ I.second = false;
// Check if the new instruction is redundant.
if (MDT->dominates(CurrInst, MI)) {
Keep = false;
@@ -381,7 +381,7 @@ static void convertUnprimedAccPHIs(const PPCInstrInfo *TII,
}
// Perform peephole optimizations.
-bool PPCMIPeephole::simplifyCode(void) {
+bool PPCMIPeephole::simplifyCode() {
bool Simplified = false;
bool TrapOpt = false;
MachineInstr* ToErase = nullptr;
@@ -481,7 +481,7 @@ bool PPCMIPeephole::simplifyCode(void) {
// PPC::ZERO.
if (!MI.getOperand(1).isImm() || MI.getOperand(1).getImm() != 0)
break;
- unsigned MIDestReg = MI.getOperand(0).getReg();
+ Register MIDestReg = MI.getOperand(0).getReg();
for (MachineInstr& UseMI : MRI->use_instructions(MIDestReg))
Simplified |= TII->onlyFoldImmediate(UseMI, MI, MIDestReg);
if (MRI->use_nodbg_empty(MIDestReg)) {
@@ -519,9 +519,9 @@ bool PPCMIPeephole::simplifyCode(void) {
// XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed.
// We have to look through chains of COPY and SUBREG_TO_REG
// to find the real source values for comparison.
- unsigned TrueReg1 =
+ Register TrueReg1 =
TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
- unsigned TrueReg2 =
+ Register TrueReg2 =
TRI->lookThruCopyLike(MI.getOperand(2).getReg(), MRI);
if (!(TrueReg1 == TrueReg2 && Register::isVirtualRegister(TrueReg1)))
@@ -541,7 +541,7 @@ bool PPCMIPeephole::simplifyCode(void) {
auto isConversionOfLoadAndSplat = [=]() -> bool {
if (DefOpc != PPC::XVCVDPSXDS && DefOpc != PPC::XVCVDPUXDS)
return false;
- unsigned FeedReg1 =
+ Register FeedReg1 =
TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
if (Register::isVirtualRegister(FeedReg1)) {
MachineInstr *LoadMI = MRI->getVRegDef(FeedReg1);
@@ -565,16 +565,16 @@ bool PPCMIPeephole::simplifyCode(void) {
// If this is a splat or a swap fed by another splat, we
// can replace it with a copy.
if (DefOpc == PPC::XXPERMDI) {
- unsigned DefReg1 = DefMI->getOperand(1).getReg();
- unsigned DefReg2 = DefMI->getOperand(2).getReg();
+ Register DefReg1 = DefMI->getOperand(1).getReg();
+ Register DefReg2 = DefMI->getOperand(2).getReg();
unsigned DefImmed = DefMI->getOperand(3).getImm();
// If the two inputs are not the same register, check to see if
// they originate from the same virtual register after only
// copy-like instructions.
if (DefReg1 != DefReg2) {
- unsigned FeedReg1 = TRI->lookThruCopyLike(DefReg1, MRI);
- unsigned FeedReg2 = TRI->lookThruCopyLike(DefReg2, MRI);
+ Register FeedReg1 = TRI->lookThruCopyLike(DefReg1, MRI);
+ Register FeedReg2 = TRI->lookThruCopyLike(DefReg2, MRI);
if (!(FeedReg1 == FeedReg2 &&
Register::isVirtualRegister(FeedReg1)))
@@ -643,7 +643,7 @@ bool PPCMIPeephole::simplifyCode(void) {
case PPC::XXSPLTW: {
unsigned MyOpcode = MI.getOpcode();
unsigned OpNo = MyOpcode == PPC::XXSPLTW ? 1 : 2;
- unsigned TrueReg =
+ Register TrueReg =
TRI->lookThruCopyLike(MI.getOperand(OpNo).getReg(), MRI);
if (!Register::isVirtualRegister(TrueReg))
break;
@@ -707,7 +707,7 @@ bool PPCMIPeephole::simplifyCode(void) {
}
case PPC::XVCVDPSP: {
// If this is a DP->SP conversion fed by an FRSP, the FRSP is redundant.
- unsigned TrueReg =
+ Register TrueReg =
TRI->lookThruCopyLike(MI.getOperand(1).getReg(), MRI);
if (!Register::isVirtualRegister(TrueReg))
break;
@@ -716,9 +716,9 @@ bool PPCMIPeephole::simplifyCode(void) {
// This can occur when building a vector of single precision or integer
// values.
if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) {
- unsigned DefsReg1 =
+ Register DefsReg1 =
TRI->lookThruCopyLike(DefMI->getOperand(1).getReg(), MRI);
- unsigned DefsReg2 =
+ Register DefsReg2 =
TRI->lookThruCopyLike(DefMI->getOperand(2).getReg(), MRI);
if (!Register::isVirtualRegister(DefsReg1) ||
!Register::isVirtualRegister(DefsReg2))
@@ -1178,7 +1178,7 @@ static unsigned getIncomingRegForBlock(MachineInstr *Phi,
static unsigned getSrcVReg(unsigned Reg, MachineBasicBlock *BB1,
MachineBasicBlock *BB2, MachineRegisterInfo *MRI) {
unsigned SrcReg = Reg;
- while (1) {
+ while (true) {
unsigned NextReg = SrcReg;
MachineInstr *Inst = MRI->getVRegDef(SrcReg);
if (BB1 && Inst->getOpcode() == PPC::PHI && Inst->getParent() == BB2) {
@@ -1334,7 +1334,7 @@ bool PPCMIPeephole::eliminateRedundantTOCSaves(
// cmpwi r3, 0 ; greater than -1 means greater or equal to 0
// bge 0, .LBB0_4
-bool PPCMIPeephole::eliminateRedundantCompare(void) {
+bool PPCMIPeephole::eliminateRedundantCompare() {
bool Simplified = false;
for (MachineBasicBlock &MBB2 : *MF) {
@@ -1737,4 +1737,3 @@ INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE,
char PPCMIPeephole::ID = 0;
FunctionPass*
llvm::createPPCMIPeepholePass() { return new PPCMIPeephole(); }
-
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 4bccc5596d2b..76b016c0ee79 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -390,6 +390,18 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
+bool PPCRegisterInfo::isAsmClobberable(const MachineFunction &MF,
+ MCRegister PhysReg) const {
+ // We cannot use getReservedRegs() to find the registers that are not asm
+ // clobberable because there are some reserved registers which can be
+ // clobbered by inline asm. For example, when LR is clobbered, the register is
+ // saved and restored. We will hardcode the registers that are not asm
+ // clobberable in this function.
+
+ // The stack pointer (R1/X1) is not clobberable by inline asm
+ return PhysReg != PPC::R1 && PhysReg != PPC::X1;
+}
+
bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
const PPCInstrInfo *InstrInfo = Subtarget.getInstrInfo();
@@ -423,7 +435,7 @@ bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) co
continue;
int FrIdx = Info[i].getFrameIdx();
- unsigned Reg = Info[i].getReg();
+ Register Reg = Info[i].getReg();
const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
unsigned Opcode = InstrInfo->getStoreOpcodeForSpill(RC);
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index 2e534dd1bcd5..114f6d0f4c66 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -91,6 +91,8 @@ public:
void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
BitVector getReservedRegs(const MachineFunction &MF) const override;
+ bool isAsmClobberable(const MachineFunction &MF,
+ MCRegister PhysReg) const override;
bool isCallerPreservedPhysReg(MCRegister PhysReg,
const MachineFunction &MF) const override;
@@ -185,6 +187,10 @@ public:
return RegName;
}
+
+ bool isNonallocatableRegisterCalleeSave(MCRegister Reg) const override {
+ return Reg == PPC::LR || Reg == PPC::LR8;
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index ed28731b8ef2..cc5738a5d7b6 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -374,11 +374,10 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
// clobbers ctr.
auto asmClobbersCTR = [](InlineAsm *IA) {
InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
- for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
- InlineAsm::ConstraintInfo &C = CIV[i];
+ for (const InlineAsm::ConstraintInfo &C : CIV) {
if (C.Type != InlineAsm::isInput)
- for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
- if (StringRef(C.Codes[j]).equals_insensitive("{ctr}"))
+ for (const auto &Code : C.Codes)
+ if (StringRef(Code).equals_insensitive("{ctr}"))
return true;
}
return false;
@@ -653,11 +652,17 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
}
return true;
- } else if (isa<BinaryOperator>(J) &&
- (J->getType()->getScalarType()->isFP128Ty() ||
+ } else if ((J->getType()->getScalarType()->isFP128Ty() ||
J->getType()->getScalarType()->isPPC_FP128Ty())) {
// Most operations on f128 or ppc_f128 values become calls.
return true;
+ } else if (isa<FCmpInst>(J) &&
+ J->getOperand(0)->getType()->getScalarType()->isFP128Ty()) {
+ return true;
+ } else if ((isa<FPTruncInst>(J) || isa<FPExtInst>(J)) &&
+ (cast<CastInst>(J)->getSrcTy()->getScalarType()->isFP128Ty() ||
+ cast<CastInst>(J)->getDestTy()->getScalarType()->isFP128Ty())) {
+ return true;
} else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
CastInst *CI = cast<CastInst>(J);
@@ -1295,8 +1300,8 @@ bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
LoopInfo *LI, DominatorTree *DT,
AssumptionCache *AC, TargetLibraryInfo *LibInfo) {
// Process nested loops first.
- for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
- if (canSaveCmp(*I, BI, SE, LI, DT, AC, LibInfo))
+ for (Loop *I : *L)
+ if (canSaveCmp(I, BI, SE, LI, DT, AC, LibInfo))
return false; // Stop search.
HardwareLoopInfo HWLoopInfo(L);
diff --git a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index 0be35adc35c7..8a7d324ddfe1 100644
--- a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -297,18 +297,16 @@ protected:
// fma result.
LiveInterval &NewFMAInt = LIS->getInterval(KilledProdReg);
- for (LiveInterval::iterator AI = FMAInt.begin(), AE = FMAInt.end();
- AI != AE; ++AI) {
+ for (auto &AI : FMAInt) {
// Don't add the segment that corresponds to the original copy.
- if (AI->valno == AddendValNo)
+ if (AI.valno == AddendValNo)
continue;
VNInfo *NewFMAValNo =
- NewFMAInt.getNextValue(AI->start,
- LIS->getVNInfoAllocator());
+ NewFMAInt.getNextValue(AI.start, LIS->getVNInfoAllocator());
- NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end,
- NewFMAValNo));
+ NewFMAInt.addSegment(
+ LiveInterval::Segment(AI.start, AI.end, NewFMAValNo));
}
LLVM_DEBUG(dbgs() << " extended: " << NewFMAInt << '\n');
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 75592dd4c6f5..a2ea34fe11c7 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -302,7 +302,7 @@ struct RISCVOperand : public MCParsedAsmOperand {
struct VTypeOp VType;
};
- RISCVOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+ RISCVOperand(KindTy K) : Kind(K) {}
public:
RISCVOperand(const RISCVOperand &o) : MCParsedAsmOperand() {
@@ -337,7 +337,6 @@ public:
bool isImm() const override { return Kind == KindTy::Immediate; }
bool isMem() const override { return false; }
bool isSystemRegister() const { return Kind == KindTy::SystemRegister; }
- bool isVType() const { return Kind == KindTy::VType; }
bool isGPR() const {
return Kind == KindTy::Register &&
@@ -421,7 +420,27 @@ public:
bool isCSRSystemRegister() const { return isSystemRegister(); }
- bool isVTypeI() const { return isVType(); }
+ bool isVTypeImm(unsigned N) const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+ if (!isImm())
+ return false;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+ return IsConstantImm && isUIntN(N, Imm) && VK == RISCVMCExpr::VK_RISCV_None;
+ }
+
+ // If the last operand of the vsetvli/vsetivli instruction is a constant
+ // expression, KindTy is Immediate.
+ bool isVTypeI10() const {
+ if (Kind == KindTy::Immediate)
+ return isVTypeImm(10);
+ return Kind == KindTy::VType;
+ }
+ bool isVTypeI11() const {
+ if (Kind == KindTy::Immediate)
+ return isVTypeImm(11);
+ return Kind == KindTy::VType;
+ }
/// Return true if the operand is a valid for the fence instruction e.g.
/// ('iorw').
@@ -547,6 +566,16 @@ public:
return IsConstantImm && isUInt<7>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
}
+ bool isRnumArg() const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+ if (!isImm())
+ return false;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+ return IsConstantImm && Imm >= INT64_C(0) && Imm <= INT64_C(10) &&
+ VK == RISCVMCExpr::VK_RISCV_None;
+ }
+
bool isSImm5() const {
if (!isImm())
return false;
@@ -898,9 +927,21 @@ public:
Inst.addOperand(MCOperand::createImm(SysReg.Encoding));
}
+ // Support non-canonical syntax:
+ // "vsetivli rd, uimm, 0xabc" or "vsetvli rd, rs1, 0xabc"
+ // "vsetivli rd, uimm, (0xc << N)" or "vsetvli rd, rs1, (0xc << N)"
void addVTypeIOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createImm(getVType()));
+ int64_t Imm = 0;
+ if (Kind == KindTy::Immediate) {
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+ (void)IsConstantImm;
+ assert(IsConstantImm && "Invalid VTypeI Operand!");
+ } else {
+ Imm = getVType();
+ }
+ Inst.addOperand(MCOperand::createImm(Imm));
}
// Returns the rounding mode represented by this RISCVOperand. Should only
@@ -1209,6 +1250,9 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
(1 << 4),
"immediate must be in the range");
}
+ case Match_InvalidRnumArg: {
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, 10);
+ }
}
llvm_unreachable("Unknown match type detected!");
@@ -1881,8 +1925,10 @@ bool RISCVAsmParser::ParseInstruction(ParseInstructionInfo &Info,
Operands.push_back(RISCVOperand::createToken(Name, NameLoc, isRV64()));
// If there are no more operands, then finish
- if (getLexer().is(AsmToken::EndOfStatement))
+ if (getLexer().is(AsmToken::EndOfStatement)) {
+ getParser().Lex(); // Consume the EndOfStatement.
return false;
+ }
// Parse first operand
if (parseOperand(Operands, Name))
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 9cfd36745f46..01c6bd90ea58 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -191,7 +191,8 @@ enum OperandType : unsigned {
OPERAND_SIMM12,
OPERAND_UIMM20,
OPERAND_UIMMLOG2XLEN,
- OPERAND_LAST_RISCV_IMM = OPERAND_UIMMLOG2XLEN,
+ OPERAND_RVKRNUM,
+ OPERAND_LAST_RISCV_IMM = OPERAND_RVKRNUM,
// Operand is either a register or uimm5, this is used by V extension pseudo
// instructions to represent a value that be passed as AVL to either vsetvli
// or vsetivli.
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index 59d8bb009d1c..7ce7dafb8ca1 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_RISCV_RISCVELFSTREAMER_H
-#define LLVM_LIB_TARGET_RISCV_RISCVELFSTREAMER_H
+#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVELFSTREAMER_H
+#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVELFSTREAMER_H
#include "RISCVTargetStreamer.h"
#include "llvm/MC/MCELFStreamer.h"
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index 89a7d54f60f8..3268740849f0 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -85,7 +85,7 @@ void RISCVInstPrinter::printRegName(raw_ostream &O, unsigned RegNo) const {
void RISCVInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O,
const char *Modifier) {
- assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+ assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported");
const MCOperand &MO = MI->getOperand(OpNo);
if (MO.isReg()) {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
index 0ee6d8de78c9..18858209aa9b 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
@@ -141,6 +141,24 @@ static void generateInstSeqImpl(int64_t Val,
Res.push_back(RISCVMatInt::Inst(RISCV::ADDI, Lo12));
}
+static unsigned extractRotateInfo(int64_t Val) {
+ // Case 1: 0b111..1..xxxxxx1..1.. (leading and trailing ones wrap a short middle field)
+ unsigned LeadingOnes = countLeadingOnes((uint64_t)Val);
+ unsigned TrailingOnes = countTrailingOnes((uint64_t)Val);
+ if (TrailingOnes > 0 && TrailingOnes < 64 &&
+ (LeadingOnes + TrailingOnes) > (64 - 12))
+ return 64 - TrailingOnes;
+
+ // Case 2: 0bxxx1..1..1...xxx (a long run of ones spanning the 32-bit boundary)
+ unsigned UpperTrailingOnes = countTrailingOnes(Hi_32(Val));
+ unsigned LowerLeadingOnes = countLeadingOnes(Lo_32(Val));
+ if (UpperTrailingOnes < 32 &&
+ (UpperTrailingOnes + LowerLeadingOnes) > (64 - 12))
+ return 32 - UpperTrailingOnes;
+
+ return 0;
+}
+
namespace llvm {
namespace RISCVMatInt {
InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures) {
@@ -312,6 +330,18 @@ InstSeq generateInstSeq(int64_t Val, const FeatureBitset &ActiveFeatures) {
}
}
+ // Perform optimization with rori in the Zbb extension.
+ if (Res.size() > 2 && ActiveFeatures[RISCV::FeatureStdExtZbb]) {
+ if (unsigned Rotate = extractRotateInfo(Val)) {
+ RISCVMatInt::InstSeq TmpSeq;
+ uint64_t NegImm12 =
+ ((uint64_t)Val >> (64 - Rotate)) | ((uint64_t)Val << Rotate);
+ assert(isInt<12>(NegImm12));
+ TmpSeq.push_back(RISCVMatInt::Inst(RISCV::ADDI, NegImm12));
+ TmpSeq.push_back(RISCVMatInt::Inst(RISCV::RORI, Rotate));
+ Res = TmpSeq;
+ }
+ }
return Res;
}
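The Zbb rori materialization added here can be illustrated with a small standalone program. This is only an illustrative sketch, not part of the patch: the sample value, the rotr64 helper, and main are invented for the example, and only the first case handled by extractRotateInfo (leading plus trailing ones wrapping a short middle field) is exercised.

#include <cassert>
#include <cstdint>

// Rotate right by N (0 < N < 64), mirroring what RORI does at run time.
static uint64_t rotr64(uint64_t V, unsigned N) { return (V >> N) | (V << (64 - N)); }

int main() {
  // 30 leading ones, 30 trailing ones, a 4-bit all-zero field at bits 30..33.
  uint64_t Val = ~(0xFULL << 30);
  unsigned TrailingOnes = 30;
  unsigned Rotate = 64 - TrailingOnes; // extractRotateInfo returns 34 here.
  // generateInstSeq rotates Val left by Rotate to form the ADDI immediate.
  int64_t Imm = (int64_t)((Val >> (64 - Rotate)) | (Val << Rotate));
  assert(Imm == -16); // Fits in the signed 12-bit ADDI immediate.
  // addi a0, zero, -16 ; rori a0, a0, 34  reproduces Val.
  assert(rotr64((uint64_t)Imm, Rotate) == Val);
  return 0;
}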
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
index 02b4b18f54bd..6a8e0c640001 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_RISCV_MATINT_H
-#define LLVM_LIB_TARGET_RISCV_MATINT_H
+#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_MATINT_H
+#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_MATINT_H
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/SubtargetFeature.h"
@@ -15,7 +15,6 @@
namespace llvm {
class APInt;
-class MCSubtargetInfo;
namespace RISCVMatInt {
struct Inst {
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
index 0bda3de0ce5d..171780d94ce7 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
@@ -6,8 +6,8 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_RISCV_RISCVTARGETSTREAMER_H
-#define LLVM_LIB_TARGET_RISCV_RISCVTARGETSTREAMER_H
+#ifndef LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVTARGETSTREAMER_H
+#define LLVM_LIB_TARGET_RISCV_MCTARGETDESC_RISCVTARGETSTREAMER_H
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index b415c9f35e7f..03462240fd93 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -40,6 +40,9 @@ FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM);
FunctionPass *createRISCVGatherScatterLoweringPass();
void initializeRISCVGatherScatterLoweringPass(PassRegistry &);
+FunctionPass *createRISCVSExtWRemovalPass();
+void initializeRISCVSExtWRemovalPass(PassRegistry &);
+
FunctionPass *createRISCVMergeBaseOffsetOptPass();
void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &);
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 6aa915c01929..5b0f27c5e937 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -42,7 +42,7 @@ def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">,
"'D' (Double-Precision Floating-Point)">;
def FeatureStdExtZfhmin
- : SubtargetFeature<"experimental-zfhmin", "HasStdExtZfhmin", "true",
+ : SubtargetFeature<"zfhmin", "HasStdExtZfhmin", "true",
"'Zfhmin' (Half-Precision Floating-Point Minimal)",
[FeatureStdExtF]>;
def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">,
@@ -50,7 +50,7 @@ def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">,
"'Zfhmin' (Half-Precision Floating-Point Minimal)">;
def FeatureStdExtZfh
- : SubtargetFeature<"experimental-zfh", "HasStdExtZfh", "true",
+ : SubtargetFeature<"zfh", "HasStdExtZfh", "true",
"'Zfh' (Half-Precision Floating-Point)",
[FeatureStdExtZfhmin, FeatureStdExtF]>;
def HasStdExtZfh : Predicate<"Subtarget->hasStdExtZfh()">,
@@ -65,83 +65,217 @@ def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">,
"'C' (Compressed Instructions)">;
def FeatureStdExtZba
- : SubtargetFeature<"experimental-zba", "HasStdExtZba", "true",
- "'Zba' (Address calculation 'B' Instructions)">;
+ : SubtargetFeature<"zba", "HasStdExtZba", "true",
+ "'Zba' (Address Generation Instructions)">;
def HasStdExtZba : Predicate<"Subtarget->hasStdExtZba()">,
AssemblerPredicate<(all_of FeatureStdExtZba),
- "'Zba' (Address calculation 'B' Instructions)">;
+ "'Zba' (Address Generation Instructions)">;
def NotHasStdExtZba : Predicate<"!Subtarget->hasStdExtZba()">;
def FeatureStdExtZbb
- : SubtargetFeature<"experimental-zbb", "HasStdExtZbb", "true",
- "'Zbb' (Base 'B' Instructions)">;
+ : SubtargetFeature<"zbb", "HasStdExtZbb", "true",
+ "'Zbb' (Basic Bit-Manipulation)">;
def HasStdExtZbb : Predicate<"Subtarget->hasStdExtZbb()">,
AssemblerPredicate<(all_of FeatureStdExtZbb),
- "'Zbb' (Base 'B' Instructions)">;
+ "'Zbb' (Basic Bit-Manipulation)">;
def FeatureStdExtZbc
- : SubtargetFeature<"experimental-zbc", "HasStdExtZbc", "true",
- "'Zbc' (Carry-Less 'B' Instructions)">;
+ : SubtargetFeature<"zbc", "HasStdExtZbc", "true",
+ "'Zbc' (Carry-Less Multiplication)">;
def HasStdExtZbc : Predicate<"Subtarget->hasStdExtZbc()">,
AssemblerPredicate<(all_of FeatureStdExtZbc),
- "'Zbc' (Carry-Less 'B' Instructions)">;
+ "'Zbc' (Carry-Less Multiplication)">;
def FeatureStdExtZbe
: SubtargetFeature<"experimental-zbe", "HasStdExtZbe", "true",
- "'Zbe' (Extract-Deposit 'B' Instructions)">;
+ "'Zbe' (Extract-Deposit 'Zb' Instructions)">;
def HasStdExtZbe : Predicate<"Subtarget->hasStdExtZbe()">,
AssemblerPredicate<(all_of FeatureStdExtZbe),
- "'Zbe' (Extract-Deposit 'B' Instructions)">;
+ "'Zbe' (Extract-Deposit 'Zb' Instructions)">;
def FeatureStdExtZbf
: SubtargetFeature<"experimental-zbf", "HasStdExtZbf", "true",
- "'Zbf' (Bit-Field 'B' Instructions)">;
+ "'Zbf' (Bit-Field 'Zb' Instructions)">;
def HasStdExtZbf : Predicate<"Subtarget->hasStdExtZbf()">,
AssemblerPredicate<(all_of FeatureStdExtZbf),
- "'Zbf' (Bit-Field 'B' Instructions)">;
+ "'Zbf' (Bit-Field 'Zb' Instructions)">;
def FeatureStdExtZbm
: SubtargetFeature<"experimental-zbm", "HasStdExtZbm", "true",
- "'Zbm' (Matrix 'B' Instructions)">;
+ "'Zbm' (Matrix 'Zb' Instructions)">;
def HasStdExtZbm : Predicate<"Subtarget->hasStdExtZbm()">,
AssemblerPredicate<(all_of FeatureStdExtZbm),
- "'Zbm' (Matrix 'B' Instructions)">;
+ "'Zbm' (Matrix 'Zb' Instructions)">;
def FeatureStdExtZbp
: SubtargetFeature<"experimental-zbp", "HasStdExtZbp", "true",
- "'Zbp' (Permutation 'B' Instructions)">;
+ "'Zbp' (Permutation 'Zb' Instructions)">;
def HasStdExtZbp : Predicate<"Subtarget->hasStdExtZbp()">,
AssemblerPredicate<(all_of FeatureStdExtZbp),
- "'Zbp' (Permutation 'B' Instructions)">;
+ "'Zbp' (Permutation 'Zb' Instructions)">;
def FeatureStdExtZbr
: SubtargetFeature<"experimental-zbr", "HasStdExtZbr", "true",
- "'Zbr' (Polynomial Reduction 'B' Instructions)">;
+ "'Zbr' (Polynomial Reduction 'Zb' Instructions)">;
def HasStdExtZbr : Predicate<"Subtarget->hasStdExtZbr()">,
AssemblerPredicate<(all_of FeatureStdExtZbr),
- "'Zbr' (Polynomial Reduction 'B' Instructions)">;
+ "'Zbr' (Polynomial Reduction 'Zb' Instructions)">;
def FeatureStdExtZbs
- : SubtargetFeature<"experimental-zbs", "HasStdExtZbs", "true",
- "'Zbs' (Single-Bit 'B' Instructions)">;
+ : SubtargetFeature<"zbs", "HasStdExtZbs", "true",
+ "'Zbs' (Single-Bit Instructions)">;
def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">,
AssemblerPredicate<(all_of FeatureStdExtZbs),
- "'Zbs' (Single-Bit 'B' Instructions)">;
+ "'Zbs' (Single-Bit Instructions)">;
def FeatureStdExtZbt
: SubtargetFeature<"experimental-zbt", "HasStdExtZbt", "true",
- "'Zbt' (Ternary 'B' Instructions)">;
+ "'Zbt' (Ternary 'Zb' Instructions)">;
def HasStdExtZbt : Predicate<"Subtarget->hasStdExtZbt()">,
AssemblerPredicate<(all_of FeatureStdExtZbt),
- "'Zbt' (Ternary 'B' Instructions)">;
+ "'Zbt' (Ternary 'Zb' Instructions)">;
// Some instructions belong to both the basic and the permutation
// subextensions. They should be enabled if either has been specified.
def HasStdExtZbbOrZbp
: Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp()">,
AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbp),
- "'Zbb' (Base 'B' Instructions) or "
- "'Zbp' (Permutation 'B' Instructions)">;
+ "'Zbb' (Basic Bit-Manipulation) or "
+ "'Zbp' (Permutation 'Zb' Instructions)">;
+
+def FeatureStdExtZbkb
+ : SubtargetFeature<"zbkb", "HasStdExtZbkb", "true",
+ "'Zbkb' (Bitmanip instructions for Cryptography)">;
+def HasStdExtZbkb : Predicate<"Subtarget->hasStdExtZbkb()">,
+ AssemblerPredicate<(all_of FeatureStdExtZbkb),
+ "'Zbkb' (Bitmanip instructions for Cryptography)">;
+
+def FeatureStdExtZbkx
+ : SubtargetFeature<"zbkx", "HasStdExtZbkx", "true",
+ "'Zbkx' (Crossbar permutation instructions)">;
+def HasStdExtZbkx : Predicate<"Subtarget->hasStdExtZbkx()">,
+ AssemblerPredicate<(all_of FeatureStdExtZbkx),
+ "'Zbkx' (Crossbar permutation instructions)">;
+
+def HasStdExtZbpOrZbkx
+ : Predicate<"Subtarget->hasStdExtZbp() || Subtarget->hasStdExtZbkx()">,
+ AssemblerPredicate<(any_of FeatureStdExtZbp, FeatureStdExtZbkx),
+ "'Zbp' (Permutation 'Zb' Instructions) or "
+ "'Zbkx' (Crossbar permutation instructions)">;
+
+def HasStdExtZbpOrZbkb
+ : Predicate<"Subtarget->hasStdExtZbp() || Subtarget->hasStdExtZbkb()">,
+ AssemblerPredicate<(any_of FeatureStdExtZbp, FeatureStdExtZbkb),
+ "'Zbp' (Permutation 'Zb' Instructions) or "
+ "'Zbkb' (Bitmanip instructions for Cryptography)">;
+
+def HasStdExtZbbOrZbkb
+ : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb()">,
+ AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbkb),
+ "'Zbb' (Basic Bit-Manipulation) or "
+ "'Zbkb' (Bitmanip instructions for Cryptography)">;
+
+def HasStdExtZbbOrZbpOrZbkb
+ : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp() || Subtarget->hasStdExtZbkb()">,
+ AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbp, FeatureStdExtZbkb),
+ "'Zbb' (Basic Bit-Manipulation) or "
+ "'Zbp' (Permutation 'Zb' Instructions) or "
+ "'Zbkb' (Bitmanip instructions for Cryptography)">;
+
+// The carry-less multiply subextension for cryptography (Zbkc) is a subset of
+// the basic carry-less multiply subextension (Zbc). The former should be
+// enabled if the latter is enabled.
+def FeatureStdExtZbkc
+ : SubtargetFeature<"zbkc", "HasStdExtZbkc", "true",
+ "'Zbkc' (Carry-less multiply instructions for Cryptography)">;
+def HasStdExtZbkc
+ : Predicate<"Subtarget->hasStdExtZbkc()">,
+ AssemblerPredicate<(all_of FeatureStdExtZbkc),
+ "'Zbkc' (Carry-less multiply instructions for Cryptography)">;
+
+def HasStdExtZbcOrZbkc
+ : Predicate<"Subtarget->hasStdExtZbc() || Subtarget->hasStdExtZbkc()">,
+ AssemblerPredicate<(any_of FeatureStdExtZbc, FeatureStdExtZbkc),
+ "'Zbc' (Carry-Less Multiplication) or "
+ "'Zbkc' (Carry-less multiply instructions for Cryptography)">;
+
+def FeatureStdExtZknd
+ : SubtargetFeature<"zknd", "HasStdExtZknd", "true",
+ "'Zknd' (NIST Suite: AES Decryption)">;
+def HasStdExtZknd : Predicate<"Subtarget->hasStdExtZknd()">,
+ AssemblerPredicate<(all_of FeatureStdExtZknd),
+ "'Zknd' (NIST Suite: AES Decryption)">;
+
+def FeatureStdExtZkne
+ : SubtargetFeature<"zkne", "HasStdExtZkne", "true",
+ "'Zkne' (NIST Suite: AES Encryption)">;
+def HasStdExtZkne : Predicate<"Subtarget->hasStdExtZkne()">,
+ AssemblerPredicate<(all_of FeatureStdExtZkne),
+ "'Zkne' (NIST Suite: AES Encryption)">;
+
+// Some instructions belong to both Zknd and Zkne subextensions.
+// They should be enabled if either has been specified.
+def HasStdExtZkndOrZkne
+ : Predicate<"Subtarget->hasStdExtZknd() || Subtarget->hasStdExtZkne()">,
+ AssemblerPredicate<(any_of FeatureStdExtZknd, FeatureStdExtZkne),
+ "'Zknd' (NIST Suite: AES Decryption) or "
+ "'Zkne' (NIST Suite: AES Encryption)">;
+
+def FeatureStdExtZknh
+ : SubtargetFeature<"zknh", "HasStdExtZknh", "true",
+ "'Zknh' (NIST Suite: Hash Function Instructions)">;
+def HasStdExtZknh : Predicate<"Subtarget->hasStdExtZknh()">,
+ AssemblerPredicate<(all_of FeatureStdExtZknh),
+ "'Zknh' (NIST Suite: Hash Function Instructions)">;
+
+def FeatureStdExtZksed
+ : SubtargetFeature<"zksed", "HasStdExtZksed", "true",
+ "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">;
+def HasStdExtZksed : Predicate<"Subtarget->hasStdExtZksed()">,
+ AssemblerPredicate<(all_of FeatureStdExtZksed),
+ "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">;
+
+def FeatureStdExtZksh
+ : SubtargetFeature<"zksh", "HasStdExtZksh", "true",
+ "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">;
+def HasStdExtZksh : Predicate<"Subtarget->hasStdExtZksh()">,
+ AssemblerPredicate<(all_of FeatureStdExtZksh),
+ "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">;
+
+def FeatureStdExtZkr
+ : SubtargetFeature<"zkr", "HasStdExtZkr", "true",
+ "'Zkr' (Entropy Source Extension)">;
+def HasStdExtZkr : Predicate<"Subtarget->hasStdExtZkr()">,
+ AssemblerPredicate<(all_of FeatureStdExtZkr),
+ "'Zkr' (Entropy Source Extension)">;
+
+def FeatureStdExtZkn
+ : SubtargetFeature<"zkn", "HasStdExtZkn", "true",
+ "'Zkn' (NIST Algorithm Suite)",
+ [FeatureStdExtZbkb,
+ FeatureStdExtZbkc,
+ FeatureStdExtZbkx,
+ FeatureStdExtZkne,
+ FeatureStdExtZknd,
+ FeatureStdExtZknh]>;
+
+def FeatureStdExtZks
+ : SubtargetFeature<"zks", "HasStdExtZks", "true",
+ "'Zks' (ShangMi Algorithm Suite)",
+ [FeatureStdExtZbkb,
+ FeatureStdExtZbkc,
+ FeatureStdExtZbkx,
+ FeatureStdExtZksed,
+ FeatureStdExtZksh]>;
+
+def FeatureStdExtZkt
+ : SubtargetFeature<"zkt", "HasStdExtZkt", "true",
+ "'Zkt' (Data Independent Execution Latency)">;
+
+def FeatureStdExtZk
+ : SubtargetFeature<"zk", "HasStdExtZk", "true",
+ "'Zk' (Standard scalar cryptography extension)",
+ [FeatureStdExtZkn,
+ FeatureStdExtZkr,
+ FeatureStdExtZkt]>;
def FeatureNoRVCHints
: SubtargetFeature<"no-rvc-hints", "EnableRVCHintInstrs", "false",
@@ -150,23 +284,66 @@ def HasRVCHints : Predicate<"Subtarget->enableRVCHintInstrs()">,
AssemblerPredicate<(all_of(not FeatureNoRVCHints)),
"RVC Hint Instructions">;
+def FeatureStdExtZvl32b : SubtargetFeature<"zvl32b", "ZvlLen", "ExtZvl::Zvl32b",
+ "'Zvl' (Minimum Vector Length) 32">;
+
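+// The loop below defines FeatureStdExtZvl64b through FeatureStdExtZvl32768b
+// (powers of two), each implying the next smaller Zvl feature.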
+foreach i = { 6-15 } in {
+ defvar I = !shl(1, i);
+ def FeatureStdExtZvl#I#b :
+ SubtargetFeature<"zvl"#I#"b", "ZvlLen", "ExtZvl::Zvl"#I#"b",
+ "'Zvl' (Minimum Vector Length) "#I,
+ [!cast<SubtargetFeature>("FeatureStdExtZvl"#!srl(I, 1)#"b")]>;
+}
+
+def FeatureStdExtZve32x
+ : SubtargetFeature<"zve32x", "HasStdExtZve32x", "true",
+ "'Zve32x' (Vector Extensions for Embedded Processors "
+ "with maximal 32 EEW)",
+ [FeatureStdExtZvl32b]>;
+
+def FeatureStdExtZve32f
+ : SubtargetFeature<"zve32f", "HasStdExtZve32f", "true",
+ "'Zve32f' (Vector Extensions for Embedded Processors "
+ "with maximal 32 EEW and F extension)",
+ [FeatureStdExtZve32x]>;
+
+def FeatureStdExtZve64x
+ : SubtargetFeature<"zve64x", "HasStdExtZve64x", "true",
+ "'Zve64x' (Vector Extensions for Embedded Processors "
+ "with maximal 64 EEW)", [FeatureStdExtZve32x, FeatureStdExtZvl64b]>;
+
+def FeatureStdExtZve64f
+ : SubtargetFeature<"zve64f", "HasStdExtZve64f", "true",
+ "'Zve64f' (Vector Extensions for Embedded Processors "
+ "with maximal 64 EEW and F extension)",
+ [FeatureStdExtZve32f, FeatureStdExtZve64x]>;
+
+def FeatureStdExtZve64d
+ : SubtargetFeature<"zve64d", "HasStdExtZve64d", "true",
+ "'Zve64d' (Vector Extensions for Embedded Processors "
+ "with maximal 64 EEW, F and D extension)",
+ [FeatureStdExtZve64f]>;
+
def FeatureStdExtV
- : SubtargetFeature<"experimental-v", "HasStdExtV", "true",
- "'V' (Vector Instructions)">;
-def HasStdExtV : Predicate<"Subtarget->hasStdExtV()">,
- AssemblerPredicate<(all_of FeatureStdExtV),
- "'V' (Vector Instructions)">;
-
-def HasVInstructions : Predicate<"Subtarget->hasVInstructions()">;
-def HasVInstructionsAnyF : Predicate<"Subtarget->hasVInstructionsAnyF()">;
-
-def FeatureStdExtZvlsseg
- : SubtargetFeature<"experimental-zvlsseg", "HasStdExtZvlsseg", "true",
- "'Zvlsseg' (Vector segment load/store instructions)",
- [FeatureStdExtV]>;
-def HasStdExtZvlsseg : Predicate<"Subtarget->hasStdExtZvlsseg()">,
- AssemblerPredicate<(all_of FeatureStdExtZvlsseg),
- "'Zvlsseg' (Vector segment load/store instructions)">;
+ : SubtargetFeature<"v", "HasStdExtV", "true",
+ "'V' (Vector Extension for Application Processors)",
+ [FeatureStdExtZvl128b, FeatureStdExtF, FeatureStdExtD]>;
+
+def HasVInstructions : Predicate<"Subtarget->hasVInstructions()">,
+ AssemblerPredicate<
+ (any_of FeatureStdExtZve32x, FeatureStdExtV),
+ "'V' (Vector Extension for Application Processors), 'Zve32x' or "
+ "'Zve64x' (Vector Extensions for Embedded Processors)">;
+def HasVInstructionsI64 : Predicate<"Subtarget->hasVInstructionsI64()">,
+ AssemblerPredicate<
+ (any_of FeatureStdExtZve64x, FeatureStdExtV),
+ "'V' (Vector Extension for Application Processors) or 'Zve64x' "
+ "(Vector Extensions for Embedded Processors)">;
+def HasVInstructionsAnyF : Predicate<"Subtarget->hasVInstructionsAnyF()">,
+ AssemblerPredicate<
+ (any_of FeatureStdExtZve32f, FeatureStdExtV),
+ "'V' (Vector Extension for Application Processors), 'Zve32f', "
+ "'Zve64f' or 'Zve64d' (Vector Extensions for Embedded Processors)">;
def Feature64Bit
: SubtargetFeature<"64bit", "HasRV64", "true", "Implements RV64">;
@@ -198,6 +375,9 @@ foreach i = {1-31} in
def FeatureSaveRestore : SubtargetFeature<"save-restore", "EnableSaveRestore",
"true", "Enable save/restore.">;
+def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
+ "SiFive 7-Series processors">;
+
//===----------------------------------------------------------------------===//
// Named operands for CSR instructions.
//===----------------------------------------------------------------------===//
@@ -226,8 +406,10 @@ def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>;
def : ProcessorModel<"rocket-rv32", RocketModel, []>;
def : ProcessorModel<"rocket-rv64", RocketModel, [Feature64Bit]>;
-def : ProcessorModel<"sifive-7-rv32", SiFive7Model, []>;
-def : ProcessorModel<"sifive-7-rv64", SiFive7Model, [Feature64Bit]>;
+def : ProcessorModel<"sifive-7-rv32", SiFive7Model, [],
+ [TuneSiFive7]>;
+def : ProcessorModel<"sifive-7-rv64", SiFive7Model, [Feature64Bit],
+ [TuneSiFive7]>;
def : ProcessorModel<"sifive-e20", RocketModel, [FeatureStdExtM,
FeatureStdExtC]>;
@@ -253,7 +435,8 @@ def : ProcessorModel<"sifive-e34", RocketModel, [FeatureStdExtM,
def : ProcessorModel<"sifive-e76", SiFive7Model, [FeatureStdExtM,
FeatureStdExtA,
FeatureStdExtF,
- FeatureStdExtC]>;
+ FeatureStdExtC],
+ [TuneSiFive7]>;
def : ProcessorModel<"sifive-s21", RocketModel, [Feature64Bit,
FeatureStdExtM,
@@ -277,7 +460,8 @@ def : ProcessorModel<"sifive-s76", SiFive7Model, [Feature64Bit,
FeatureStdExtA,
FeatureStdExtF,
FeatureStdExtD,
- FeatureStdExtC]>;
+ FeatureStdExtC],
+ [TuneSiFive7]>;
def : ProcessorModel<"sifive-u54", RocketModel, [Feature64Bit,
FeatureStdExtM,
@@ -291,7 +475,8 @@ def : ProcessorModel<"sifive-u74", SiFive7Model, [Feature64Bit,
FeatureStdExtA,
FeatureStdExtF,
FeatureStdExtD,
- FeatureStdExtC]>;
+ FeatureStdExtC],
+ [TuneSiFive7]>;
//===----------------------------------------------------------------------===//
// Define the RISC-V target.
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index f5d491938050..ad003404d793 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -242,7 +242,8 @@ bool RISCVFrameLowering::hasBP(const MachineFunction &MF) const {
// adjustment, we can not use SP to access the stack objects for the
// arguments. Instead, use BP to access these stack objects.
return (MFI.hasVarSizedObjects() ||
- (!hasReservedCallFrame(MF) && MFI.getMaxCallFrameSize() != 0)) &&
+ (!hasReservedCallFrame(MF) && (!MFI.isMaxCallFrameSizeComputed() ||
+ MFI.getMaxCallFrameSize() != 0))) &&
TRI->hasStackRealignment(MF);
}
@@ -940,11 +941,22 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized(
}
static bool hasRVVFrameObject(const MachineFunction &MF) {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I)
- if (MFI.getStackID(I) == TargetStackID::ScalableVector)
- return true;
- return false;
+ // Originally, this function scanned all stack objects to check whether any
+ // scalable vector object was on the stack. However, that caused errors in the
+ // register allocator. In issue 53016, the check returned false before RA
+ // because there were no RVV stack objects yet, but true after RA because
+ // spill slots for RVV values had been created during RA. Due to this
+ // inconsistent behavior, BP was not reserved during register allocation, yet
+ // BP accesses were generated in the PEI pass.
+ //
+ // The function now uses hasVInstructions() as the return value. It is not
+ // precise, but it keeps register allocation correct.
+ //
+ // FIXME: Find a better way to make the decision or revisit the solution in
+ // D103622.
+ //
+ // Refer to https://github.com/llvm/llvm-project/issues/53016.
+ return MF.getSubtarget<RISCVSubtarget>().hasVInstructions();
}
// Not preserve stack space within prologue for outgoing variables when the
diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
index d47bd739235f..ba91b16661a4 100644
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -127,6 +127,41 @@ static std::pair<Value *, Value *> matchStridedConstant(Constant *StartC) {
return std::make_pair(StartVal, Stride);
}
+static std::pair<Value *, Value *> matchStridedStart(Value *Start,
+ IRBuilder<> &Builder) {
+ // Base case, start is a strided constant.
+ auto *StartC = dyn_cast<Constant>(Start);
+ if (StartC)
+ return matchStridedConstant(StartC);
+
+ // Not a constant, maybe it's a strided constant with a splat added to it.
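+ // For example <b, b+1, b+2, b+3> is splat(b) + <0, 1, 2, 3>: recursing on the
+ // constant yields (start 0, stride 1), and adding the splat back gives
+ // (start b, stride 1).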
+ auto *BO = dyn_cast<BinaryOperator>(Start);
+ if (!BO || BO->getOpcode() != Instruction::Add)
+ return std::make_pair(nullptr, nullptr);
+
+ // Look for an operand that is splatted.
+ unsigned OtherIndex = 1;
+ Value *Splat = getSplatValue(BO->getOperand(0));
+ if (!Splat) {
+ Splat = getSplatValue(BO->getOperand(1));
+ OtherIndex = 0;
+ }
+ if (!Splat)
+ return std::make_pair(nullptr, nullptr);
+
+ Value *Stride;
+ std::tie(Start, Stride) = matchStridedStart(BO->getOperand(OtherIndex),
+ Builder);
+ if (!Start)
+ return std::make_pair(nullptr, nullptr);
+
+ // Add the splat value to the start.
+ Builder.SetInsertPoint(BO);
+ Builder.SetCurrentDebugLocation(DebugLoc());
+ Start = Builder.CreateAdd(Start, Splat);
+ return std::make_pair(Start, Stride);
+}
+
// Recursively walk back up the use-def chain until we find a Phi with a strided
// start value. Build and update a scalar recurrence as we unwind the recursion.
// We also update the Stride as we unwind. Our goal is to move all of the
@@ -161,12 +196,7 @@ bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L,
if (!Step)
return false;
- // Start should be a strided constant.
- auto *StartC = dyn_cast<Constant>(Start);
- if (!StartC)
- return false;
-
- std::tie(Start, Stride) = matchStridedConstant(StartC);
+ std::tie(Start, Stride) = matchStridedStart(Start, Builder);
if (!Start)
return false;
assert(Stride != nullptr);
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index b24eb5f7bbf4..5870502d74d5 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -86,8 +86,12 @@ void RISCVDAGToDAGISel::PreprocessISelDAG() {
SDVTList VTs = CurDAG->getVTList({VT, MVT::Other});
SDValue IntID =
CurDAG->getTargetConstant(Intrinsic::riscv_vlse, DL, MVT::i64);
- SDValue Ops[] = {Chain, IntID, StackSlot,
- CurDAG->getRegister(RISCV::X0, MVT::i64), VL};
+ SDValue Ops[] = {Chain,
+ IntID,
+ CurDAG->getUNDEF(VT),
+ StackSlot,
+ CurDAG->getRegister(RISCV::X0, MVT::i64),
+ VL};
SDValue Result = CurDAG->getMemIntrinsicNode(
ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, MVT::i64, MPI, Align(8),
@@ -125,12 +129,37 @@ void RISCVDAGToDAGISel::PostprocessISelDAG() {
CurDAG->RemoveDeadNodes();
}
-static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, int64_t Imm,
- const RISCVSubtarget &Subtarget) {
+static SDNode *selectImmWithConstantPool(SelectionDAG *CurDAG, const SDLoc &DL,
+ const MVT VT, int64_t Imm,
+ const RISCVSubtarget &Subtarget) {
+ assert(VT == MVT::i64 && "Expecting MVT::i64");
+ const RISCVTargetLowering *TLI = Subtarget.getTargetLowering();
+ ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(CurDAG->getConstantPool(
+ ConstantInt::get(EVT(VT).getTypeForEVT(*CurDAG->getContext()), Imm), VT));
+ SDValue Addr = TLI->getAddr(CP, *CurDAG);
+ SDValue Offset = CurDAG->getTargetConstant(0, DL, VT);
+ // Since there is no data race, the chain can be the entry node.
+ SDNode *Load = CurDAG->getMachineNode(RISCV::LD, DL, VT, Addr, Offset,
+ CurDAG->getEntryNode());
+ MachineFunction &MF = CurDAG->getMachineFunction();
+ MachineMemOperand *MemOp = MF.getMachineMemOperand(
+ MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
+ LLT(VT), CP->getAlign());
+ CurDAG->setNodeMemRefs(cast<MachineSDNode>(Load), {MemOp});
+ return Load;
+}
+
+static SDNode *selectImm(SelectionDAG *CurDAG, const SDLoc &DL, const MVT VT,
+ int64_t Imm, const RISCVSubtarget &Subtarget) {
MVT XLenVT = Subtarget.getXLenVT();
RISCVMatInt::InstSeq Seq =
RISCVMatInt::generateInstSeq(Imm, Subtarget.getFeatureBits());
+ // If Imm is expensive to build, then we put it into the constant pool.
+ if (Subtarget.useConstantPoolForLargeInts() &&
+ Seq.size() > Subtarget.getMaxBuildIntsCost())
+ return selectImmWithConstantPool(CurDAG, DL, VT, Imm, Subtarget);
+
SDNode *Result = nullptr;
SDValue SrcReg = CurDAG->getRegister(RISCV::X0, XLenVT);
for (RISCVMatInt::Inst &Inst : Seq) {
@@ -372,6 +401,10 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, bool IsMasked,
RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT);
unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits());
+ if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) {
+ report_fatal_error("The V extension does not support EEW=64 for index "
+ "values when XLEN=32");
+ }
const RISCV::VLXSEGPseudo *P = RISCV::getVLXSEGPseudo(
NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL),
static_cast<unsigned>(IndexLMUL));
@@ -450,6 +483,10 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, bool IsMasked,
RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT);
unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits());
+ if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) {
+ report_fatal_error("The V extension does not support EEW=64 for index "
+ "values when XLEN=32");
+ }
const RISCV::VSXSEGPseudo *P = RISCV::getVSXSEGPseudo(
NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL),
static_cast<unsigned>(IndexLMUL));
@@ -462,6 +499,75 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, bool IsMasked,
ReplaceNode(Node, Store);
}
+void RISCVDAGToDAGISel::selectVSETVLI(SDNode *Node) {
+ if (!Subtarget->hasVInstructions())
+ return;
+
+ assert((Node->getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Node->getOpcode() == ISD::INTRINSIC_WO_CHAIN) &&
+ "Unexpected opcode");
+
+ SDLoc DL(Node);
+ MVT XLenVT = Subtarget->getXLenVT();
+
+ bool HasChain = Node->getOpcode() == ISD::INTRINSIC_W_CHAIN;
+ unsigned IntNoOffset = HasChain ? 1 : 0;
+ unsigned IntNo = Node->getConstantOperandVal(IntNoOffset);
+
+ assert((IntNo == Intrinsic::riscv_vsetvli ||
+ IntNo == Intrinsic::riscv_vsetvlimax ||
+ IntNo == Intrinsic::riscv_vsetvli_opt ||
+ IntNo == Intrinsic::riscv_vsetvlimax_opt) &&
+ "Unexpected vsetvli intrinsic");
+
+ bool VLMax = IntNo == Intrinsic::riscv_vsetvlimax ||
+ IntNo == Intrinsic::riscv_vsetvlimax_opt;
+ unsigned Offset = IntNoOffset + (VLMax ? 1 : 2);
+
+ assert(Node->getNumOperands() == Offset + 2 &&
+ "Unexpected number of operands");
+
+ unsigned SEW =
+ RISCVVType::decodeVSEW(Node->getConstantOperandVal(Offset) & 0x7);
+ RISCVII::VLMUL VLMul = static_cast<RISCVII::VLMUL>(
+ Node->getConstantOperandVal(Offset + 1) & 0x7);
+
+ unsigned VTypeI = RISCVVType::encodeVTYPE(VLMul, SEW, /*TailAgnostic*/ true,
+ /*MaskAgnostic*/ false);
+ SDValue VTypeIOp = CurDAG->getTargetConstant(VTypeI, DL, XLenVT);
+
+ SmallVector<EVT, 2> VTs = {XLenVT};
+ if (HasChain)
+ VTs.push_back(MVT::Other);
+
+ SDValue VLOperand;
+ unsigned Opcode = RISCV::PseudoVSETVLI;
+ if (VLMax) {
+ VLOperand = CurDAG->getRegister(RISCV::X0, XLenVT);
+ Opcode = RISCV::PseudoVSETVLIX0;
+ } else {
+ VLOperand = Node->getOperand(IntNoOffset + 1);
+
+ if (auto *C = dyn_cast<ConstantSDNode>(VLOperand)) {
+ uint64_t AVL = C->getZExtValue();
+ if (isUInt<5>(AVL)) {
+ SDValue VLImm = CurDAG->getTargetConstant(AVL, DL, XLenVT);
+ SmallVector<SDValue, 3> Ops = {VLImm, VTypeIOp};
+ if (HasChain)
+ Ops.push_back(Node->getOperand(0));
+ ReplaceNode(
+ Node, CurDAG->getMachineNode(RISCV::PseudoVSETIVLI, DL, VTs, Ops));
+ return;
+ }
+ }
+ }
+
+ SmallVector<SDValue, 3> Ops = {VLOperand, VTypeIOp};
+ if (HasChain)
+ Ops.push_back(Node->getOperand(0));
+
+ ReplaceNode(Node, CurDAG->getMachineNode(Opcode, DL, VTs, Ops));
+}
void RISCVDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we have already selected.
@@ -498,7 +604,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (!isInt<32>(Imm) && isUInt<32>(Imm) && hasAllWUsers(Node))
Imm = SignExtend64(Imm, 32);
- ReplaceNode(Node, selectImm(CurDAG, DL, Imm, *Subtarget));
+ ReplaceNode(Node, selectImm(CurDAG, DL, VT, Imm, *Subtarget));
return;
}
case ISD::FrameIndex: {
@@ -509,38 +615,69 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
return;
}
case ISD::SRL: {
- // We don't need this transform if zext.h is supported.
- if (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp())
+ // Optimize (srl (and X, C2), C) ->
+ // (srli (slli X, (XLen-C3)), (XLen-C3) + C)
+ // Where C2 is a mask with C3 trailing ones.
+ // Taking into account that the C2 may have had lower bits unset by
+ // SimplifyDemandedBits. This avoids materializing the C2 immediate.
+ // This pattern occurs when type legalizing right shifts for types with
+ // less than XLen bits.
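+ // For example, on RV64 with C2 == 0xffff and C == 4 (so C3 == 16), this turns
+ // (srl (and X, 0xffff), 4) into (srli (slli X, 48), 52).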
+ auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+ if (!N1C)
+ break;
+ SDValue N0 = Node->getOperand(0);
+ if (N0.getOpcode() != ISD::AND || !N0.hasOneUse() ||
+ !isa<ConstantSDNode>(N0.getOperand(1)))
+ break;
+ unsigned ShAmt = N1C->getZExtValue();
+ uint64_t Mask = N0.getConstantOperandVal(1);
+ Mask |= maskTrailingOnes<uint64_t>(ShAmt);
+ if (!isMask_64(Mask))
+ break;
+ unsigned TrailingOnes = countTrailingOnes(Mask);
+ // 32 trailing ones should use srliw via tablegen pattern.
+ if (TrailingOnes == 32 || ShAmt >= TrailingOnes)
break;
- // Optimize (srl (and X, 0xffff), C) ->
- // (srli (slli X, (XLen-16), (XLen-16) + C)
- // Taking into account that the 0xffff may have had lower bits unset by
- // SimplifyDemandedBits. This avoids materializing the 0xffff immediate.
- // This pattern occurs when type legalizing i16 right shifts.
- // FIXME: This could be extended to other AND masks.
+ unsigned LShAmt = Subtarget->getXLen() - TrailingOnes;
+ SDNode *SLLI =
+ CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0),
+ CurDAG->getTargetConstant(LShAmt, DL, VT));
+ SDNode *SRLI = CurDAG->getMachineNode(
+ RISCV::SRLI, DL, VT, SDValue(SLLI, 0),
+ CurDAG->getTargetConstant(LShAmt + ShAmt, DL, VT));
+ ReplaceNode(Node, SRLI);
+ return;
+ }
+ case ISD::SRA: {
+ // Optimize (sra (sext_inreg X, i16), C) ->
+ // (srai (slli X, (XLen-16)), (XLen-16) + C)
+ // And (sra (sext_inreg X, i8), C) ->
+ // (srai (slli X, (XLen-8)), (XLen-8) + C)
+ // This can occur when Zbb is enabled, which makes sext_inreg i16/i8 legal.
+ // This transform matches the code we get without Zbb. The shifts are more
+ // compressible, and this can help expose CSE opportunities in the sdiv by
+ // constant optimization.
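+ // For example, on RV64 (sra (sext_inreg X, i16), 3) becomes
+ // (srai (slli X, 48), 51).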
auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
- if (N1C) {
- uint64_t ShAmt = N1C->getZExtValue();
- SDValue N0 = Node->getOperand(0);
- if (ShAmt < 16 && N0.getOpcode() == ISD::AND && N0.hasOneUse() &&
- isa<ConstantSDNode>(N0.getOperand(1))) {
- uint64_t Mask = N0.getConstantOperandVal(1);
- Mask |= maskTrailingOnes<uint64_t>(ShAmt);
- if (Mask == 0xffff) {
- unsigned LShAmt = Subtarget->getXLen() - 16;
- SDNode *SLLI =
- CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0),
- CurDAG->getTargetConstant(LShAmt, DL, VT));
- SDNode *SRLI = CurDAG->getMachineNode(
- RISCV::SRLI, DL, VT, SDValue(SLLI, 0),
- CurDAG->getTargetConstant(LShAmt + ShAmt, DL, VT));
- ReplaceNode(Node, SRLI);
- return;
- }
- }
- }
-
- break;
+ if (!N1C)
+ break;
+ SDValue N0 = Node->getOperand(0);
+ if (N0.getOpcode() != ISD::SIGN_EXTEND_INREG || !N0.hasOneUse())
+ break;
+ unsigned ShAmt = N1C->getZExtValue();
+ unsigned ExtSize =
+ cast<VTSDNode>(N0.getOperand(1))->getVT().getSizeInBits();
+ // ExtSize of 32 should use sraiw via tablegen pattern.
+ if (ExtSize >= 32 || ShAmt >= ExtSize)
+ break;
+ unsigned LShAmt = Subtarget->getXLen() - ExtSize;
+ SDNode *SLLI =
+ CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0),
+ CurDAG->getTargetConstant(LShAmt, DL, VT));
+ SDNode *SRAI = CurDAG->getMachineNode(
+ RISCV::SRAI, DL, VT, SDValue(SLLI, 0),
+ CurDAG->getTargetConstant(LShAmt + ShAmt, DL, VT));
+ ReplaceNode(Node, SRAI);
+ return;
}
case ISD::AND: {
auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
@@ -774,7 +911,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
ShiftedC1 = SignExtend64(ShiftedC1, 32);
// Create (mulhu (slli X, lzcnt(C2)), C1 << (XLen - lzcnt(C2))).
- SDNode *Imm = selectImm(CurDAG, DL, ShiftedC1, *Subtarget);
+ SDNode *Imm = selectImm(CurDAG, DL, VT, ShiftedC1, *Subtarget);
SDNode *SLLI =
CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0),
CurDAG->getTargetConstant(LeadingZeros, DL, VT));
@@ -793,62 +930,52 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::riscv_vmsge: {
SDValue Src1 = Node->getOperand(1);
SDValue Src2 = Node->getOperand(2);
+ bool IsUnsigned = IntNo == Intrinsic::riscv_vmsgeu;
+ bool IsCmpUnsignedZero = false;
// Only custom select scalar second operand.
if (Src2.getValueType() != XLenVT)
break;
// Small constants are handled with patterns.
if (auto *C = dyn_cast<ConstantSDNode>(Src2)) {
int64_t CVal = C->getSExtValue();
- if (CVal >= -15 && CVal <= 16)
- break;
+ if (CVal >= -15 && CVal <= 16) {
+ if (!IsUnsigned || CVal != 0)
+ break;
+ IsCmpUnsignedZero = true;
+ }
}
- bool IsUnsigned = IntNo == Intrinsic::riscv_vmsgeu;
MVT Src1VT = Src1.getSimpleValueType();
- unsigned VMSLTOpcode, VMNANDOpcode;
+ unsigned VMSLTOpcode, VMNANDOpcode, VMSetOpcode;
switch (RISCVTargetLowering::getLMUL(Src1VT)) {
default:
llvm_unreachable("Unexpected LMUL!");
- case RISCVII::VLMUL::LMUL_F8:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF8 : RISCV::PseudoVMSLT_VX_MF8;
- VMNANDOpcode = RISCV::PseudoVMNAND_MM_MF8;
- break;
- case RISCVII::VLMUL::LMUL_F4:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF4 : RISCV::PseudoVMSLT_VX_MF4;
- VMNANDOpcode = RISCV::PseudoVMNAND_MM_MF4;
- break;
- case RISCVII::VLMUL::LMUL_F2:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF2 : RISCV::PseudoVMSLT_VX_MF2;
- VMNANDOpcode = RISCV::PseudoVMNAND_MM_MF2;
- break;
- case RISCVII::VLMUL::LMUL_1:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_M1 : RISCV::PseudoVMSLT_VX_M1;
- VMNANDOpcode = RISCV::PseudoVMNAND_MM_M1;
- break;
- case RISCVII::VLMUL::LMUL_2:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_M2 : RISCV::PseudoVMSLT_VX_M2;
- VMNANDOpcode = RISCV::PseudoVMNAND_MM_M2;
- break;
- case RISCVII::VLMUL::LMUL_4:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_M4 : RISCV::PseudoVMSLT_VX_M4;
- VMNANDOpcode = RISCV::PseudoVMNAND_MM_M4;
- break;
- case RISCVII::VLMUL::LMUL_8:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_M8 : RISCV::PseudoVMSLT_VX_M8;
- VMNANDOpcode = RISCV::PseudoVMNAND_MM_M8;
- break;
+#define CASE_VMSLT_VMNAND_VMSET_OPCODES(lmulenum, suffix, suffix_b) \
+ case RISCVII::VLMUL::lmulenum: \
+ VMSLTOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_##suffix \
+ : RISCV::PseudoVMSLT_VX_##suffix; \
+ VMNANDOpcode = RISCV::PseudoVMNAND_MM_##suffix; \
+ VMSetOpcode = RISCV::PseudoVMSET_M_##suffix_b; \
+ break;
+ CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_F8, MF8, B1)
+ CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_F4, MF4, B2)
+ CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_F2, MF2, B4)
+ CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_1, M1, B8)
+ CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_2, M2, B16)
+ CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_4, M4, B32)
+ CASE_VMSLT_VMNAND_VMSET_OPCODES(LMUL_8, M8, B64)
+#undef CASE_VMSLT_VMNAND_VMSET_OPCODES
}
SDValue SEW = CurDAG->getTargetConstant(
Log2_32(Src1VT.getScalarSizeInBits()), DL, XLenVT);
SDValue VL;
selectVLOp(Node->getOperand(3), VL);
+ // If this is vmsgeu with immediate 0, expand it to vmset.
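+ // (An unsigned value is always >= 0, so the comparison result is all ones.)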
+ if (IsCmpUnsignedZero) {
+ ReplaceNode(Node, CurDAG->getMachineNode(VMSetOpcode, DL, VT, VL, SEW));
+ return;
+ }
+
// Expand to
// vmslt{u}.vx vd, va, x; vmnand.mm vd, vd, vd
SDValue Cmp = SDValue(
@@ -862,96 +989,61 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
case Intrinsic::riscv_vmsge_mask: {
SDValue Src1 = Node->getOperand(2);
SDValue Src2 = Node->getOperand(3);
+ bool IsUnsigned = IntNo == Intrinsic::riscv_vmsgeu_mask;
+ bool IsCmpUnsignedZero = false;
// Only custom select scalar second operand.
if (Src2.getValueType() != XLenVT)
break;
// Small constants are handled with patterns.
if (auto *C = dyn_cast<ConstantSDNode>(Src2)) {
int64_t CVal = C->getSExtValue();
- if (CVal >= -15 && CVal <= 16)
- break;
+ if (CVal >= -15 && CVal <= 16) {
+ if (!IsUnsigned || CVal != 0)
+ break;
+ IsCmpUnsignedZero = true;
+ }
}
- bool IsUnsigned = IntNo == Intrinsic::riscv_vmsgeu_mask;
MVT Src1VT = Src1.getSimpleValueType();
- unsigned VMSLTOpcode, VMSLTMaskOpcode, VMXOROpcode, VMANDNOpcode;
+ unsigned VMSLTOpcode, VMSLTMaskOpcode, VMXOROpcode, VMANDNOpcode,
+ VMSetOpcode, VMANDOpcode;
switch (RISCVTargetLowering::getLMUL(Src1VT)) {
default:
llvm_unreachable("Unexpected LMUL!");
- case RISCVII::VLMUL::LMUL_F8:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF8 : RISCV::PseudoVMSLT_VX_MF8;
- VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF8_MASK
- : RISCV::PseudoVMSLT_VX_MF8_MASK;
- break;
- case RISCVII::VLMUL::LMUL_F4:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF4 : RISCV::PseudoVMSLT_VX_MF4;
- VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF4_MASK
- : RISCV::PseudoVMSLT_VX_MF4_MASK;
- break;
- case RISCVII::VLMUL::LMUL_F2:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF2 : RISCV::PseudoVMSLT_VX_MF2;
- VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_MF2_MASK
- : RISCV::PseudoVMSLT_VX_MF2_MASK;
- break;
- case RISCVII::VLMUL::LMUL_1:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_M1 : RISCV::PseudoVMSLT_VX_M1;
- VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_M1_MASK
- : RISCV::PseudoVMSLT_VX_M1_MASK;
- break;
- case RISCVII::VLMUL::LMUL_2:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_M2 : RISCV::PseudoVMSLT_VX_M2;
- VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_M2_MASK
- : RISCV::PseudoVMSLT_VX_M2_MASK;
- break;
- case RISCVII::VLMUL::LMUL_4:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_M4 : RISCV::PseudoVMSLT_VX_M4;
- VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_M4_MASK
- : RISCV::PseudoVMSLT_VX_M4_MASK;
- break;
- case RISCVII::VLMUL::LMUL_8:
- VMSLTOpcode =
- IsUnsigned ? RISCV::PseudoVMSLTU_VX_M8 : RISCV::PseudoVMSLT_VX_M8;
- VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_M8_MASK
- : RISCV::PseudoVMSLT_VX_M8_MASK;
- break;
+#define CASE_VMSLT_VMSET_OPCODES(lmulenum, suffix, suffix_b) \
+ case RISCVII::VLMUL::lmulenum: \
+ VMSLTOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_##suffix \
+ : RISCV::PseudoVMSLT_VX_##suffix; \
+ VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_##suffix##_MASK \
+ : RISCV::PseudoVMSLT_VX_##suffix##_MASK; \
+ VMSetOpcode = RISCV::PseudoVMSET_M_##suffix_b; \
+ break;
+ CASE_VMSLT_VMSET_OPCODES(LMUL_F8, MF8, B1)
+ CASE_VMSLT_VMSET_OPCODES(LMUL_F4, MF4, B2)
+ CASE_VMSLT_VMSET_OPCODES(LMUL_F2, MF2, B4)
+ CASE_VMSLT_VMSET_OPCODES(LMUL_1, M1, B8)
+ CASE_VMSLT_VMSET_OPCODES(LMUL_2, M2, B16)
+ CASE_VMSLT_VMSET_OPCODES(LMUL_4, M4, B32)
+ CASE_VMSLT_VMSET_OPCODES(LMUL_8, M8, B64)
+#undef CASE_VMSLT_VMSET_OPCODES
}
// Mask operations use the LMUL from the mask type.
switch (RISCVTargetLowering::getLMUL(VT)) {
default:
llvm_unreachable("Unexpected LMUL!");
- case RISCVII::VLMUL::LMUL_F8:
- VMXOROpcode = RISCV::PseudoVMXOR_MM_MF8;
- VMANDNOpcode = RISCV::PseudoVMANDN_MM_MF8;
- break;
- case RISCVII::VLMUL::LMUL_F4:
- VMXOROpcode = RISCV::PseudoVMXOR_MM_MF4;
- VMANDNOpcode = RISCV::PseudoVMANDN_MM_MF4;
- break;
- case RISCVII::VLMUL::LMUL_F2:
- VMXOROpcode = RISCV::PseudoVMXOR_MM_MF2;
- VMANDNOpcode = RISCV::PseudoVMANDN_MM_MF2;
- break;
- case RISCVII::VLMUL::LMUL_1:
- VMXOROpcode = RISCV::PseudoVMXOR_MM_M1;
- VMANDNOpcode = RISCV::PseudoVMANDN_MM_M1;
- break;
- case RISCVII::VLMUL::LMUL_2:
- VMXOROpcode = RISCV::PseudoVMXOR_MM_M2;
- VMANDNOpcode = RISCV::PseudoVMANDN_MM_M2;
- break;
- case RISCVII::VLMUL::LMUL_4:
- VMXOROpcode = RISCV::PseudoVMXOR_MM_M4;
- VMANDNOpcode = RISCV::PseudoVMANDN_MM_M4;
- break;
- case RISCVII::VLMUL::LMUL_8:
- VMXOROpcode = RISCV::PseudoVMXOR_MM_M8;
- VMANDNOpcode = RISCV::PseudoVMANDN_MM_M8;
- break;
+#define CASE_VMXOR_VMANDN_VMAND_OPCODES(lmulenum, suffix) \
+ case RISCVII::VLMUL::lmulenum: \
+ VMXOROpcode = RISCV::PseudoVMXOR_MM_##suffix; \
+ VMANDNOpcode = RISCV::PseudoVMANDN_MM_##suffix; \
+ VMANDOpcode = RISCV::PseudoVMAND_MM_##suffix; \
+ break;
+ CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_F8, MF8)
+ CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_F4, MF4)
+ CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_F2, MF2)
+ CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_1, M1)
+ CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_2, M2)
+ CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_4, M4)
+ CASE_VMXOR_VMANDN_VMAND_OPCODES(LMUL_8, M8)
+#undef CASE_VMXOR_VMANDN_VMAND_OPCODES
}
SDValue SEW = CurDAG->getTargetConstant(
Log2_32(Src1VT.getScalarSizeInBits()), DL, XLenVT);
@@ -960,6 +1052,16 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
selectVLOp(Node->getOperand(5), VL);
SDValue MaskedOff = Node->getOperand(1);
SDValue Mask = Node->getOperand(4);
+
+ // If this is vmsgeu_mask with immediate 0, expand it to {vmset, vmand}.
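+ // (The unmasked comparison is all ones; it is then combined with the mask
+ // using vmand.)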
+ if (IsCmpUnsignedZero) {
+ SDValue VMSet =
+ SDValue(CurDAG->getMachineNode(VMSetOpcode, DL, VT, VL, SEW), 0);
+ ReplaceNode(Node, CurDAG->getMachineNode(VMANDOpcode, DL, VT,
+ {Mask, VMSet, VL, MaskSEW}));
+ return;
+ }
+
// If the MaskedOff value and the Mask are the same value use
// vmslt{u}.vx vt, va, x; vmandn.mm vd, vd, vt
// This avoids needing to copy v0 to vd before starting the next sequence.
@@ -988,6 +1090,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
{Cmp, Mask, VL, MaskSEW}));
return;
}
+ case Intrinsic::riscv_vsetvli_opt:
+ case Intrinsic::riscv_vsetvlimax_opt:
+ return selectVSETVLI(Node);
}
break;
}
@@ -997,54 +1102,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// By default we do not custom select any intrinsic.
default:
break;
-
case Intrinsic::riscv_vsetvli:
- case Intrinsic::riscv_vsetvlimax: {
- if (!Subtarget->hasVInstructions())
- break;
-
- bool VLMax = IntNo == Intrinsic::riscv_vsetvlimax;
- unsigned Offset = VLMax ? 2 : 3;
-
- assert(Node->getNumOperands() == Offset + 2 &&
- "Unexpected number of operands");
-
- unsigned SEW =
- RISCVVType::decodeVSEW(Node->getConstantOperandVal(Offset) & 0x7);
- RISCVII::VLMUL VLMul = static_cast<RISCVII::VLMUL>(
- Node->getConstantOperandVal(Offset + 1) & 0x7);
-
- unsigned VTypeI = RISCVVType::encodeVTYPE(
- VLMul, SEW, /*TailAgnostic*/ true, /*MaskAgnostic*/ false);
- SDValue VTypeIOp = CurDAG->getTargetConstant(VTypeI, DL, XLenVT);
-
- SDValue VLOperand;
- unsigned Opcode = RISCV::PseudoVSETVLI;
- if (VLMax) {
- VLOperand = CurDAG->getRegister(RISCV::X0, XLenVT);
- Opcode = RISCV::PseudoVSETVLIX0;
- } else {
- VLOperand = Node->getOperand(2);
-
- if (auto *C = dyn_cast<ConstantSDNode>(VLOperand)) {
- uint64_t AVL = C->getZExtValue();
- if (isUInt<5>(AVL)) {
- SDValue VLImm = CurDAG->getTargetConstant(AVL, DL, XLenVT);
- ReplaceNode(
- Node, CurDAG->getMachineNode(RISCV::PseudoVSETIVLI, DL, XLenVT,
- MVT::Other, VLImm, VTypeIOp,
- /* Chain */ Node->getOperand(0)));
- return;
- }
- }
- }
-
- ReplaceNode(Node,
- CurDAG->getMachineNode(Opcode, DL, XLenVT,
- MVT::Other, VLOperand, VTypeIOp,
- /* Chain */ Node->getOperand(0)));
- return;
- }
+ case Intrinsic::riscv_vsetvlimax:
+ return selectVSETVLI(Node);
case Intrinsic::riscv_vlseg2:
case Intrinsic::riscv_vlseg3:
case Intrinsic::riscv_vlseg4:
@@ -1154,9 +1214,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
unsigned CurOp = 2;
+ // Masked intrinsics only have TU versions of the pseudo instructions.
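+ // (TU = tail undisturbed, TA = tail agnostic.)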
+ bool IsTU = IsMasked || (!IsMasked && !Node->getOperand(CurOp).isUndef());
SmallVector<SDValue, 8> Operands;
- if (IsMasked)
+ if (IsTU)
Operands.push_back(Node->getOperand(CurOp++));
+ else
+ // Skip the undef passthru operand for the nomask TA version pseudo.
+ CurOp++;
MVT IndexVT;
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
@@ -1169,8 +1234,12 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT);
unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits());
+ if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) {
+ report_fatal_error("The V extension does not support EEW=64 for index "
+ "values when XLEN=32");
+ }
const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo(
- IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL),
+ IsMasked, IsTU, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL),
static_cast<unsigned>(IndexLMUL));
MachineSDNode *Load =
CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
@@ -1195,16 +1264,25 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
unsigned CurOp = 2;
+ // The riscv_vlm intrinsic is always tail agnostic and has no passthru operand.
+ bool HasPassthruOperand = IntNo != Intrinsic::riscv_vlm;
+ // Masked intrinsics only have TU versions of the pseudo instructions.
+ bool IsTU =
+ HasPassthruOperand &&
+ ((!IsMasked && !Node->getOperand(CurOp).isUndef()) || IsMasked);
SmallVector<SDValue, 8> Operands;
- if (IsMasked)
+ if (IsTU)
Operands.push_back(Node->getOperand(CurOp++));
+ else if (HasPassthruOperand)
+ // Skip the undef passthru operand for the nomask TA version pseudo.
+ CurOp++;
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked, IsStrided,
Operands, /*IsLoad=*/true);
RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
const RISCV::VLEPseudo *P =
- RISCV::getVLEPseudo(IsMasked, IsStrided, /*FF*/ false, Log2SEW,
+ RISCV::getVLEPseudo(IsMasked, IsTU, IsStrided, /*FF*/ false, Log2SEW,
static_cast<unsigned>(LMUL));
MachineSDNode *Load =
CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
@@ -1223,9 +1301,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
unsigned CurOp = 2;
+ // Masked intrinsics only have TU versions of the pseudo instructions.
+ bool IsTU = IsMasked || (!IsMasked && !Node->getOperand(CurOp).isUndef());
SmallVector<SDValue, 7> Operands;
- if (IsMasked)
+ if (IsTU)
Operands.push_back(Node->getOperand(CurOp++));
+ else
+ // Skip the undef passthru operand for the nomask TA version pseudo.
+ CurOp++;
addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked,
/*IsStridedOrIndexed*/ false, Operands,
@@ -1233,8 +1316,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
const RISCV::VLEPseudo *P =
- RISCV::getVLEPseudo(IsMasked, /*Strided*/ false, /*FF*/ true, Log2SEW,
- static_cast<unsigned>(LMUL));
+ RISCV::getVLEPseudo(IsMasked, IsTU, /*Strided*/ false, /*FF*/ true,
+ Log2SEW, static_cast<unsigned>(LMUL));
MachineSDNode *Load =
CurDAG->getMachineNode(P->Pseudo, DL, Node->getValueType(0),
MVT::Other, MVT::Glue, Operands);
@@ -1359,9 +1442,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT);
unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits());
+ if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) {
+ report_fatal_error("The V extension does not support EEW=64 for index "
+ "values when XLEN=32");
+ }
const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo(
- IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL),
- static_cast<unsigned>(IndexLMUL));
+ IsMasked, /*TU*/ false, IsOrdered, IndexLog2EEW,
+ static_cast<unsigned>(LMUL), static_cast<unsigned>(IndexLMUL));
MachineSDNode *Store =
CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
@@ -1516,10 +1603,16 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
return;
}
case ISD::SPLAT_VECTOR:
+ case RISCVISD::VMV_S_X_VL:
+ case RISCVISD::VFMV_S_F_VL:
case RISCVISD::VMV_V_X_VL:
case RISCVISD::VFMV_V_F_VL: {
// Try to match a splat of a scalar load to a strided load with stride of x0.
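// (A strided load with stride x0, i.e. stride 0, reads the same address for
// every element, which broadcasts the loaded scalar.)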
- SDValue Src = Node->getOperand(0);
+ bool IsScalarMove = Node->getOpcode() == RISCVISD::VMV_S_X_VL ||
+ Node->getOpcode() == RISCVISD::VFMV_S_F_VL;
+ if (IsScalarMove && !Node->getOperand(0).isUndef())
+ break;
+ SDValue Src = IsScalarMove ? Node->getOperand(1) : Node->getOperand(0);
auto *Ld = dyn_cast<LoadSDNode>(Src);
if (!Ld)
break;
@@ -1534,7 +1627,13 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
SDValue VL;
if (Node->getOpcode() == ISD::SPLAT_VECTOR)
VL = CurDAG->getTargetConstant(RISCV::VLMaxSentinel, DL, XLenVT);
- else
+ else if (IsScalarMove) {
+ // We could deal with more VL if we update the VSETVLI insert pass to
+ // avoid introducing more VSETVLI.
+ if (!isOneConstant(Node->getOperand(2)))
+ break;
+ selectVLOp(Node->getOperand(2), VL);
+ } else
selectVLOp(Node->getOperand(1), VL);
unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits());
@@ -1546,8 +1645,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT);
const RISCV::VLEPseudo *P = RISCV::getVLEPseudo(
- /*IsMasked*/ false, /*IsStrided*/ true, /*FF*/ false, Log2SEW,
- static_cast<unsigned>(LMUL));
+ /*IsMasked*/ false, /*IsTU*/ false, /*IsStrided*/ true, /*FF*/ false,
+ Log2SEW, static_cast<unsigned>(LMUL));
MachineSDNode *Load =
CurDAG->getMachineNode(P->Pseudo, DL, Node->getVTList(), Operands);
@@ -1727,6 +1826,20 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const {
if (Bits < Subtarget->getXLen() - User->getConstantOperandVal(1))
return false;
break;
+ case RISCV::ANDI:
+ if (Bits < (64 - countLeadingZeros(User->getConstantOperandVal(1))))
+ return false;
+ break;
+ case RISCV::SEXTB:
+ if (Bits < 8)
+ return false;
+ break;
+ case RISCV::SEXTH:
+ case RISCV::ZEXTH_RV32:
+ case RISCV::ZEXTH_RV64:
+ if (Bits < 16)
+ return false;
+ break;
case RISCV::ADDUW:
case RISCV::SH1ADDUW:
case RISCV::SH2ADDUW:
@@ -1758,7 +1871,8 @@ bool RISCVDAGToDAGISel::hasAllNBitUsers(SDNode *Node, unsigned Bits) const {
// allows us to choose between VSETIVLI or VSETVLI later.
bool RISCVDAGToDAGISel::selectVLOp(SDValue N, SDValue &VL) {
auto *C = dyn_cast<ConstantSDNode>(N);
- if (C && isUInt<5>(C->getZExtValue()))
+ if (C && (isUInt<5>(C->getZExtValue()) ||
+ C->getSExtValue() == RISCV::VLMaxSentinel))
VL = CurDAG->getTargetConstant(C->getZExtValue(), SDLoc(N),
N->getValueType(0));
else
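
The new hasAllNBitUsers cases above decide how many low bits a user instruction actually reads: for ANDI that is the index of the highest set bit of the immediate (64 - countLeadingZeros), 8 for SEXT.B, and 16 for SEXT.H/ZEXT.H. A minimal standalone sketch of the ANDI calculation (not the LLVM code; it assumes the GCC/Clang __builtin_clzll builtin):

    #include <cassert>
    #include <cstdint>

    // Sketch: the number of low bits an ANDI user reads is the position of the
    // highest set bit of its immediate, i.e. 64 - countLeadingZeros(Imm) for a
    // 64-bit immediate.
    static unsigned bitsReadByAndImmediate(uint64_t Imm) {
      return Imm == 0 ? 0 : 64 - __builtin_clzll(Imm);
    }

    int main() {
      assert(bitsReadByAndImmediate(0xff) == 8);   // andi with 0xff reads 8 bits
      assert(bitsReadByAndImmediate(0x1f) == 5);   // andi with 0x1f reads 5 bits
      assert(bitsReadByAndImmediate(0x800) == 12); // highest set bit is bit 11
      return 0;
    }
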
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index a2770089995d..c429a9298739 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -87,6 +87,8 @@ public:
void selectVSSEG(SDNode *Node, bool IsMasked, bool IsStrided);
void selectVSXSEG(SDNode *Node, bool IsMasked, bool IsOrdered);
+ void selectVSETVLI(SDNode *Node);
+
// Return the RISC-V condition code that matches the given DAG integer
// condition code. The CondCode must be one of those supported by the RISC-V
// ISA (see translateSetCCForBranch).
@@ -159,6 +161,7 @@ struct VSXSEGPseudo {
struct VLEPseudo {
uint16_t Masked : 1;
+ uint16_t IsTU : 1;
uint16_t Strided : 1;
uint16_t FF : 1;
uint16_t Log2SEW : 3;
@@ -176,6 +179,7 @@ struct VSEPseudo {
struct VLX_VSXPseudo {
uint16_t Masked : 1;
+ uint16_t IsTU : 1;
uint16_t Ordered : 1;
uint16_t Log2SEW : 3;
uint16_t LMUL : 3;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 4f5512e6fb37..5cc3aa35d4d2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/ValueTypes.h"
@@ -249,7 +250,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SRL_PARTS, XLenVT, Custom);
setOperationAction(ISD::SRA_PARTS, XLenVT, Custom);
- if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp()) {
+ if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp() ||
+ Subtarget.hasStdExtZbkb()) {
if (Subtarget.is64Bit()) {
setOperationAction(ISD::ROTL, MVT::i32, Custom);
setOperationAction(ISD::ROTR, MVT::i32, Custom);
@@ -277,7 +279,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// With Zbb we have an XLen rev8 instruction, but not GREVI. So we'll
// pattern match it directly in isel.
setOperationAction(ISD::BSWAP, XLenVT,
- Subtarget.hasStdExtZbb() ? Legal : Expand);
+ (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbkb())
+ ? Legal
+ : Expand);
}
if (Subtarget.hasStdExtZbb()) {
@@ -330,6 +334,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LLRINT, MVT::f16, Legal);
setOperationAction(ISD::LROUND, MVT::f16, Legal);
setOperationAction(ISD::LLROUND, MVT::f16, Legal);
+ setOperationAction(ISD::STRICT_LRINT, MVT::f16, Legal);
+ setOperationAction(ISD::STRICT_LLRINT, MVT::f16, Legal);
+ setOperationAction(ISD::STRICT_LROUND, MVT::f16, Legal);
+ setOperationAction(ISD::STRICT_LLROUND, MVT::f16, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::f16, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::f16, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f16, Legal);
@@ -338,6 +346,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Legal);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Legal);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Legal);
for (auto CC : FPCCToExpand)
setCondCodeAction(CC, MVT::f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
@@ -363,6 +373,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::FLOG10, MVT::f16, Promote);
+ // FIXME: Need to promote f16 STRICT_* to f32 libcalls, but we don't have
+ // complete support for all operations in LegalizeDAG.
+
// We need to custom promote this.
if (Subtarget.is64Bit())
setOperationAction(ISD::FPOWI, MVT::i32, Custom);
@@ -375,12 +388,18 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LLRINT, MVT::f32, Legal);
setOperationAction(ISD::LROUND, MVT::f32, Legal);
setOperationAction(ISD::LLROUND, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_LRINT, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_LLRINT, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_LROUND, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_LLROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);
for (auto CC : FPCCToExpand)
setCondCodeAction(CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
@@ -402,6 +421,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LLRINT, MVT::f64, Legal);
setOperationAction(ISD::LROUND, MVT::f64, Legal);
setOperationAction(ISD::LLROUND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_LRINT, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_LLRINT, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_LROUND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_LLROUND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
@@ -410,6 +433,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
for (auto CC : FPCCToExpand)
setCondCodeAction(CC, MVT::f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
@@ -499,12 +524,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_SHL, ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND,
ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR, ISD::VP_REDUCE_SMAX,
ISD::VP_REDUCE_SMIN, ISD::VP_REDUCE_UMAX, ISD::VP_REDUCE_UMIN,
- ISD::VP_SELECT};
+ ISD::VP_MERGE, ISD::VP_SELECT};
static const unsigned FloatingPointVPOps[] = {
ISD::VP_FADD, ISD::VP_FSUB, ISD::VP_FMUL,
ISD::VP_FDIV, ISD::VP_REDUCE_FADD, ISD::VP_REDUCE_SEQ_FADD,
- ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX, ISD::VP_SELECT};
+ ISD::VP_REDUCE_FMIN, ISD::VP_REDUCE_FMAX, ISD::VP_MERGE,
+ ISD::VP_SELECT};
if (!Subtarget.is64Bit()) {
// We must custom-lower certain vXi64 operations on RV32 due to the vector
@@ -546,6 +572,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
+ setOperationAction(ISD::VP_MERGE, VT, Expand);
+ setOperationAction(ISD::VP_SELECT, VT, Expand);
setOperationAction(ISD::VP_AND, VT, Custom);
setOperationAction(ISD::VP_OR, VT, Custom);
@@ -590,6 +618,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+ // nxvXi64 MULHS/MULHU require the V extension instead of Zve64*.
+ if (VT.getVectorElementType() == MVT::i64 && !Subtarget.hasStdExtV()) {
+ setOperationAction(ISD::MULHU, VT, Expand);
+ setOperationAction(ISD::MULHS, VT, Expand);
+ }
+
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
@@ -886,8 +920,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UMAX, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
- setOperationAction(ISD::MULHS, VT, Custom);
- setOperationAction(ISD::MULHU, VT, Custom);
+ // vXi64 MULHS/MULHU require the V extension instead of Zve64*.
+ if (VT.getVectorElementType() != MVT::i64 || Subtarget.hasStdExtV()) {
+ setOperationAction(ISD::MULHS, VT, Custom);
+ setOperationAction(ISD::MULHU, VT, Custom);
+ }
setOperationAction(ISD::SADDSAT, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Custom);
@@ -1002,9 +1039,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
setOperationAction(ISD::BITCAST, MVT::i32, Custom);
setOperationAction(ISD::BITCAST, MVT::i64, Custom);
- setOperationAction(ISD::BITCAST, MVT::f16, Custom);
- setOperationAction(ISD::BITCAST, MVT::f32, Custom);
- setOperationAction(ISD::BITCAST, MVT::f64, Custom);
+ if (Subtarget.hasStdExtZfh())
+ setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+ if (Subtarget.hasStdExtF())
+ setOperationAction(ISD::BITCAST, MVT::f32, Custom);
+ if (Subtarget.hasStdExtD())
+ setOperationAction(ISD::BITCAST, MVT::f64, Custom);
}
}
@@ -1024,7 +1064,13 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::ANY_EXTEND);
- setTargetDAGCombine(ISD::ZERO_EXTEND);
+ if (Subtarget.hasStdExtF()) {
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::FP_TO_SINT);
+ setTargetDAGCombine(ISD::FP_TO_UINT);
+ setTargetDAGCombine(ISD::FP_TO_SINT_SAT);
+ setTargetDAGCombine(ISD::FP_TO_UINT_SAT);
+ }
if (Subtarget.hasVInstructions()) {
setTargetDAGCombine(ISD::FCOPYSIGN);
setTargetDAGCombine(ISD::MGATHER);
@@ -1072,7 +1118,7 @@ bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::riscv_masked_cmpxchg_i32: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.memVT = MVT::getVT(PtrTy->getPointerElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = Align(4);
@@ -1158,10 +1204,11 @@ bool RISCVTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
bool RISCVTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
// Zexts are free if they can be combined with a load.
+ // Don't advertise i32->i64 zextload as being free for RV64. It interacts
+ // poorly with type legalization of compares preferring sext.
if (auto *LD = dyn_cast<LoadSDNode>(Val)) {
EVT MemVT = LD->getMemoryVT();
- if ((MemVT == MVT::i8 || MemVT == MVT::i16 ||
- (Subtarget.is64Bit() && MemVT == MVT::i32)) &&
+ if ((MemVT == MVT::i8 || MemVT == MVT::i16) &&
(LD->getExtensionType() == ISD::NON_EXTLOAD ||
LD->getExtensionType() == ISD::ZEXTLOAD))
return true;
@@ -1189,7 +1236,9 @@ bool RISCVTargetLowering::hasAndNotCompare(SDValue Y) const {
if (VT.isVector())
return false;
- return Subtarget.hasStdExtZbb() && !isa<ConstantSDNode>(Y);
+ return (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp() ||
+ Subtarget.hasStdExtZbkb()) &&
+ !isa<ConstantSDNode>(Y);
}
/// Check if sinking \p I's operands to I's basic block is profitable, because
@@ -1230,6 +1279,30 @@ bool RISCVTargetLowering::shouldSinkOperands(
switch (II->getIntrinsicID()) {
case Intrinsic::fma:
return Operand == 0 || Operand == 1;
+ // FIXME: Our patterns can only match vx/vf instructions when the splat
+ // is on the RHS, because TableGen doesn't recognize our VP operations
+ // as commutative.
+ case Intrinsic::vp_add:
+ case Intrinsic::vp_mul:
+ case Intrinsic::vp_and:
+ case Intrinsic::vp_or:
+ case Intrinsic::vp_xor:
+ case Intrinsic::vp_fadd:
+ case Intrinsic::vp_fmul:
+ case Intrinsic::vp_shl:
+ case Intrinsic::vp_lshr:
+ case Intrinsic::vp_ashr:
+ case Intrinsic::vp_udiv:
+ case Intrinsic::vp_sdiv:
+ case Intrinsic::vp_urem:
+ case Intrinsic::vp_srem:
+ return Operand == 1;
+ // ... with the exception of vp.sub/vp.fsub/vp.fdiv, which have
+ // explicit patterns for both LHS and RHS (as 'vr' versions).
+ case Intrinsic::vp_sub:
+ case Intrinsic::vp_fsub:
+ case Intrinsic::vp_fdiv:
+ return Operand == 0 || Operand == 1;
default:
return false;
}
@@ -1277,8 +1350,6 @@ bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
return false;
if (VT == MVT::f64 && !Subtarget.hasStdExtD())
return false;
- if (Imm.isNegZero())
- return false;
return Imm.isZero();
}
@@ -1482,6 +1553,19 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(Type *ScalarTy) const {
return false;
}
+static SDValue getVLOperand(SDValue Op) {
+ assert((Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN) &&
+ "Unexpected opcode");
+ bool HasChain = Op.getOpcode() == ISD::INTRINSIC_W_CHAIN;
+ unsigned IntNo = Op.getConstantOperandVal(HasChain ? 1 : 0);
+ const RISCVVIntrinsicsTable::RISCVVIntrinsicInfo *II =
+ RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntNo);
+ if (!II)
+ return SDValue();
+ return Op.getOperand(II->VLOperand + 1 + HasChain);
+}
+
static bool useRVVForFixedLengthVectorVT(MVT VT,
const RISCVSubtarget &Subtarget) {
assert(VT.isFixedLengthVector() && "Expected a fixed length vector type!");
@@ -1667,7 +1751,8 @@ bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
return false;
}
-static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) {
+static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
// RISCV FP-to-int conversions saturate to the destination register size, but
// don't produce 0 for nan. We can use a conversion instruction and fix the
// nan case with a compare and a select.
@@ -1679,15 +1764,17 @@ static SDValue lowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) {
bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
unsigned Opc;
if (SatVT == DstVT)
- Opc = IsSigned ? RISCVISD::FCVT_X_RTZ : RISCVISD::FCVT_XU_RTZ;
+ Opc = IsSigned ? RISCVISD::FCVT_X : RISCVISD::FCVT_XU;
else if (DstVT == MVT::i64 && SatVT == MVT::i32)
- Opc = IsSigned ? RISCVISD::FCVT_W_RTZ_RV64 : RISCVISD::FCVT_WU_RTZ_RV64;
+ Opc = IsSigned ? RISCVISD::FCVT_W_RV64 : RISCVISD::FCVT_WU_RV64;
else
return SDValue();
// FIXME: Support other SatVTs by clamping before or after the conversion.
SDLoc DL(Op);
- SDValue FpToInt = DAG.getNode(Opc, DL, DstVT, Src);
+ SDValue FpToInt = DAG.getNode(
+ Opc, DL, DstVT, Src,
+ DAG.getTargetConstant(RISCVFPRndMode::RTZ, DL, Subtarget.getXLenVT()));
SDValue ZeroInt = DAG.getConstant(0, DL, DstVT);
return DAG.getSelectCC(DL, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
@@ -1898,6 +1985,8 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// codegen across RV32 and RV64.
unsigned NumViaIntegerBits =
std::min(std::max(NumElts, 8u), Subtarget.getXLen());
+ NumViaIntegerBits = std::min(NumViaIntegerBits,
+ Subtarget.getMaxELENForFixedLengthVectors());
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
// If we have to use more than one INSERT_VECTOR_ELT then this
// optimization is likely to increase code size; avoid performing it in
@@ -2190,6 +2279,17 @@ static SDValue splatPartsI64WithVL(const SDLoc &DL, MVT VT, SDValue Lo,
// node in order to try and match RVV vector/scalar instructions.
if ((LoC >> 31) == HiC)
return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Lo, VL);
+
+ // If vl is equal to VLMax and the Hi constant is equal to Lo, we can use
+ // vmv.v.x with EEW=32 to lower it.
+ auto *Const = dyn_cast<ConstantSDNode>(VL);
+ if (LoC == HiC && Const && Const->getSExtValue() == RISCV::VLMaxSentinel) {
+ MVT InterVT = MVT::getVectorVT(MVT::i32, VT.getVectorElementCount() * 2);
+ // TODO: If vl <= min(VLMAX), we can also do this. But we cannot
+ // access the subtarget here right now.
+ auto InterVec = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, InterVT, Lo, VL);
+ return DAG.getNode(ISD::BITCAST, DL, VT, InterVec);
+ }
}
// Fall back to a stack store and stride x0 vector load.
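
The new splatPartsI64WithVL case above relies on a simple bit-pattern identity: when the low and high 32-bit halves of the splat value are equal, splatting the 32-bit half with twice the element count produces exactly the bytes of the desired i64 splat, so a bitcast recovers it. A small standalone check of that identity (a sketch, not target code):

    #include <cassert>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    int main() {
      // Lo == Hi, as required by the new lowering path.
      uint32_t Half = 0x80000001u;
      uint64_t Full = (uint64_t(Half) << 32) | Half;

      // "vmv.v.x" with EEW=32 and twice the element count.
      std::vector<uint32_t> AsI32(8, Half);
      // The i64 splat we actually wanted.
      std::vector<uint64_t> AsI64(4, Full);

      // Reinterpreting pairs of identical 32-bit elements as one 64-bit element
      // yields the i64 splat regardless of endianness, because both halves match.
      for (size_t i = 0; i < AsI64.size(); ++i) {
        uint64_t Recast;
        std::memcpy(&Recast, &AsI32[2 * i], sizeof(Recast));
        assert(Recast == AsI64[i]);
      }
      return 0;
    }
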
@@ -2215,8 +2315,13 @@ static SDValue splatSplitI64WithVL(const SDLoc &DL, MVT VT, SDValue Scalar,
static SDValue lowerScalarSplat(SDValue Scalar, SDValue VL, MVT VT, SDLoc DL,
SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
- if (VT.isFloatingPoint())
+ if (VT.isFloatingPoint()) {
+ // If VL is 1, we could use vfmv.s.f.
+ if (isOneConstant(VL))
+ return DAG.getNode(RISCVISD::VFMV_S_F_VL, DL, VT, DAG.getUNDEF(VT),
+ Scalar, VL);
return DAG.getNode(RISCVISD::VFMV_V_F_VL, DL, VT, Scalar, VL);
+ }
MVT XLenVT = Subtarget.getXLenVT();
@@ -2229,16 +2334,98 @@ static SDValue lowerScalarSplat(SDValue Scalar, SDValue VL, MVT VT, SDLoc DL,
unsigned ExtOpc =
isa<ConstantSDNode>(Scalar) ? ISD::SIGN_EXTEND : ISD::ANY_EXTEND;
Scalar = DAG.getNode(ExtOpc, DL, XLenVT, Scalar);
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Scalar);
+ // If VL is 1 and the scalar value won't benefit from an immediate, we can
+ // use vmv.s.x.
+ if (isOneConstant(VL) &&
+ (!Const || isNullConstant(Scalar) || !isInt<5>(Const->getSExtValue())))
+ return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, DAG.getUNDEF(VT), Scalar,
+ VL);
return DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT, Scalar, VL);
}
assert(XLenVT == MVT::i32 && Scalar.getValueType() == MVT::i64 &&
"Unexpected scalar for splat lowering!");
+ if (isOneConstant(VL) && isNullConstant(Scalar))
+ return DAG.getNode(RISCVISD::VMV_S_X_VL, DL, VT, DAG.getUNDEF(VT),
+ DAG.getConstant(0, DL, XLenVT), VL);
+
// Otherwise use the more complicated splatting algorithm.
return splatSplitI64WithVL(DL, VT, Scalar, VL, DAG);
}
+// Is the mask a slidedown that shifts in undefs?
+static int matchShuffleAsSlideDown(ArrayRef<int> Mask) {
+ int Size = Mask.size();
+
+ // Elements shifted in should be undef.
+ auto CheckUndefs = [&](int Shift) {
+ for (int i = Size - Shift; i != Size; ++i)
+ if (Mask[i] >= 0)
+ return false;
+ return true;
+ };
+
+ // Elements should be shifted or undef.
+ auto MatchShift = [&](int Shift) {
+ for (int i = 0; i != Size - Shift; ++i)
+ if (Mask[i] >= 0 && Mask[i] != Shift + i)
+ return false;
+ return true;
+ };
+
+ // Try all possible shifts.
+ for (int Shift = 1; Shift != Size; ++Shift)
+ if (CheckUndefs(Shift) && MatchShift(Shift))
+ return Shift;
+
+ // No match.
+ return -1;
+}
+
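
For reference, a standalone copy of the slidedown matcher above, exercised on a few shuffle masks (a sketch over plain std::vector, using -1 for undef lanes as SelectionDAG does):

    #include <cassert>
    #include <vector>

    // Standalone version of matchShuffleAsSlideDown: returns the slide amount if
    // the mask shifts elements down and fills the tail with undef (-1) lanes.
    static int matchSlideDown(const std::vector<int> &Mask) {
      int Size = static_cast<int>(Mask.size());
      for (int Shift = 1; Shift != Size; ++Shift) {
        bool Match = true;
        // Elements shifted in at the tail must be undef.
        for (int i = Size - Shift; i != Size && Match; ++i)
          Match = Mask[i] < 0;
        // Remaining elements must come from index i + Shift (or be undef).
        for (int i = 0; i != Size - Shift && Match; ++i)
          Match = Mask[i] < 0 || Mask[i] == Shift + i;
        if (Match)
          return Shift;
      }
      return -1;
    }

    int main() {
      assert(matchSlideDown({1, 2, 3, -1}) == 1);  // vslidedown by 1
      assert(matchSlideDown({2, 3, -1, -1}) == 2); // vslidedown by 2
      assert(matchSlideDown({0, 1, 2, 3}) == -1);  // identity, not a slide
      assert(matchSlideDown({1, 0, 3, 2}) == -1);  // swap, not a slide
      return 0;
    }
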
+static bool isInterleaveShuffle(ArrayRef<int> Mask, MVT VT, bool &SwapSources,
+ const RISCVSubtarget &Subtarget) {
+ // We need to be able to widen elements to the next larger integer type.
+ if (VT.getScalarSizeInBits() >= Subtarget.getMaxELENForFixedLengthVectors())
+ return false;
+
+ int Size = Mask.size();
+ assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
+
+ int Srcs[] = {-1, -1};
+ for (int i = 0; i != Size; ++i) {
+ // Ignore undef elements.
+ if (Mask[i] < 0)
+ continue;
+
+ // Is this an even or odd element.
+ int Pol = i % 2;
+
+ // Ensure we consistently use the same source for this element polarity.
+ int Src = Mask[i] / Size;
+ if (Srcs[Pol] < 0)
+ Srcs[Pol] = Src;
+ if (Srcs[Pol] != Src)
+ return false;
+
+ // Make sure the element within the source is appropriate for this element
+ // in the destination.
+ int Elt = Mask[i] % Size;
+ if (Elt != i / 2)
+ return false;
+ }
+
+ // We need to find a source for each polarity and they can't be the same.
+ if (Srcs[0] < 0 || Srcs[1] < 0 || Srcs[0] == Srcs[1])
+ return false;
+
+ // Swap the sources if the second source was in the even polarity.
+ SwapSources = Srcs[0] > Srcs[1];
+
+ return true;
+}
+
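
The interleave detection above accepts masks where the even destination lanes walk one source in order and the odd lanes walk the other, e.g. {0, 4, 1, 5} for two 4-element sources. A hedged standalone sketch of the same test (minus the ELEN subtarget check), showing which masks qualify:

    #include <cassert>
    #include <vector>

    // Sketch of the interleave-mask test: even result lanes must walk one source
    // in order, odd lanes the other. Uses -1 for undef lanes.
    static bool isInterleave(const std::vector<int> &Mask, bool &SwapSources) {
      int Size = static_cast<int>(Mask.size());
      int Srcs[2] = {-1, -1};
      for (int i = 0; i != Size; ++i) {
        if (Mask[i] < 0)
          continue;                 // undef lane
        int Pol = i % 2;            // even or odd destination lane
        int Src = Mask[i] / Size;   // which concatenated source it reads
        if (Srcs[Pol] < 0)
          Srcs[Pol] = Src;
        if (Srcs[Pol] != Src)
          return false;             // inconsistent source for this polarity
        if (Mask[i] % Size != i / 2)
          return false;             // must be the (i/2)-th element of its source
      }
      if (Srcs[0] < 0 || Srcs[1] < 0 || Srcs[0] == Srcs[1])
        return false;
      SwapSources = Srcs[0] > Srcs[1];
      return true;
    }

    int main() {
      bool Swap = false;
      assert(isInterleave({0, 4, 1, 5}, Swap) && !Swap); // interleave V1, V2
      assert(isInterleave({4, 0, 5, 1}, Swap) && Swap);  // same, sources swapped
      assert(!isInterleave({0, 1, 2, 3}, Swap));         // not an interleave
      return 0;
    }
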
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
SDValue V1 = Op.getOperand(0);
@@ -2284,8 +2471,12 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other});
SDValue IntID =
DAG.getTargetConstant(Intrinsic::riscv_vlse, DL, XLenVT);
- SDValue Ops[] = {Ld->getChain(), IntID, NewAddr,
- DAG.getRegister(RISCV::X0, XLenVT), VL};
+ SDValue Ops[] = {Ld->getChain(),
+ IntID,
+ DAG.getUNDEF(ContainerVT),
+ NewAddr,
+ DAG.getRegister(RISCV::X0, XLenVT),
+ VL};
SDValue NewLoad = DAG.getMemIntrinsicNode(
ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, SVT,
DAG.getMachineFunction().getMachineMemOperand(
@@ -2324,10 +2515,97 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
}
}
+ ArrayRef<int> Mask = SVN->getMask();
+
+ // Try to match as a slidedown.
+ int SlideAmt = matchShuffleAsSlideDown(Mask);
+ if (SlideAmt >= 0) {
+ // TODO: Should we reduce the VL to account for the upper undef elements?
+ // Requires additional vsetvlis, but might be faster to execute.
+ V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+ SDValue SlideDown =
+ DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT,
+ DAG.getUNDEF(ContainerVT), V1,
+ DAG.getConstant(SlideAmt, DL, XLenVT),
+ TrueMask, VL);
+ return convertFromScalableVector(VT, SlideDown, DAG, Subtarget);
+ }
+
+ // Detect an interleave shuffle and lower to
+ // (vmaccu.vx (vwaddu.vx lohalf(V1), lohalf(V2)), lohalf(V2), (2^eltbits - 1))
+ bool SwapSources;
+ if (isInterleaveShuffle(Mask, VT, SwapSources, Subtarget)) {
+ // Swap sources if needed.
+ if (SwapSources)
+ std::swap(V1, V2);
+
+ // Extract the lower half of the vectors.
+ MVT HalfVT = VT.getHalfNumVectorElementsVT();
+ V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
+ DAG.getConstant(0, DL, XLenVT));
+ V2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V2,
+ DAG.getConstant(0, DL, XLenVT));
+
+ // Double the element width and halve the number of elements in an int type.
+ unsigned EltBits = VT.getScalarSizeInBits();
+ MVT WideIntEltVT = MVT::getIntegerVT(EltBits * 2);
+ MVT WideIntVT =
+ MVT::getVectorVT(WideIntEltVT, VT.getVectorNumElements() / 2);
+ // Convert this to a scalable vector. We need to base this on the
+ // destination size to ensure there's always a type with a smaller LMUL.
+ MVT WideIntContainerVT =
+ getContainerForFixedLengthVector(DAG, WideIntVT, Subtarget);
+
+ // Convert sources to scalable vectors with the same element count as the
+ // larger type.
+ MVT HalfContainerVT = MVT::getVectorVT(
+ VT.getVectorElementType(), WideIntContainerVT.getVectorElementCount());
+ V1 = convertToScalableVector(HalfContainerVT, V1, DAG, Subtarget);
+ V2 = convertToScalableVector(HalfContainerVT, V2, DAG, Subtarget);
+
+ // Cast sources to integer.
+ MVT IntEltVT = MVT::getIntegerVT(EltBits);
+ MVT IntHalfVT =
+ MVT::getVectorVT(IntEltVT, HalfContainerVT.getVectorElementCount());
+ V1 = DAG.getBitcast(IntHalfVT, V1);
+ V2 = DAG.getBitcast(IntHalfVT, V2);
+
+ // Freeze V2 since we use it twice and we need to be sure that the add and
+ // multiply see the same value.
+ V2 = DAG.getNode(ISD::FREEZE, DL, IntHalfVT, V2);
+
+ // Recreate TrueMask using the widened type's element count.
+ MVT MaskVT =
+ MVT::getVectorVT(MVT::i1, HalfContainerVT.getVectorElementCount());
+ TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, MaskVT, VL);
+
+ // Widen V1 and V2 with 0s and add one copy of V2 to V1.
+ SDValue Add = DAG.getNode(RISCVISD::VWADDU_VL, DL, WideIntContainerVT, V1,
+ V2, TrueMask, VL);
+ // Create 2^eltbits - 1 copies of V2 by multiplying by the largest integer.
+ SDValue Multiplier = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, IntHalfVT,
+ DAG.getAllOnesConstant(DL, XLenVT));
+ SDValue WidenMul = DAG.getNode(RISCVISD::VWMULU_VL, DL, WideIntContainerVT,
+ V2, Multiplier, TrueMask, VL);
+ // Add the new copies to our previous addition giving us 2^eltbits copies of
+ // V2. This is equivalent to shifting V2 left by eltbits. This should
+ // combine with the vwmulu.vv above to form vwmaccu.vv.
+ Add = DAG.getNode(RISCVISD::ADD_VL, DL, WideIntContainerVT, Add, WidenMul,
+ TrueMask, VL);
+ // Cast back to ContainerVT. We need to re-create a new ContainerVT in case
+ // WideIntContainerVT is a larger fractional LMUL than implied by the fixed
+ // vector VT.
+ ContainerVT =
+ MVT::getVectorVT(VT.getVectorElementType(),
+ WideIntContainerVT.getVectorElementCount() * 2);
+ Add = DAG.getBitcast(ContainerVT, Add);
+ return convertFromScalableVector(VT, Add, DAG, Subtarget);
+ }
+
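
The lowering above relies on an arithmetic identity in the widened element type: zext(a) + zext(b) + (2^eltbits - 1) * zext(b) = zext(a) + 2^eltbits * zext(b), which places a in the low half and b in the high half of each wide element, i.e. one interleaved (a, b) pair. A quick numeric check of the identity for 8-bit elements widened to 16 bits (a sketch):

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned EltBits = 8;
      for (unsigned a = 0; a < 256; a += 17) {
        for (unsigned b = 0; b < 256; b += 23) {
          // vwaddu.vv: widening add of the two zero-extended elements.
          uint16_t Add = static_cast<uint16_t>(a + b);
          // vwmaccu with the all-ones multiplier adds (2^eltbits - 1) * b more.
          uint16_t Result = static_cast<uint16_t>(Add + ((1u << EltBits) - 1) * b);
          // Net effect: a in the low byte, b in the high byte of the wide element.
          assert(Result == static_cast<uint16_t>(a | (b << EltBits)));
        }
      }
      return 0;
    }
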
// Detect shuffles which can be re-expressed as vector selects; these are
// shuffles in which each element in the destination is taken from an element
// at the corresponding index in either source vector.
- bool IsSelect = all_of(enumerate(SVN->getMask()), [&](const auto &MaskIdx) {
+ bool IsSelect = all_of(enumerate(Mask), [&](const auto &MaskIdx) {
int MaskIndex = MaskIdx.value();
return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts;
});
@@ -2353,7 +2631,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
// Now construct the mask that will be used by the vselect or blended
// vrgather operation. For vrgathers, construct the appropriate indices into
// each vector.
- for (int MaskIndex : SVN->getMask()) {
+ for (int MaskIndex : Mask) {
bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ InvertMask;
MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
if (!IsSelect) {
@@ -2691,15 +2969,25 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
MVT VT = Op.getSimpleValueType();
assert(VT == Subtarget.getXLenVT() && "Unexpected custom legalization");
SDLoc DL(Op);
- if (Op.getOperand(2).getOpcode() == ISD::Constant)
- return Op;
// FSL/FSR take a log2(XLen)+1 bit shift amount but XLenVT FSHL/FSHR only
- // use log(XLen) bits. Mask the shift amount accordingly.
+ // use log2(XLen) bits. Mask the shift amount accordingly to prevent
+ // accidentally setting the extra bit.
unsigned ShAmtWidth = Subtarget.getXLen() - 1;
SDValue ShAmt = DAG.getNode(ISD::AND, DL, VT, Op.getOperand(2),
DAG.getConstant(ShAmtWidth, DL, VT));
- unsigned Opc = Op.getOpcode() == ISD::FSHL ? RISCVISD::FSL : RISCVISD::FSR;
- return DAG.getNode(Opc, DL, VT, Op.getOperand(0), Op.getOperand(1), ShAmt);
+ // fshl and fshr concatenate their operands in the same order. fsr and fsl
+ // instructions use different orders. fshl will return its first operand for
+ // a shift of zero, fshr will return its second operand. fsl and fsr both
+ // return rs1 so the ISD nodes need to have different operand orders.
+ // Shift amount is in rs2.
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ unsigned Opc = RISCVISD::FSL;
+ if (Op.getOpcode() == ISD::FSHR) {
+ std::swap(Op0, Op1);
+ Opc = RISCVISD::FSR;
+ }
+ return DAG.getNode(Opc, DL, VT, Op0, Op1, ShAmt);
}
case ISD::TRUNCATE: {
SDLoc DL(Op);
@@ -2774,7 +3062,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
// We define our scalable vector types for lmul=1 to use a 64 bit known
// minimum size. e.g. <vscale x 2 x i32>. VLENB is in bytes so we calculate
// vscale as VLENB / 8.
- assert(RISCV::RVVBitsPerBlock == 64 && "Unexpected bits per block!");
+ static_assert(RISCV::RVVBitsPerBlock == 64, "Unexpected bits per block!");
if (isa<ConstantSDNode>(Op.getOperand(0))) {
// We assume VLENB is a multiple of 8. We manually choose the best shift
// here because SimplifyDemandedBits isn't always able to simplify it.
@@ -3001,7 +3289,7 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
}
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT:
- return lowerFP_TO_INT_SAT(Op, DAG);
+ return lowerFP_TO_INT_SAT(Op, DAG, Subtarget);
case ISD::FTRUNC:
case ISD::FCEIL:
case ISD::FFLOOR:
@@ -3063,9 +3351,14 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
unsigned NumOpElts =
Op.getOperand(0).getSimpleValueType().getVectorMinNumElements();
SDValue Vec = DAG.getUNDEF(VT);
- for (const auto &OpIdx : enumerate(Op->ops()))
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Vec, OpIdx.value(),
+ for (const auto &OpIdx : enumerate(Op->ops())) {
+ SDValue SubVec = OpIdx.value();
+ // Don't insert undef subvectors.
+ if (SubVec.isUndef())
+ continue;
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Vec, SubVec,
DAG.getIntPtrConstant(OpIdx.index() * NumOpElts, DL));
+ }
return Vec;
}
case ISD::LOAD:
@@ -3181,6 +3474,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
return lowerSET_ROUNDING(Op, DAG);
case ISD::VP_SELECT:
return lowerVPOp(Op, DAG, RISCVISD::VSELECT_VL);
+ case ISD::VP_MERGE:
+ return lowerVPOp(Op, DAG, RISCVISD::VP_MERGE_VL);
case ISD::VP_ADD:
return lowerVPOp(Op, DAG, RISCVISD::ADD_VL);
case ISD::VP_SUB:
@@ -4044,10 +4339,10 @@ static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG,
const RISCVVIntrinsicsTable::RISCVVIntrinsicInfo *II =
RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntNo);
- if (!II || !II->SplatOperand)
+ if (!II || !II->hasSplatOperand())
return SDValue();
- unsigned SplatOp = II->SplatOperand + HasChain;
+ unsigned SplatOp = II->SplatOperand + 1 + HasChain;
assert(SplatOp < Op.getNumOperands());
SmallVector<SDValue, 8> Operands(Op->op_begin(), Op->op_end());
@@ -4077,7 +4372,7 @@ static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG,
// that a widening operation never uses SEW=64.
// NOTE: If this fails the below assert, we can probably just find the
// element count from any operand or result and use it to construct the VT.
- assert(II->SplatOperand > 1 && "Unexpected splat operand!");
+ assert(II->SplatOperand > 0 && "Unexpected splat operand!");
MVT VT = Op.getOperand(SplatOp - 1).getSimpleValueType();
// The more complex case is when the scalar is larger than XLenVT.
@@ -4096,8 +4391,7 @@ static SDValue lowerVectorIntrinsicSplats(SDValue Op, SelectionDAG &DAG,
// We need to convert the scalar to a splat vector.
// FIXME: Can we implicitly truncate the scalar if it is known to
// be sign extended?
- // VL should be the last operand.
- SDValue VL = Op.getOperand(Op.getNumOperands() - 1);
+ SDValue VL = getVLOperand(Op);
assert(VL.getValueType() == XLenVT);
ScalarOp = splatSplitI64WithVL(DL, VT, ScalarOp, VL, DAG);
return DAG.getNode(Op->getOpcode(), DL, Op->getVTList(), Operands);
@@ -4138,6 +4432,15 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
: RISCVISD::BDECOMPRESS;
return DAG.getNode(Opc, DL, XLenVT, Op.getOperand(1), Op.getOperand(2));
}
+ case Intrinsic::riscv_bfp:
+ return DAG.getNode(RISCVISD::BFP, DL, XLenVT, Op.getOperand(1),
+ Op.getOperand(2));
+ case Intrinsic::riscv_fsl:
+ return DAG.getNode(RISCVISD::FSL, DL, XLenVT, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::riscv_fsr:
+ return DAG.getNode(RISCVISD::FSR, DL, XLenVT, Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
case Intrinsic::riscv_vmv_x_s:
assert(Op.getValueType() == XLenVT && "Unexpected VT!");
return DAG.getNode(RISCVISD::VMV_X_S, DL, Op.getValueType(),
@@ -4176,7 +4479,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// vmerge.vvm vDest, vSrc, vVal, mMask
MVT VT = Op.getSimpleValueType();
SDValue Vec = Op.getOperand(1);
- SDValue VL = Op.getOperand(3);
+ SDValue VL = getVLOperand(Op);
SDValue SplattedVal = splatSplitI64WithVL(DL, VT, Scalar, VL, DAG);
SDValue SplattedIdx = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, VT,
@@ -4222,7 +4525,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getConstant(1, DL, XLenVT));
// Double the VL since we halved SEW.
- SDValue VL = Op.getOperand(NumOps - (1 + OpOffset));
+ SDValue VL = getVLOperand(Op);
SDValue I32VL =
DAG.getNode(ISD::SHL, DL, XLenVT, VL, DAG.getConstant(1, DL, XLenVT));
@@ -4294,7 +4597,9 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
auto *Load = cast<MemIntrinsicSDNode>(Op);
SmallVector<SDValue, 8> Ops{Load->getChain(), IntID};
- if (!IsUnmasked)
+ if (IsUnmasked)
+ Ops.push_back(DAG.getUNDEF(ContainerVT));
+ else
Ops.push_back(PassThru);
Ops.push_back(Op.getOperand(3)); // Ptr
Ops.push_back(Op.getOperand(4)); // Stride
@@ -4720,7 +5025,7 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
// register size. Therefore we must slide the vector group up the full
// amount.
if (SubVecVT.isFixedLengthVector()) {
- if (OrigIdx == 0 && Vec.isUndef())
+ if (OrigIdx == 0 && Vec.isUndef() && !VecVT.isFixedLengthVector())
return Op;
MVT ContainerVT = VecVT;
if (VecVT.isFixedLengthVector()) {
@@ -4730,6 +5035,10 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT,
DAG.getUNDEF(ContainerVT), SubVec,
DAG.getConstant(0, DL, XLenVT));
+ if (OrigIdx == 0 && Vec.isUndef() && VecVT.isFixedLengthVector()) {
+ SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget);
+ return DAG.getBitcast(Op.getValueType(), SubVec);
+ }
SDValue Mask =
getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first;
// Set the vector length to only the number of elements we care about. Note
@@ -5148,7 +5457,9 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op,
unsigned IntID =
IsUnmasked ? Intrinsic::riscv_vle : Intrinsic::riscv_vle_mask;
SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
- if (!IsUnmasked)
+ if (IsUnmasked)
+ Ops.push_back(DAG.getUNDEF(ContainerVT));
+ else
Ops.push_back(PassThru);
Ops.push_back(BasePtr);
if (!IsUnmasked)
@@ -5518,13 +5829,20 @@ SDValue RISCVTargetLowering::lowerMaskedGather(SDValue Op,
}
}
+ if (XLenVT == MVT::i32 && IndexVT.getVectorElementType().bitsGT(XLenVT)) {
+ IndexVT = IndexVT.changeVectorElementType(XLenVT);
+ Index = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
+ }
+
if (!VL)
VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
unsigned IntID =
IsUnmasked ? Intrinsic::riscv_vluxei : Intrinsic::riscv_vluxei_mask;
SmallVector<SDValue, 8> Ops{Chain, DAG.getTargetConstant(IntID, DL, XLenVT)};
- if (!IsUnmasked)
+ if (IsUnmasked)
+ Ops.push_back(DAG.getUNDEF(ContainerVT));
+ else
Ops.push_back(PassThru);
Ops.push_back(BasePtr);
Ops.push_back(Index);
@@ -5619,6 +5937,11 @@ SDValue RISCVTargetLowering::lowerMaskedScatter(SDValue Op,
}
}
+ if (XLenVT == MVT::i32 && IndexVT.getVectorElementType().bitsGT(XLenVT)) {
+ IndexVT = IndexVT.changeVectorElementType(XLenVT);
+ Index = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
+ }
+
if (!VL)
VL = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).second;
@@ -5697,6 +6020,39 @@ SDValue RISCVTargetLowering::lowerSET_ROUNDING(SDValue Op,
RMValue);
}
+static RISCVISD::NodeType getRISCVWOpcodeByIntr(unsigned IntNo) {
+ switch (IntNo) {
+ default:
+ llvm_unreachable("Unexpected Intrinsic");
+ case Intrinsic::riscv_grev:
+ return RISCVISD::GREVW;
+ case Intrinsic::riscv_gorc:
+ return RISCVISD::GORCW;
+ case Intrinsic::riscv_bcompress:
+ return RISCVISD::BCOMPRESSW;
+ case Intrinsic::riscv_bdecompress:
+ return RISCVISD::BDECOMPRESSW;
+ case Intrinsic::riscv_bfp:
+ return RISCVISD::BFPW;
+ case Intrinsic::riscv_fsl:
+ return RISCVISD::FSLW;
+ case Intrinsic::riscv_fsr:
+ return RISCVISD::FSRW;
+ }
+}
+
+// Converts the given intrinsic to an i64 operation with any extension.
+static SDValue customLegalizeToWOpByIntr(SDNode *N, SelectionDAG &DAG,
+ unsigned IntNo) {
+ SDLoc DL(N);
+ RISCVISD::NodeType WOpcode = getRISCVWOpcodeByIntr(IntNo);
+ SDValue NewOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewOp2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
+ SDValue NewRes = DAG.getNode(WOpcode, DL, MVT::i64, NewOp1, NewOp2);
+ // ReplaceNodeResults requires we maintain the same type for the return value.
+ return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewRes);
+}
+
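
customLegalizeToWOpByIntr follows the usual RV64 pattern for 32-bit operations: any-extend the i32 inputs to i64, perform the *W form of the operation (which only depends on the low 32 bits of its inputs and sign-extends its result), then truncate back to i32. A generic illustration of why that is safe, using an ordinary 32-bit addition as a stand-in for the W opcode (a sketch, not the GREVW/GORCW semantics themselves):

    #include <cassert>
    #include <cstdint>

    // Stand-in for a *W operation: it reads only the low 32 bits of its i64
    // inputs, so the garbage introduced by any-extension does not matter, and it
    // sign-extends its 32-bit result like the real W instructions.
    static uint64_t addw(uint64_t A, uint64_t B) {
      uint32_t Low = static_cast<uint32_t>(A) + static_cast<uint32_t>(B);
      return static_cast<uint64_t>(static_cast<int64_t>(static_cast<int32_t>(Low)));
    }

    int main() {
      uint32_t X = 0x89abcdefu, Y = 0x12345678u;
      // "Any-extend": the upper 32 bits may hold arbitrary values.
      uint64_t WideX = 0xdeadbeef00000000ull | X;
      uint64_t WideY = 0xcafef00d00000000ull | Y;
      // Truncating the W-op result recovers the plain 32-bit result.
      assert(static_cast<uint32_t>(addw(WideX, WideY)) == X + Y);
      return 0;
    }
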
// Returns the opcode of the target-specific SDNode that implements the 32-bit
// form of the given Opcode.
static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
@@ -5776,17 +6132,20 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
if (!isTypeLegal(Op0.getValueType()))
return;
if (IsStrict) {
- unsigned Opc = IsSigned ? RISCVISD::STRICT_FCVT_W_RTZ_RV64
- : RISCVISD::STRICT_FCVT_WU_RTZ_RV64;
+ unsigned Opc = IsSigned ? RISCVISD::STRICT_FCVT_W_RV64
+ : RISCVISD::STRICT_FCVT_WU_RV64;
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
- SDValue Res = DAG.getNode(Opc, DL, VTs, N->getOperand(0), Op0);
+ SDValue Res = DAG.getNode(
+ Opc, DL, VTs, N->getOperand(0), Op0,
+ DAG.getTargetConstant(RISCVFPRndMode::RTZ, DL, MVT::i64));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
Results.push_back(Res.getValue(1));
return;
}
- unsigned Opc =
- IsSigned ? RISCVISD::FCVT_W_RTZ_RV64 : RISCVISD::FCVT_WU_RTZ_RV64;
- SDValue Res = DAG.getNode(Opc, DL, MVT::i64, Op0);
+ unsigned Opc = IsSigned ? RISCVISD::FCVT_W_RV64 : RISCVISD::FCVT_WU_RV64;
+ SDValue Res =
+ DAG.getNode(Opc, DL, MVT::i64, Op0,
+ DAG.getTargetConstant(RISCVFPRndMode::RTZ, DL, MVT::i64));
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
return;
}
@@ -6078,15 +6437,23 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(0));
SDValue NewOp1 =
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
- SDValue NewOp2 =
+ SDValue NewShAmt =
DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
// FSLW/FSRW take a 6 bit shift amount but i32 FSHL/FSHR only use 5 bits.
- // Mask the shift amount to 5 bits.
- NewOp2 = DAG.getNode(ISD::AND, DL, MVT::i64, NewOp2,
- DAG.getConstant(0x1f, DL, MVT::i64));
- unsigned Opc =
- N->getOpcode() == ISD::FSHL ? RISCVISD::FSLW : RISCVISD::FSRW;
- SDValue NewOp = DAG.getNode(Opc, DL, MVT::i64, NewOp0, NewOp1, NewOp2);
+ // Mask the shift amount to 5 bits to prevent accidentally setting bit 5.
+ NewShAmt = DAG.getNode(ISD::AND, DL, MVT::i64, NewShAmt,
+ DAG.getConstant(0x1f, DL, MVT::i64));
+ // fshl and fshr concatenate their operands in the same order. fsrw and fslw
+ // instructions use different orders. fshl will return its first operand for
+ // a shift of zero, fshr will return its second operand. fsl and fsr both
+ // return rs1 so the ISD nodes need to have different operand orders.
+ // Shift amount is in rs2.
+ unsigned Opc = RISCVISD::FSLW;
+ if (N->getOpcode() == ISD::FSHR) {
+ std::swap(NewOp0, NewOp1);
+ Opc = RISCVISD::FSRW;
+ }
+ SDValue NewOp = DAG.getNode(Opc, DL, MVT::i64, NewOp0, NewOp1, NewShAmt);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, NewOp));
break;
}
@@ -6154,6 +6521,31 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
default:
llvm_unreachable(
"Don't know how to custom type legalize this intrinsic!");
+ case Intrinsic::riscv_grev:
+ case Intrinsic::riscv_gorc:
+ case Intrinsic::riscv_bcompress:
+ case Intrinsic::riscv_bdecompress:
+ case Intrinsic::riscv_bfp: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ Results.push_back(customLegalizeToWOpByIntr(N, DAG, IntNo));
+ break;
+ }
+ case Intrinsic::riscv_fsl:
+ case Intrinsic::riscv_fsr: {
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ SDValue NewOp1 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
+ SDValue NewOp2 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
+ SDValue NewOp3 =
+ DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(3));
+ unsigned Opc = getRISCVWOpcodeByIntr(IntNo);
+ SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp1, NewOp2, NewOp3);
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
+ break;
+ }
case Intrinsic::riscv_orc_b: {
// Lower to the GORCI encoding for orc.b with the operand extended.
SDValue NewOp =
@@ -6166,20 +6558,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
return;
}
- case Intrinsic::riscv_grev:
- case Intrinsic::riscv_gorc: {
- assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
- "Unexpected custom legalisation");
- SDValue NewOp1 =
- DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
- SDValue NewOp2 =
- DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
- unsigned Opc =
- IntNo == Intrinsic::riscv_grev ? RISCVISD::GREVW : RISCVISD::GORCW;
- SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp1, NewOp2);
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
- break;
- }
case Intrinsic::riscv_shfl:
case Intrinsic::riscv_unshfl: {
assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
@@ -6200,21 +6578,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
break;
}
- case Intrinsic::riscv_bcompress:
- case Intrinsic::riscv_bdecompress: {
- assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
- "Unexpected custom legalisation");
- SDValue NewOp1 =
- DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(1));
- SDValue NewOp2 =
- DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, N->getOperand(2));
- unsigned Opc = IntNo == Intrinsic::riscv_bcompress
- ? RISCVISD::BCOMPRESSW
- : RISCVISD::BDECOMPRESSW;
- SDValue Res = DAG.getNode(Opc, DL, MVT::i64, NewOp1, NewOp2);
- Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
- break;
- }
case Intrinsic::riscv_vmv_x_s: {
EVT VT = N->getValueType(0);
MVT XLenVT = Subtarget.getXLenVT();
@@ -6923,9 +7286,14 @@ static SDValue performANY_EXTENDCombine(SDNode *N,
// Try to form VWMUL or VWMULU.
// FIXME: Support VWMULSU.
-static SDValue combineMUL_VLToVWMUL(SDNode *N, SDValue Op0, SDValue Op1,
- SelectionDAG &DAG) {
+static SDValue combineMUL_VLToVWMUL_VL(SDNode *N, SelectionDAG &DAG,
+ bool Commute) {
assert(N->getOpcode() == RISCVISD::MUL_VL && "Unexpected opcode");
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ if (Commute)
+ std::swap(Op0, Op1);
+
bool IsSignExt = Op0.getOpcode() == RISCVISD::VSEXT_VL;
bool IsZeroExt = Op0.getOpcode() == RISCVISD::VZEXT_VL;
if ((!IsSignExt && !IsZeroExt) || !Op0.hasOneUse())
@@ -7002,6 +7370,123 @@ static SDValue combineMUL_VLToVWMUL(SDNode *N, SDValue Op0, SDValue Op1,
return DAG.getNode(WMulOpc, DL, VT, Op0, Op1, Mask, VL);
}
+static RISCVFPRndMode::RoundingMode matchRoundingOp(SDValue Op) {
+ switch (Op.getOpcode()) {
+ case ISD::FROUNDEVEN: return RISCVFPRndMode::RNE;
+ case ISD::FTRUNC: return RISCVFPRndMode::RTZ;
+ case ISD::FFLOOR: return RISCVFPRndMode::RDN;
+ case ISD::FCEIL: return RISCVFPRndMode::RUP;
+ case ISD::FROUND: return RISCVFPRndMode::RMM;
+ }
+
+ return RISCVFPRndMode::Invalid;
+}
+
+// Fold
+// (fp_to_int (froundeven X)) -> fcvt X, rne
+// (fp_to_int (ftrunc X)) -> fcvt X, rtz
+// (fp_to_int (ffloor X)) -> fcvt X, rdn
+// (fp_to_int (fceil X)) -> fcvt X, rup
+// (fp_to_int (fround X)) -> fcvt X, rmm
+static SDValue performFP_TO_INTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ // Only handle XLen or i32 types. Other types narrower than XLen will
+ // eventually be legalized to XLenVT.
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32 && VT != XLenVT)
+ return SDValue();
+
+ SDValue Src = N->getOperand(0);
+
+ // Ensure the FP type is also legal.
+ if (!TLI.isTypeLegal(Src.getValueType()))
+ return SDValue();
+
+ // Don't do this for f16 with Zfhmin but not Zfh.
+ if (Src.getValueType() == MVT::f16 && !Subtarget.hasStdExtZfh())
+ return SDValue();
+
+ RISCVFPRndMode::RoundingMode FRM = matchRoundingOp(Src);
+ if (FRM == RISCVFPRndMode::Invalid)
+ return SDValue();
+
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+ unsigned Opc;
+ if (VT == XLenVT)
+ Opc = IsSigned ? RISCVISD::FCVT_X : RISCVISD::FCVT_XU;
+ else
+ Opc = IsSigned ? RISCVISD::FCVT_W_RV64 : RISCVISD::FCVT_WU_RV64;
+
+ SDLoc DL(N);
+ SDValue FpToInt = DAG.getNode(Opc, DL, XLenVT, Src.getOperand(0),
+ DAG.getTargetConstant(FRM, DL, XLenVT));
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, FpToInt);
+}
+
+// Fold
+// (fp_to_int_sat (froundeven X)) -> (select X == nan, 0, (fcvt X, rne))
+// (fp_to_int_sat (ftrunc X)) -> (select X == nan, 0, (fcvt X, rtz))
+// (fp_to_int_sat (ffloor X)) -> (select X == nan, 0, (fcvt X, rdn))
+// (fp_to_int_sat (fceil X)) -> (select X == nan, 0, (fcvt X, rup))
+// (fp_to_int_sat (fround X)) -> (select X == nan, 0, (fcvt X, rmm))
+static SDValue performFP_TO_INT_SATCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const RISCVSubtarget &Subtarget) {
+ SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT XLenVT = Subtarget.getXLenVT();
+
+ // Only handle XLen types. Other types narrower than XLen will eventually be
+ // legalized to XLenVT.
+ EVT DstVT = N->getValueType(0);
+ if (DstVT != XLenVT)
+ return SDValue();
+
+ SDValue Src = N->getOperand(0);
+
+ // Ensure the FP type is also legal.
+ if (!TLI.isTypeLegal(Src.getValueType()))
+ return SDValue();
+
+ // Don't do this for f16 with Zfhmin and not Zfh.
+ if (Src.getValueType() == MVT::f16 && !Subtarget.hasStdExtZfh())
+ return SDValue();
+
+ EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+
+ RISCVFPRndMode::RoundingMode FRM = matchRoundingOp(Src);
+ if (FRM == RISCVFPRndMode::Invalid)
+ return SDValue();
+
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT_SAT;
+
+ unsigned Opc;
+ if (SatVT == DstVT)
+ Opc = IsSigned ? RISCVISD::FCVT_X : RISCVISD::FCVT_XU;
+ else if (DstVT == MVT::i64 && SatVT == MVT::i32)
+ Opc = IsSigned ? RISCVISD::FCVT_W_RV64 : RISCVISD::FCVT_WU_RV64;
+ else
+ return SDValue();
+ // FIXME: Support other SatVTs by clamping before or after the conversion.
+
+ Src = Src.getOperand(0);
+
+ SDLoc DL(N);
+ SDValue FpToInt = DAG.getNode(Opc, DL, XLenVT, Src,
+ DAG.getTargetConstant(FRM, DL, XLenVT));
+
+ // RISCV FP-to-int conversions saturate to the destination register size, but
+ // don't produce 0 for nan.
+ SDValue ZeroInt = DAG.getConstant(0, DL, DstVT);
+ return DAG.getSelectCC(DL, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
+}
+
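
The two combines above fold the explicit rounding node into the conversion by picking the matching static rounding mode for fcvt (rne/rtz/rdn/rup/rmm). The underlying equivalence is that converting floor(x), ceil(x), trunc(x), etc. to an integer gives the same value as converting x directly with the corresponding rounding mode. A portable sanity check of the floor/RDN case using the C floating-point environment (a sketch; it assumes the host honours fesetround):

    #include <cassert>
    #include <cfenv>
    #include <cmath>

    int main() {
      const double Vals[] = {2.5, -2.5, 3.7, -3.7, 0.49};
      for (double X : Vals) {
        // (fp_to_int (ffloor X)) ...
        long ViaFloor = static_cast<long>(std::floor(X));
        // ... equals a single conversion with round-down (RDN) rounding.
        std::fesetround(FE_DOWNWARD);
        long ViaRdn = std::lrint(X);
        std::fesetround(FE_TONEAREST);
        assert(ViaFloor == ViaRdn);
      }
      return 0;
    }
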
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -7083,25 +7568,6 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
return SDValue(N, 0);
break;
}
- case RISCVISD::FSL:
- case RISCVISD::FSR: {
- // Only the lower log2(Bitwidth)+1 bits of the the shift amount are read.
- unsigned BitWidth = N->getOperand(2).getValueSizeInBits();
- assert(isPowerOf2_32(BitWidth) && "Unexpected bit width");
- if (SimplifyDemandedLowBitsHelper(2, Log2_32(BitWidth) + 1))
- return SDValue(N, 0);
- break;
- }
- case RISCVISD::FSLW:
- case RISCVISD::FSRW: {
- // Only the lower 32 bits of Values and lower 6 bits of shift amount are
- // read.
- if (SimplifyDemandedLowBitsHelper(0, 32) ||
- SimplifyDemandedLowBitsHelper(1, 32) ||
- SimplifyDemandedLowBitsHelper(2, 6))
- return SDValue(N, 0);
- break;
- }
case RISCVISD::GREV:
case RISCVISD::GORC: {
// Only the lower log2(Bitwidth) bits of the the shift amount are read.
@@ -7331,6 +7797,12 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
}
break;
}
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ return performFP_TO_INTCombine(N, DCI, Subtarget);
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ return performFP_TO_INT_SATCombine(N, DCI, Subtarget);
case ISD::FCOPYSIGN: {
EVT VT = N->getValueType(0);
if (!VT.isVector())
@@ -7464,15 +7936,11 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
}
break;
}
- case RISCVISD::MUL_VL: {
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
- if (SDValue V = combineMUL_VLToVWMUL(N, Op0, Op1, DAG))
+ case RISCVISD::MUL_VL:
+ if (SDValue V = combineMUL_VLToVWMUL_VL(N, DAG, /*Commute*/ false))
return V;
- if (SDValue V = combineMUL_VLToVWMUL(N, Op1, Op0, DAG))
- return V;
- return SDValue();
- }
+ // Mul is commutative.
+ return combineMUL_VLToVWMUL_VL(N, DAG, /*Commute*/ true);
case ISD::STORE: {
auto *Store = cast<StoreSDNode>(N);
SDValue Val = Store->getValue();
@@ -7486,12 +7954,12 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
if (VecVT.getVectorElementType() == MemVT) {
SDLoc DL(N);
MVT MaskVT = MVT::getVectorVT(MVT::i1, VecVT.getVectorElementCount());
- return DAG.getStoreVP(Store->getChain(), DL, Src, Store->getBasePtr(),
- DAG.getConstant(1, DL, MaskVT),
- DAG.getConstant(1, DL, Subtarget.getXLenVT()),
- Store->getPointerInfo(),
- Store->getOriginalAlign(),
- Store->getMemOperand()->getFlags());
+ return DAG.getStoreVP(
+ Store->getChain(), DL, Src, Store->getBasePtr(), Store->getOffset(),
+ DAG.getConstant(1, DL, MaskVT),
+ DAG.getConstant(1, DL, Subtarget.getXLenVT()), MemVT,
+ Store->getMemOperand(), Store->getAddressingMode(),
+ Store->isTruncatingStore(), /*IsCompress*/ false);
}
}
@@ -7732,14 +8200,18 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
// We assume VLENB is no more than 65536 / 8 bytes.
Known.Zero.setBitsFrom(14);
break;
- case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = Op.getConstantOperandVal(1);
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntNo =
+ Op.getConstantOperandVal(Opc == ISD::INTRINSIC_WO_CHAIN ? 0 : 1);
switch (IntNo) {
default:
// We can't do anything for most intrinsics.
break;
case Intrinsic::riscv_vsetvli:
case Intrinsic::riscv_vsetvlimax:
+ case Intrinsic::riscv_vsetvli_opt:
+ case Intrinsic::riscv_vsetvlimax_opt:
// Assume that VL output is positive and would fit in an int32_t.
// TODO: VLEN might be capped at 16 bits in a future V spec update.
if (BitWidth >= 32)
@@ -7779,10 +8251,11 @@ unsigned RISCVTargetLowering::ComputeNumSignBitsForTargetNode(
case RISCVISD::UNSHFLW:
case RISCVISD::BCOMPRESSW:
case RISCVISD::BDECOMPRESSW:
- case RISCVISD::FCVT_W_RTZ_RV64:
- case RISCVISD::FCVT_WU_RTZ_RV64:
- case RISCVISD::STRICT_FCVT_W_RTZ_RV64:
- case RISCVISD::STRICT_FCVT_WU_RTZ_RV64:
+ case RISCVISD::BFPW:
+ case RISCVISD::FCVT_W_RV64:
+ case RISCVISD::FCVT_WU_RV64:
+ case RISCVISD::STRICT_FCVT_W_RV64:
+ case RISCVISD::STRICT_FCVT_WU_RV64:
// TODO: As the result is sign-extended, this is conservatively correct. A
// more precise answer could be calculated for SRAW depending on known
// bits in the shift amount.
@@ -7958,6 +8431,42 @@ static bool isSelectPseudo(MachineInstr &MI) {
}
}
+static MachineBasicBlock *emitQuietFCMP(MachineInstr &MI, MachineBasicBlock *BB,
+ unsigned RelOpcode, unsigned EqOpcode,
+ const RISCVSubtarget &Subtarget) {
+ DebugLoc DL = MI.getDebugLoc();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src1Reg = MI.getOperand(1).getReg();
+ Register Src2Reg = MI.getOperand(2).getReg();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ Register SavedFFlags = MRI.createVirtualRegister(&RISCV::GPRRegClass);
+ const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo();
+
+ // Save the current FFLAGS.
+ BuildMI(*BB, MI, DL, TII.get(RISCV::ReadFFLAGS), SavedFFlags);
+
+ auto MIB = BuildMI(*BB, MI, DL, TII.get(RelOpcode), DstReg)
+ .addReg(Src1Reg)
+ .addReg(Src2Reg);
+ if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept))
+ MIB->setFlag(MachineInstr::MIFlag::NoFPExcept);
+
+ // Restore the FFLAGS.
+ BuildMI(*BB, MI, DL, TII.get(RISCV::WriteFFLAGS))
+ .addReg(SavedFFlags, RegState::Kill);
+
+ // Issue a dummy FEQ opcode to raise the invalid exception for signaling NaNs.
+ auto MIB2 = BuildMI(*BB, MI, DL, TII.get(EqOpcode), RISCV::X0)
+ .addReg(Src1Reg, getKillRegState(MI.getOperand(1).isKill()))
+ .addReg(Src2Reg, getKillRegState(MI.getOperand(2).isKill()));
+ if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept))
+ MIB2->setFlag(MachineInstr::MIFlag::NoFPExcept);
+
+ // Erase the pseudoinstruction.
+ MI.eraseFromParent();
+ return BB;
+}
+
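
emitQuietFCMP implements IEEE "quiet" ordered comparisons on top of RISC-V's flt/fle, which signal on any NaN: it saves FFLAGS, performs the comparison, restores FFLAGS, and then issues a dummy feq, which sets the invalid flag only for signaling NaNs. The behaviour being emulated is the same one C exposes through the isless/islessequal macros, e.g. (a sketch; it assumes the host FPU and compiler honour the floating-point environment, as common x86-64 and AArch64 toolchains do):

    #include <cassert>
    #include <cfenv>
    #include <cmath>
    #include <limits>

    int main() {
      volatile double One = 1.0;
      volatile double QNaN = std::numeric_limits<double>::quiet_NaN();

      // The raw relational operator is a signaling comparison: it sets the
      // invalid flag even for a quiet NaN operand (like flt/fle).
      std::feclearexcept(FE_ALL_EXCEPT);
      (void)(One < QNaN);
      assert(std::fetestexcept(FE_INVALID));

      // isless is the quiet comparison the pseudo expands to: no invalid flag
      // for quiet NaNs, only for signaling NaNs (hence the dummy feq).
      std::feclearexcept(FE_ALL_EXCEPT);
      (void)std::isless(One, QNaN);
      assert(!std::fetestexcept(FE_INVALID));
      return 0;
    }
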
static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
MachineBasicBlock *BB,
const RISCVSubtarget &Subtarget) {
@@ -8099,6 +8608,18 @@ RISCVTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return emitBuildPairF64Pseudo(MI, BB);
case RISCV::SplitF64Pseudo:
return emitSplitF64Pseudo(MI, BB);
+ case RISCV::PseudoQuietFLE_H:
+ return emitQuietFCMP(MI, BB, RISCV::FLE_H, RISCV::FEQ_H, Subtarget);
+ case RISCV::PseudoQuietFLT_H:
+ return emitQuietFCMP(MI, BB, RISCV::FLT_H, RISCV::FEQ_H, Subtarget);
+ case RISCV::PseudoQuietFLE_S:
+ return emitQuietFCMP(MI, BB, RISCV::FLE_S, RISCV::FEQ_S, Subtarget);
+ case RISCV::PseudoQuietFLT_S:
+ return emitQuietFCMP(MI, BB, RISCV::FLT_S, RISCV::FEQ_S, Subtarget);
+ case RISCV::PseudoQuietFLE_D:
+ return emitQuietFCMP(MI, BB, RISCV::FLE_D, RISCV::FEQ_D, Subtarget);
+ case RISCV::PseudoQuietFLT_D:
+ return emitQuietFCMP(MI, BB, RISCV::FLT_D, RISCV::FEQ_D, Subtarget);
}
}
@@ -8393,7 +8914,8 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
LocVT = XLenVT;
LocInfo = CCValAssign::Indirect;
} else if (ValVT.isScalableVector()) {
- report_fatal_error("Unable to pass scalable vector types on the stack");
+ LocVT = XLenVT;
+ LocInfo = CCValAssign::Indirect;
} else {
// Pass fixed-length vectors on the stack.
LocVT = ValVT;
@@ -8592,8 +9114,14 @@ static SDValue unpackFromMemLoc(SelectionDAG &DAG, SDValue Chain,
EVT LocVT = VA.getLocVT();
EVT ValVT = VA.getValVT();
EVT PtrVT = MVT::getIntegerVT(DAG.getDataLayout().getPointerSizeInBits(0));
+ if (ValVT.isScalableVector()) {
+ // When the value is a scalable vector, what is saved on the stack is a
+ // pointer to the scalable vector value, so ValVT should be the pointer
+ // type rather than the scalable vector type.
+ ValVT = LocVT;
+ }
int FI = MFI.CreateFixedObject(ValVT.getStoreSize(), VA.getLocMemOffset(),
- /*Immutable=*/true);
+ /*IsImmutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val;
@@ -8623,7 +9151,8 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain,
if (VA.isMemLoc()) {
// f64 is passed on the stack.
- int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*Immutable=*/true);
+ int FI =
+ MFI.CreateFixedObject(8, VA.getLocMemOffset(), /*IsImmutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
return DAG.getLoad(MVT::f64, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
@@ -8637,7 +9166,7 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain,
SDValue Hi;
if (VA.getLocReg() == RISCV::X17) {
// Second half of f64 is passed on the stack.
- int FI = MFI.CreateFixedObject(4, 0, /*Immutable=*/true);
+ int FI = MFI.CreateFixedObject(4, 0, /*IsImmutable=*/true);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
Hi = DAG.getLoad(MVT::i32, DL, Chain, FIN,
MachinePointerInfo::getFixedStack(MF, FI));
@@ -9510,12 +10039,12 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMV_X_ANYEXTH)
NODE_NAME_CASE(FMV_W_X_RV64)
NODE_NAME_CASE(FMV_X_ANYEXTW_RV64)
- NODE_NAME_CASE(FCVT_X_RTZ)
- NODE_NAME_CASE(FCVT_XU_RTZ)
- NODE_NAME_CASE(FCVT_W_RTZ_RV64)
- NODE_NAME_CASE(FCVT_WU_RTZ_RV64)
- NODE_NAME_CASE(STRICT_FCVT_W_RTZ_RV64)
- NODE_NAME_CASE(STRICT_FCVT_WU_RTZ_RV64)
+ NODE_NAME_CASE(FCVT_X)
+ NODE_NAME_CASE(FCVT_XU)
+ NODE_NAME_CASE(FCVT_W_RV64)
+ NODE_NAME_CASE(FCVT_WU_RV64)
+ NODE_NAME_CASE(STRICT_FCVT_W_RV64)
+ NODE_NAME_CASE(STRICT_FCVT_WU_RV64)
NODE_NAME_CASE(READ_CYCLE_WIDE)
NODE_NAME_CASE(GREV)
NODE_NAME_CASE(GREVW)
@@ -9525,6 +10054,8 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(SHFLW)
NODE_NAME_CASE(UNSHFL)
NODE_NAME_CASE(UNSHFLW)
+ NODE_NAME_CASE(BFP)
+ NODE_NAME_CASE(BFPW)
NODE_NAME_CASE(BCOMPRESS)
NODE_NAME_CASE(BCOMPRESSW)
NODE_NAME_CASE(BDECOMPRESS)
@@ -9598,8 +10129,10 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FP_ROUND_VL)
NODE_NAME_CASE(VWMUL_VL)
NODE_NAME_CASE(VWMULU_VL)
+ NODE_NAME_CASE(VWADDU_VL)
NODE_NAME_CASE(SETCC_VL)
NODE_NAME_CASE(VSELECT_VL)
+ NODE_NAME_CASE(VP_MERGE_VL)
NODE_NAME_CASE(VMAND_VL)
NODE_NAME_CASE(VMOR_VL)
NODE_NAME_CASE(VMXOR_VL)
@@ -9768,12 +10301,18 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
.Default(RISCV::NoRegister);
if (FReg != RISCV::NoRegister) {
assert(RISCV::F0_F <= FReg && FReg <= RISCV::F31_F && "Unknown fp-reg");
- if (Subtarget.hasStdExtD()) {
+ if (Subtarget.hasStdExtD() && (VT == MVT::f64 || VT == MVT::Other)) {
unsigned RegNo = FReg - RISCV::F0_F;
unsigned DReg = RISCV::F0_D + RegNo;
return std::make_pair(DReg, &RISCV::FPR64RegClass);
}
- return std::make_pair(FReg, &RISCV::FPR32RegClass);
+ if (VT == MVT::f32 || VT == MVT::Other)
+ return std::make_pair(FReg, &RISCV::FPR32RegClass);
+ if (Subtarget.hasStdExtZfh() && VT == MVT::f16) {
+ unsigned RegNo = FReg - RISCV::F0_F;
+ unsigned HReg = RISCV::F0_H + RegNo;
+ return std::make_pair(HReg, &RISCV::FPR16RegClass);
+ }
}
}
@@ -10070,6 +10609,24 @@ bool RISCVTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
}
}
+unsigned RISCVTargetLowering::getJumpTableEncoding() const {
+ // If we are using the small code model, we can reduce the size of each
+ // jump table entry to 4 bytes.
+ if (Subtarget.is64Bit() && !isPositionIndependent() &&
+ getTargetMachine().getCodeModel() == CodeModel::Small) {
+ return MachineJumpTableInfo::EK_Custom32;
+ }
+ return TargetLowering::getJumpTableEncoding();
+}
+
+const MCExpr *RISCVTargetLowering::LowerCustomJumpTableEntry(
+ const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB,
+ unsigned uid, MCContext &Ctx) const {
+ assert(Subtarget.is64Bit() && !isPositionIndependent() &&
+ getTargetMachine().getCodeModel() == CodeModel::Small);
+ return MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
+}
+
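As a rough, standalone illustration of why EK_Custom32 helps here (the numbers below are illustrative and not from the patch): under the small code model on non-PIC RV64, code addresses fit in 32 bits, so each jump-table entry can be a 4-byte absolute value rather than a pointer-sized 8-byte entry.

#include <cstdint>
#include <cstdio>

// Minimal sketch, assuming a jump table with N targets: the default non-PIC
// RV64 encoding emits one pointer-sized (8-byte) entry per target, while
// EK_Custom32 emits a 4-byte absolute value, which is only valid when the
// small code model guarantees code addresses fit in 32 bits.
int main() {
  const std::uint64_t NumEntries = 64;
  std::printf("8-byte entries: %llu bytes\n",
              static_cast<unsigned long long>(NumEntries * 8));
  std::printf("4-byte entries: %llu bytes\n",
              static_cast<unsigned long long>(NumEntries * 4));
  return 0;
}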
bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
VT = VT.getScalarType();
@@ -10293,6 +10850,60 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue(
return SDValue();
}
+SDValue
+RISCVTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const {
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (isIntDivCheap(N->getValueType(0), Attr))
+ return SDValue(N, 0); // Lower SDIV as SDIV
+
+ assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
+ "Unexpected divisor!");
+
+ // Conditional move is needed, so do the transformation iff Zbt is enabled.
+ if (!Subtarget.hasStdExtZbt())
+ return SDValue();
+
+ // When |Divisor| >= 2^12, it isn't profitable to do such a transformation.
+ // Besides, more critical-path instructions will be generated when dividing
+ // by 2. So we keep using the original DAGs for these cases.
+ unsigned Lg2 = Divisor.countTrailingZeros();
+ if (Lg2 == 1 || Lg2 >= 12)
+ return SDValue();
+
+ // fold (sdiv X, pow2)
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32 && !(Subtarget.is64Bit() && VT == MVT::i64))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
+
+ // Add (N0 < 0) ? Pow2 - 1 : 0;
+ SDValue Cmp = DAG.getSetCC(DL, VT, N0, Zero, ISD::SETLT);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
+ SDValue Sel = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
+
+ Created.push_back(Cmp.getNode());
+ Created.push_back(Add.getNode());
+ Created.push_back(Sel.getNode());
+
+ // Divide by pow2.
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, DL, VT, Sel, DAG.getConstant(Lg2, DL, VT));
+
+ // If we're dividing by a positive value, we're done. Otherwise, we must
+ // negate the result.
+ if (Divisor.isNonNegative())
+ return SRA;
+
+ Created.push_back(SRA.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
+}
+
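The arithmetic behind this expansion can be checked on the host with a short standalone program (a sketch, not the in-tree code): for a divisor 2^k, adding 2^k - 1 to negative dividends and then arithmetic-shifting right by k reproduces truncating signed division, and negating the result handles negated powers of two.

#include <cassert>
#include <cstdint>

// Standalone check of the select+add+sra expansion built above:
//   sdiv x, 2^k   ==  ((x < 0) ? x + (2^k - 1) : x) >> k   (arithmetic shift)
//   sdiv x, -2^k  ==  -(sdiv x, 2^k)
static int64_t sdivPow2(int64_t X, unsigned Lg2, bool Negate) {
  int64_t Adjusted = X < 0 ? X + ((int64_t(1) << Lg2) - 1) : X;
  int64_t Quot = Adjusted >> Lg2; // arithmetic shift on a signed value
  return Negate ? -Quot : Quot;
}

int main() {
  for (int64_t X = -1000; X <= 1000; ++X)
    for (unsigned Lg2 = 2; Lg2 < 12; ++Lg2) { // matches the Lg2 limits above
      assert(sdivPow2(X, Lg2, /*Negate=*/false) == X / (int64_t(1) << Lg2));
      assert(sdivPow2(X, Lg2, /*Negate=*/true) == X / -(int64_t(1) << Lg2));
    }
  return 0;
}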
#define GET_REGISTER_MATCHER
#include "RISCVGenAsmMatcher.inc"
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 48c5ce730933..58b7ec89f875 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -63,11 +63,11 @@ enum NodeType : unsigned {
CLZW,
CTZW,
// RV64IB/RV32IB funnel shifts, with the semantics of the named RISC-V
- // instructions, but the same operand order as fshl/fshr intrinsics.
+ // instructions. Operand order is rs1, rs3, rs2/shamt.
FSR,
FSL,
- // RV64IB funnel shifts, with the semantics of the named RISC-V instructions,
- // but the same operand order as fshl/fshr intrinsics.
+ // RV64IB funnel shifts, with the semantics of the named RISC-V instructions.
+ // Operand order is rs1, rs3, rs2/shamt.
FSRW,
FSLW,
// FPR<->GPR transfer operations when the FPR is smaller than XLEN, needed as
@@ -86,14 +86,16 @@ enum NodeType : unsigned {
FMV_X_ANYEXTW_RV64,
// FP to XLen int conversions. Corresponds to fcvt.l(u).s/d/h on RV64 and
// fcvt.w(u).s/d/h on RV32. Unlike FP_TO_S/UINT these saturate out of
- // range inputs. These are used for FP_TO_S/UINT_SAT lowering.
- FCVT_X_RTZ,
- FCVT_XU_RTZ,
+ // range inputs. These are used for FP_TO_S/UINT_SAT lowering. Rounding mode
+ // is passed as a TargetConstant operand using the RISCVFPRndMode enum.
+ FCVT_X,
+ FCVT_XU,
// FP to 32 bit int conversions for RV64. These are used to keep track of the
// result being sign extended to 64 bit. These saturate out of range inputs.
- // Used for FP_TO_S/UINT and FP_TO_S/UINT_SAT lowering.
- FCVT_W_RTZ_RV64,
- FCVT_WU_RTZ_RV64,
+ // Used for FP_TO_S/UINT and FP_TO_S/UINT_SAT lowering. Rounding mode
+ // is passed as a TargetConstant operand using the RISCVFPRndMode enum.
+ FCVT_W_RV64,
+ FCVT_WU_RV64,
// READ_CYCLE_WIDE - A read of the 64-bit cycle CSR on a 32-bit target
// (returns (Lo, Hi)). It takes a chain operand.
READ_CYCLE_WIDE,
@@ -118,6 +120,13 @@ enum NodeType : unsigned {
BCOMPRESSW,
BDECOMPRESS,
BDECOMPRESSW,
+ // The bit field place (bfp) instruction places up to XLEN/2 LSB bits from rs2
+ // into the value in rs1. The upper bits of rs2 control the length of the bit
+ // field and target position. The layout of rs2 is chosen in a way that makes
+ // it possible to construct rs2 easily using pack[h] instructions and/or
+ // andi/lui.
+ BFP,
+ BFPW,
// Vector Extension
// VMV_V_X_VL matches the semantics of vmv.v.x but includes an extra operand
// for the VL value to be used for the operation.
@@ -236,6 +245,7 @@ enum NodeType : unsigned {
// Widening instructions
VWMUL_VL,
VWMULU_VL,
+ VWADDU_VL,
// Vector compare producing a mask. Fourth operand is input mask. Fifth
// operand is VL.
@@ -243,6 +253,10 @@ enum NodeType : unsigned {
// Vector select with an additional VL operand. This operation is unmasked.
VSELECT_VL,
+ // Vector select with operand #2 (the value when the condition is false) tied
+ // to the destination and an additional VL operand. This operation is
+ // unmasked.
+ VP_MERGE_VL,
// Mask binary operators.
VMAND_VL,
@@ -284,8 +298,8 @@ enum NodeType : unsigned {
// FP to 32 bit int conversions for RV64. These are used to keep track of the
// result being sign extended to 64 bit. These saturate out of range inputs.
- STRICT_FCVT_W_RTZ_RV64 = ISD::FIRST_TARGET_STRICTFP_OPCODE,
- STRICT_FCVT_WU_RTZ_RV64,
+ STRICT_FCVT_W_RV64 = ISD::FIRST_TARGET_STRICTFP_OPCODE,
+ STRICT_FCVT_WU_RV64,
// Memory opcodes start here.
VLE_VL = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -462,6 +476,8 @@ public:
SelectionDAG &DAG) const override;
SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
+ template <class NodeTy>
+ SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const;
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override {
@@ -524,6 +540,16 @@ public:
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
+ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const override;
+
+ unsigned getJumpTableEncoding() const override;
+
+ const MCExpr *LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned uid,
+ MCContext &Ctx) const override;
+
private:
/// RISCVCCAssignFn - This target-specific function extends the default
/// CCValAssign with additional information used to lower RISC-V calling
@@ -544,9 +570,6 @@ private:
bool IsRet, CallLoweringInfo *CLI,
RISCVCCAssignFn Fn) const;
- template <class NodeTy>
- SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const;
-
SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG,
bool UseGOT) const;
SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const;
@@ -652,6 +675,15 @@ namespace RISCVVIntrinsicsTable {
struct RISCVVIntrinsicInfo {
unsigned IntrinsicID;
uint8_t SplatOperand;
+ uint8_t VLOperand;
+ bool hasSplatOperand() const {
+ // 0xF is not valid. See NoSplatOperand in IntrinsicsRISCV.td.
+ return SplatOperand != 0xF;
+ }
+ bool hasVLOperand() const {
+ // 0x1F is not valid. See NoVLOperand in IntrinsicsRISCV.td.
+ return VLOperand != 0x1F;
+ }
};
using namespace RISCV;
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index dbfc90f36f80..d39e0805a79c 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -59,12 +59,13 @@ class VSETVLIInfo {
uint8_t MaskAgnostic : 1;
uint8_t MaskRegOp : 1;
uint8_t StoreOp : 1;
+ uint8_t ScalarMovOp : 1;
uint8_t SEWLMULRatioOnly : 1;
public:
VSETVLIInfo()
: AVLImm(0), TailAgnostic(false), MaskAgnostic(false), MaskRegOp(false),
- StoreOp(false), SEWLMULRatioOnly(false) {}
+ StoreOp(false), ScalarMovOp(false), SEWLMULRatioOnly(false) {}
static VSETVLIInfo getUnknown() {
VSETVLIInfo Info;
@@ -96,6 +97,18 @@ public:
assert(hasAVLImm());
return AVLImm;
}
+ bool hasZeroAVL() const {
+ if (hasAVLImm())
+ return getAVLImm() == 0;
+ return false;
+ }
+ bool hasNonZeroAVL() const {
+ if (hasAVLImm())
+ return getAVLImm() > 0;
+ if (hasAVLReg())
+ return getAVLReg() == RISCV::X0;
+ return false;
+ }
bool hasSameAVL(const VSETVLIInfo &Other) const {
assert(isValid() && Other.isValid() &&
@@ -120,7 +133,7 @@ public:
MaskAgnostic = RISCVVType::isMaskAgnostic(VType);
}
void setVTYPE(RISCVII::VLMUL L, unsigned S, bool TA, bool MA, bool MRO,
- bool IsStore) {
+ bool IsStore, bool IsScalarMovOp) {
assert(isValid() && !isUnknown() &&
"Can't set VTYPE for uninitialized or unknown");
VLMul = L;
@@ -129,6 +142,7 @@ public:
MaskAgnostic = MA;
MaskRegOp = MRO;
StoreOp = IsStore;
+ ScalarMovOp = IsScalarMovOp;
}
unsigned encodeVTYPE() const {
@@ -139,6 +153,16 @@ public:
bool hasSEWLMULRatioOnly() const { return SEWLMULRatioOnly; }
+ bool hasSameSEW(const VSETVLIInfo &Other) const {
+ assert(isValid() && Other.isValid() &&
+ "Can't compare invalid VSETVLIInfos");
+ assert(!isUnknown() && !Other.isUnknown() &&
+ "Can't compare VTYPE in unknown state");
+ assert(!SEWLMULRatioOnly && !Other.SEWLMULRatioOnly &&
+ "Can't compare when only LMUL/SEW ratio is valid.");
+ return SEW == Other.SEW;
+ }
+
bool hasSameVTYPE(const VSETVLIInfo &Other) const {
assert(isValid() && Other.isValid() &&
"Can't compare invalid VSETVLIInfos");
@@ -178,6 +202,15 @@ public:
return getSEWLMULRatio() == Other.getSEWLMULRatio();
}
+ bool hasSamePolicy(const VSETVLIInfo &Other) const {
+ assert(isValid() && Other.isValid() &&
+ "Can't compare invalid VSETVLIInfos");
+ assert(!isUnknown() && !Other.isUnknown() &&
+ "Can't compare VTYPE in unknown state");
+ return TailAgnostic == Other.TailAgnostic &&
+ MaskAgnostic == Other.MaskAgnostic;
+ }
+
bool hasCompatibleVTYPE(const VSETVLIInfo &InstrInfo, bool Strict) const {
// Simple case, see if full VTYPE matches.
if (hasSameVTYPE(InstrInfo))
@@ -222,6 +255,15 @@ public:
return true;
}
+ // For vmv.s.x and vfmv.s.f, there are only two behaviors: VL = 0 and
+ // VL > 0. So the two states are compatible as long as we can be sure both
+ // VLs are in the same situation (both zero or both non-zero).
+ if (!Strict && InstrInfo.ScalarMovOp && InstrInfo.hasAVLImm() &&
+ ((hasNonZeroAVL() && InstrInfo.hasNonZeroAVL()) ||
+ (hasZeroAVL() && InstrInfo.hasZeroAVL())) &&
+ hasSameSEW(InstrInfo) && hasSamePolicy(InstrInfo))
+ return true;
+
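Restating the rule in a small host-side model (the struct and helper below are illustrative, not the in-tree VSETVLIInfo): a scalar move only distinguishes VL == 0 from VL > 0, so two states that agree on that, and on SEW and the tail/mask policy, can share one vsetvli. An AVL register of x0 requests VLMAX, which is always non-zero.

#include <cassert>

// Illustrative model (not the in-tree VSETVLIInfo): for vmv.s.x/vfmv.s.f,
// only "zero vs. non-zero" AVL matters, plus SEW and the policy bits.
struct SimpleVLState {
  unsigned AVL;       // requested element count
  unsigned SEW;       // element width in bits
  bool TailAgnostic;
  bool MaskAgnostic;

  bool hasNonZeroAVL() const { return AVL > 0; }
};

static bool compatibleForScalarMove(const SimpleVLState &Cur,
                                    const SimpleVLState &New) {
  bool SameZeroness = Cur.hasNonZeroAVL() == New.hasNonZeroAVL();
  return SameZeroness && Cur.SEW == New.SEW &&
         Cur.TailAgnostic == New.TailAgnostic &&
         Cur.MaskAgnostic == New.MaskAgnostic;
}

int main() {
  SimpleVLState A{4, 32, true, false};   // vl=4,  e32
  SimpleVLState B{16, 32, true, false};  // vl=16, e32
  assert(compatibleForScalarMove(A, B)); // both non-zero: reuse the vsetvli
  SimpleVLState Z{0, 32, true, false};   // vl=0
  assert(!compatibleForScalarMove(A, Z));
  return 0;
}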
// The AVL must match.
if (!hasSameAVL(InstrInfo))
return false;
@@ -414,6 +456,36 @@ static MachineInstr *elideCopies(MachineInstr *MI,
}
}
+static bool isScalarMoveInstr(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ case RISCV::PseudoVMV_S_X_M1:
+ case RISCV::PseudoVMV_S_X_M2:
+ case RISCV::PseudoVMV_S_X_M4:
+ case RISCV::PseudoVMV_S_X_M8:
+ case RISCV::PseudoVMV_S_X_MF2:
+ case RISCV::PseudoVMV_S_X_MF4:
+ case RISCV::PseudoVMV_S_X_MF8:
+ case RISCV::PseudoVFMV_S_F16_M1:
+ case RISCV::PseudoVFMV_S_F16_M2:
+ case RISCV::PseudoVFMV_S_F16_M4:
+ case RISCV::PseudoVFMV_S_F16_M8:
+ case RISCV::PseudoVFMV_S_F16_MF2:
+ case RISCV::PseudoVFMV_S_F16_MF4:
+ case RISCV::PseudoVFMV_S_F32_M1:
+ case RISCV::PseudoVFMV_S_F32_M2:
+ case RISCV::PseudoVFMV_S_F32_M4:
+ case RISCV::PseudoVFMV_S_F32_M8:
+ case RISCV::PseudoVFMV_S_F32_MF2:
+ case RISCV::PseudoVFMV_S_F64_M1:
+ case RISCV::PseudoVFMV_S_F64_M2:
+ case RISCV::PseudoVFMV_S_F64_M4:
+ case RISCV::PseudoVFMV_S_F64_M8:
+ return true;
+ }
+}
+
static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
const MachineRegisterInfo *MRI) {
VSETVLIInfo InstrInfo;
@@ -461,6 +533,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
// If there are no explicit defs, this is a store instruction which can
// ignore the tail and mask policies.
bool StoreOp = MI.getNumExplicitDefs() == 0;
+ bool ScalarMovOp = isScalarMoveInstr(MI);
if (RISCVII::hasVLOp(TSFlags)) {
const MachineOperand &VLOp = MI.getOperand(NumOperands - 2);
@@ -477,7 +550,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
} else
InstrInfo.setAVLReg(RISCV::NoRegister);
InstrInfo.setVTYPE(VLMul, SEW, /*TailAgnostic*/ TailAgnostic,
- /*MaskAgnostic*/ false, MaskRegOp, StoreOp);
+ /*MaskAgnostic*/ false, MaskRegOp, StoreOp, ScalarMovOp);
return InstrInfo;
}
@@ -1000,6 +1073,13 @@ void RISCVInsertVSETVLI::emitVSETVLIs(MachineBasicBlock &MBB) {
PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE());
NeedInsertVSETVLI = false;
}
+ if (isScalarMoveInstr(MI) &&
+ ((CurInfo.hasNonZeroAVL() && NewInfo.hasNonZeroAVL()) ||
+ (CurInfo.hasZeroAVL() && NewInfo.hasZeroAVL())) &&
+ NewInfo.hasSameVLMAX(CurInfo)) {
+ PrevVSETVLIMI->getOperand(2).setImm(NewInfo.encodeVTYPE());
+ NeedInsertVSETVLI = false;
+ }
}
if (NeedInsertVSETVLI)
insertVSETVLI(MBB, MI, NewInfo, CurInfo);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index 6a16b6354f95..f99d0f56c406 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -206,6 +206,13 @@ class Pseudo<dag outs, dag ins, list<dag> pattern, string opcodestr = "", string
let isCodeGenOnly = 1;
}
+class PseudoQuietFCMP<RegisterClass Ty>
+ : Pseudo<(outs GPR:$rd), (ins Ty:$rs1, Ty:$rs2), []> {
+ let hasSideEffects = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+
// Pseudo load instructions.
class PseudoLoad<string opcodestr, RegisterClass rdty = GPR>
: Pseudo<(outs rdty:$rd), (ins bare_symbol:$addr), [], opcodestr, "$rd, $addr"> {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 2e2e00886d57..7baed2793e4e 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -201,8 +201,9 @@ static bool isConvertibleToVMV_V_V(const RISCVSubtarget &STI,
if (MBBI->modifiesRegister(RISCV::VL))
return false;
- // Go through all defined operands, including implicit defines.
- for (const MachineOperand &MO : MBBI->operands()) {
+ // Only convert whole register copies to vmv.v.v when the defining value
+ // appears in the explicit operands.
+ for (const MachineOperand &MO : MBBI->explicit_operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
if (!FoundDef && TRI->isSubRegisterEq(MO.getReg(), SrcReg)) {
@@ -914,7 +915,7 @@ void RISCVInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
.addMBB(&DestBB, RISCVII::MO_CALL);
RS->enterBasicBlockEnd(MBB);
- unsigned Scav = RS->scavengeRegisterBackwards(RISCV::GPRRegClass,
+ Register Scav = RS->scavengeRegisterBackwards(RISCV::GPRRegClass,
MI.getIterator(), false, 0);
// TODO: The case when there is no scavenged register needs special handling.
assert(Scav != RISCV::NoRegister && "No register is scavenged!");
@@ -1145,6 +1146,9 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI,
else
Ok = isUInt<5>(Imm);
break;
+ case RISCVOp::OPERAND_RVKRNUM:
+ Ok = Imm >= 0 && Imm <= 10;
+ break;
}
if (!Ok) {
ErrInfo = "Invalid immediate";
@@ -1399,19 +1403,28 @@ MachineBasicBlock::iterator RISCVInstrInfo::insertOutlinedCall(
#define CASE_VFMA_OPCODE_COMMON(OP, TYPE, LMUL) \
RISCV::PseudoV##OP##_##TYPE##_##LMUL
-#define CASE_VFMA_OPCODE_LMULS(OP, TYPE) \
- CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF8): \
- case CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF4): \
- case CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF2): \
- case CASE_VFMA_OPCODE_COMMON(OP, TYPE, M1): \
+#define CASE_VFMA_OPCODE_LMULS_M1(OP, TYPE) \
+ CASE_VFMA_OPCODE_COMMON(OP, TYPE, M1): \
case CASE_VFMA_OPCODE_COMMON(OP, TYPE, M2): \
case CASE_VFMA_OPCODE_COMMON(OP, TYPE, M4): \
case CASE_VFMA_OPCODE_COMMON(OP, TYPE, M8)
+#define CASE_VFMA_OPCODE_LMULS_MF2(OP, TYPE) \
+ CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF2): \
+ case CASE_VFMA_OPCODE_LMULS_M1(OP, TYPE)
+
+#define CASE_VFMA_OPCODE_LMULS_MF4(OP, TYPE) \
+ CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF4): \
+ case CASE_VFMA_OPCODE_LMULS_MF2(OP, TYPE)
+
+#define CASE_VFMA_OPCODE_LMULS(OP, TYPE) \
+ CASE_VFMA_OPCODE_COMMON(OP, TYPE, MF8): \
+ case CASE_VFMA_OPCODE_LMULS_MF4(OP, TYPE)
+
#define CASE_VFMA_SPLATS(OP) \
- CASE_VFMA_OPCODE_LMULS(OP, VF16): \
- case CASE_VFMA_OPCODE_LMULS(OP, VF32): \
- case CASE_VFMA_OPCODE_LMULS(OP, VF64)
+ CASE_VFMA_OPCODE_LMULS_MF4(OP, VF16): \
+ case CASE_VFMA_OPCODE_LMULS_MF2(OP, VF32): \
+ case CASE_VFMA_OPCODE_LMULS_M1(OP, VF64)
// clang-format on
bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
@@ -1430,10 +1443,10 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case CASE_VFMA_SPLATS(FNMSUB):
case CASE_VFMA_SPLATS(FNMACC):
case CASE_VFMA_SPLATS(FNMSAC):
- case CASE_VFMA_OPCODE_LMULS(FMACC, VV):
- case CASE_VFMA_OPCODE_LMULS(FMSAC, VV):
- case CASE_VFMA_OPCODE_LMULS(FNMACC, VV):
- case CASE_VFMA_OPCODE_LMULS(FNMSAC, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FMACC, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FMSAC, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FNMACC, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FNMSAC, VV):
case CASE_VFMA_OPCODE_LMULS(MADD, VX):
case CASE_VFMA_OPCODE_LMULS(NMSUB, VX):
case CASE_VFMA_OPCODE_LMULS(MACC, VX):
@@ -1454,10 +1467,10 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
return false;
return true;
}
- case CASE_VFMA_OPCODE_LMULS(FMADD, VV):
- case CASE_VFMA_OPCODE_LMULS(FMSUB, VV):
- case CASE_VFMA_OPCODE_LMULS(FNMADD, VV):
- case CASE_VFMA_OPCODE_LMULS(FNMSUB, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FMADD, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FMSUB, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FNMADD, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FNMSUB, VV):
case CASE_VFMA_OPCODE_LMULS(MADD, VV):
case CASE_VFMA_OPCODE_LMULS(NMSUB, VV): {
// If the tail policy is undisturbed we can't commute.
@@ -1533,19 +1546,28 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
Opc = RISCV::PseudoV##NEWOP##_##TYPE##_##LMUL; \
break;
-#define CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, TYPE) \
- CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF8) \
- CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF4) \
- CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF2) \
+#define CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, TYPE) \
CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M1) \
CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M2) \
CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M4) \
CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, M8)
+#define CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, TYPE) \
+ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF2) \
+ CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, TYPE)
+
+#define CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, TYPE) \
+ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF4) \
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, TYPE)
+
+#define CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, TYPE) \
+ CASE_VFMA_CHANGE_OPCODE_COMMON(OLDOP, NEWOP, TYPE, MF8) \
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, TYPE)
+
#define CASE_VFMA_CHANGE_OPCODE_SPLATS(OLDOP, NEWOP) \
- CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, VF16) \
- CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, VF32) \
- CASE_VFMA_CHANGE_OPCODE_LMULS(OLDOP, NEWOP, VF64)
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VF16) \
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VF32) \
+ CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VF64)
MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
bool NewMI,
@@ -1566,10 +1588,10 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
case CASE_VFMA_SPLATS(FNMADD):
case CASE_VFMA_SPLATS(FNMSAC):
case CASE_VFMA_SPLATS(FNMSUB):
- case CASE_VFMA_OPCODE_LMULS(FMACC, VV):
- case CASE_VFMA_OPCODE_LMULS(FMSAC, VV):
- case CASE_VFMA_OPCODE_LMULS(FNMACC, VV):
- case CASE_VFMA_OPCODE_LMULS(FNMSAC, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FMACC, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FMSAC, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FNMACC, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FNMSAC, VV):
case CASE_VFMA_OPCODE_LMULS(MADD, VX):
case CASE_VFMA_OPCODE_LMULS(NMSUB, VX):
case CASE_VFMA_OPCODE_LMULS(MACC, VX):
@@ -1592,10 +1614,10 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
CASE_VFMA_CHANGE_OPCODE_SPLATS(FNMADD, FNMACC)
CASE_VFMA_CHANGE_OPCODE_SPLATS(FNMSAC, FNMSUB)
CASE_VFMA_CHANGE_OPCODE_SPLATS(FNMSUB, FNMSAC)
- CASE_VFMA_CHANGE_OPCODE_LMULS(FMACC, FMADD, VV)
- CASE_VFMA_CHANGE_OPCODE_LMULS(FMSAC, FMSUB, VV)
- CASE_VFMA_CHANGE_OPCODE_LMULS(FNMACC, FNMADD, VV)
- CASE_VFMA_CHANGE_OPCODE_LMULS(FNMSAC, FNMSUB, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FMACC, FMADD, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FMSAC, FMSUB, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FNMACC, FNMADD, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FNMSAC, FNMSUB, VV)
CASE_VFMA_CHANGE_OPCODE_LMULS(MACC, MADD, VX)
CASE_VFMA_CHANGE_OPCODE_LMULS(MADD, MACC, VX)
CASE_VFMA_CHANGE_OPCODE_LMULS(NMSAC, NMSUB, VX)
@@ -1609,10 +1631,10 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
- case CASE_VFMA_OPCODE_LMULS(FMADD, VV):
- case CASE_VFMA_OPCODE_LMULS(FMSUB, VV):
- case CASE_VFMA_OPCODE_LMULS(FNMADD, VV):
- case CASE_VFMA_OPCODE_LMULS(FNMSUB, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FMADD, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FMSUB, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FNMADD, VV):
+ case CASE_VFMA_OPCODE_LMULS_MF4(FNMSUB, VV):
case CASE_VFMA_OPCODE_LMULS(MADD, VV):
case CASE_VFMA_OPCODE_LMULS(NMSUB, VV): {
assert((OpIdx1 == 1 || OpIdx2 == 1) && "Unexpected opcode index");
@@ -1623,10 +1645,10 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected opcode");
- CASE_VFMA_CHANGE_OPCODE_LMULS(FMADD, FMACC, VV)
- CASE_VFMA_CHANGE_OPCODE_LMULS(FMSUB, FMSAC, VV)
- CASE_VFMA_CHANGE_OPCODE_LMULS(FNMADD, FNMACC, VV)
- CASE_VFMA_CHANGE_OPCODE_LMULS(FNMSUB, FNMSAC, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FMADD, FMACC, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FMSUB, FMSAC, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FNMADD, FNMACC, VV)
+ CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(FNMSUB, FNMSAC, VV)
CASE_VFMA_CHANGE_OPCODE_LMULS(MADD, MACC, VV)
CASE_VFMA_CHANGE_OPCODE_LMULS(NMSUB, NMSAC, VV)
}
@@ -1655,13 +1677,16 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
#define CASE_WIDEOP_OPCODE_COMMON(OP, LMUL) \
RISCV::PseudoV##OP##_##LMUL##_TIED
-#define CASE_WIDEOP_OPCODE_LMULS(OP) \
- CASE_WIDEOP_OPCODE_COMMON(OP, MF8): \
- case CASE_WIDEOP_OPCODE_COMMON(OP, MF4): \
+#define CASE_WIDEOP_OPCODE_LMULS_MF4(OP) \
+ CASE_WIDEOP_OPCODE_COMMON(OP, MF4): \
case CASE_WIDEOP_OPCODE_COMMON(OP, MF2): \
case CASE_WIDEOP_OPCODE_COMMON(OP, M1): \
case CASE_WIDEOP_OPCODE_COMMON(OP, M2): \
case CASE_WIDEOP_OPCODE_COMMON(OP, M4)
+
+#define CASE_WIDEOP_OPCODE_LMULS(OP) \
+ CASE_WIDEOP_OPCODE_COMMON(OP, MF8): \
+ case CASE_WIDEOP_OPCODE_LMULS_MF4(OP)
// clang-format on
#define CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, LMUL) \
@@ -1669,22 +1694,25 @@ MachineInstr *RISCVInstrInfo::commuteInstructionImpl(MachineInstr &MI,
NewOpc = RISCV::PseudoV##OP##_##LMUL; \
break;
-#define CASE_WIDEOP_CHANGE_OPCODE_LMULS(OP) \
- CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF8) \
+#define CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4(OP) \
CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF4) \
CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF2) \
CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M1) \
CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M2) \
CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, M4)
+#define CASE_WIDEOP_CHANGE_OPCODE_LMULS(OP) \
+ CASE_WIDEOP_CHANGE_OPCODE_COMMON(OP, MF8) \
+ CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4(OP)
+
MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
LiveVariables *LV,
LiveIntervals *LIS) const {
switch (MI.getOpcode()) {
default:
break;
- case CASE_WIDEOP_OPCODE_LMULS(FWADD_WV):
- case CASE_WIDEOP_OPCODE_LMULS(FWSUB_WV):
+ case CASE_WIDEOP_OPCODE_LMULS_MF4(FWADD_WV):
+ case CASE_WIDEOP_OPCODE_LMULS_MF4(FWSUB_WV):
case CASE_WIDEOP_OPCODE_LMULS(WADD_WV):
case CASE_WIDEOP_OPCODE_LMULS(WADDU_WV):
case CASE_WIDEOP_OPCODE_LMULS(WSUB_WV):
@@ -1694,14 +1722,14 @@ MachineInstr *RISCVInstrInfo::convertToThreeAddress(MachineInstr &MI,
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unexpected opcode");
- CASE_WIDEOP_CHANGE_OPCODE_LMULS(FWADD_WV)
- CASE_WIDEOP_CHANGE_OPCODE_LMULS(FWSUB_WV)
+ CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4(FWADD_WV)
+ CASE_WIDEOP_CHANGE_OPCODE_LMULS_MF4(FWSUB_WV)
CASE_WIDEOP_CHANGE_OPCODE_LMULS(WADD_WV)
CASE_WIDEOP_CHANGE_OPCODE_LMULS(WADDU_WV)
CASE_WIDEOP_CHANGE_OPCODE_LMULS(WSUB_WV)
CASE_WIDEOP_CHANGE_OPCODE_LMULS(WSUBU_WV)
}
- //clang-format on
+ // clang-format on
MachineBasicBlock &MBB = *MI.getParent();
MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 71eb6f01a4f4..64cd89cda06a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -402,6 +402,21 @@ def AddiPairImmB : SDNodeXForm<imm, [{
N->getValueType(0));
}]>;
+def XLenSubTrailingOnes : SDNodeXForm<imm, [{
+ uint64_t XLen = Subtarget->getXLen();
+ uint64_t TrailingOnes = N->getAPIntValue().countTrailingOnes();
+ return CurDAG->getTargetConstant(XLen - TrailingOnes, SDLoc(N),
+ N->getValueType(0));
+}]>;
+
+// Check that this mask is a non-empty sequence of ones starting at the
+// least significant bit with the remainder zero, and that it does not fit
+// in a simm12.
+def TrailingOnesMask : PatLeaf<(imm), [{
+ if (!N->hasOneUse())
+ return false;
+ return !isInt<12>(N->getSExtValue()) && isMask_64(N->getZExtValue());
+}], XLenSubTrailingOnes>;
+
//===----------------------------------------------------------------------===//
// Instruction Formats
//===----------------------------------------------------------------------===//
@@ -1019,6 +1034,23 @@ def mul_const_oneuse : PatFrag<(ops node:$A, node:$B),
return false;
}]>;
+def sext_oneuse : PatFrag<(ops node:$A), (sext node:$A), [{
+ return N->hasOneUse();
+}]>;
+
+def zext_oneuse : PatFrag<(ops node:$A), (zext node:$A), [{
+ return N->hasOneUse();
+}]>;
+
+def anyext_oneuse : PatFrag<(ops node:$A), (anyext node:$A), [{
+ return N->hasOneUse();
+}]>;
+
+def fpext_oneuse : PatFrag<(ops node:$A),
+ (any_fpextend node:$A), [{
+ return N->hasOneUse();
+}]>;
+
/// Simple arithmetic operations
def : PatGprGpr<add, ADD>;
@@ -1034,6 +1066,10 @@ def : PatGprUimmLog2XLen<shl, SLLI>;
def : PatGprUimmLog2XLen<srl, SRLI>;
def : PatGprUimmLog2XLen<sra, SRAI>;
+// AND with trailing ones mask exceeding simm12.
+def : Pat<(XLenVT (and GPR:$rs, TrailingOnesMask:$mask)),
+ (SRLI (SLLI $rs, TrailingOnesMask:$mask), TrailingOnesMask:$mask)>;
+
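The identity this pattern relies on can be spot-checked on the host (a standalone sketch assuming XLEN = 64): masking with K trailing ones is the same as shifting left by XLEN - K and then logically shifting right by the same amount, which is exactly what the SLLI+SRLI pair does.

#include <cassert>
#include <cstdint>

// Standalone check (assumes XLEN = 64): for a mask of K trailing ones,
//   x & ((1 << K) - 1)  ==  (x << (64 - K)) >> (64 - K)
// when the right shift is logical, matching the SLLI+SRLI expansion.
int main() {
  const uint64_t Samples[] = {0x0123456789abcdefULL, ~0ULL, 0ULL,
                              0x8000000000000001ULL};
  for (unsigned K = 12; K < 64; ++K) { // only masks that don't fit in simm12
    uint64_t Mask = ~0ULL >> (64 - K);
    unsigned Shamt = 64 - K;
    for (uint64_t X : Samples)
      assert((X & Mask) == ((X << Shamt) >> Shamt));
  }
  return 0;
}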
// Match both a plain shift and one where the shift amount is masked (this is
// typically introduced when the legalizer promotes the shift amount and
// zero-extends it). For RISC-V, the mask is unnecessary as shifts in the base
@@ -1350,6 +1386,10 @@ def ReadFRM : ReadSysReg<SysRegFRM, [FRM]>;
def WriteFRM : WriteSysReg<SysRegFRM, [FRM]>;
def WriteFRMImm : WriteSysRegImm<SysRegFRM, [FRM]>;
+let hasSideEffects = true in {
+def ReadFFLAGS : ReadSysReg<SysRegFFLAGS, [FFLAGS]>;
+def WriteFFLAGS : WriteSysReg<SysRegFFLAGS, [FFLAGS]>;
+}
/// Other pseudo-instructions
// Pessimistically assume the stack pointer will be clobbered
@@ -1476,5 +1516,6 @@ include "RISCVInstrInfoF.td"
include "RISCVInstrInfoD.td"
include "RISCVInstrInfoC.td"
include "RISCVInstrInfoZb.td"
+include "RISCVInstrInfoZk.td"
include "RISCVInstrInfoV.td"
include "RISCVInstrInfoZfh.td"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index d6c31c4804db..2837b92da81f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -30,21 +30,12 @@ def RISCVSplitF64 : SDNode<"RISCVISD::SplitF64", SDT_RISCVSplitF64>;
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtD] in {
-
-let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
-def FLD : RVInstI<0b011, OPC_LOAD_FP, (outs FPR64:$rd),
- (ins GPR:$rs1, simm12:$imm12),
- "fld", "$rd, ${imm12}(${rs1})">,
- Sched<[WriteFLD64, ReadFMemBase]>;
+def FLD : FPLoad_r<0b011, "fld", FPR64, WriteFLD64>;
// Operands for stores are in the order srcreg, base, offset rather than
// reflecting the order these fields are specified in the instruction
// encoding.
-let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
-def FSD : RVInstS<0b011, OPC_STORE_FP, (outs),
- (ins FPR64:$rs2, GPR:$rs1, simm12:$imm12),
- "fsd", "$rs2, ${imm12}(${rs1})">,
- Sched<[WriteFST64, ReadStoreData, ReadFMemBase]>;
+def FSD : FPStore_r<0b011, "fsd", FPR64, WriteFST64>;
let SchedRW = [WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64] in {
def FMADD_D : FPFMA_rrr_frm<OPC_MADD, 0b01, "fmadd.d", FPR64>;
@@ -167,6 +158,10 @@ def : InstAlias<"fge.d $rd, $rs, $rt",
def PseudoFLD : PseudoFloatLoad<"fld", FPR64>;
def PseudoFSD : PseudoStore<"fsd", FPR64>;
+let usesCustomInserter = 1 in {
+def PseudoQuietFLE_D : PseudoQuietFCMP<FPR64>;
+def PseudoQuietFLT_D : PseudoQuietFCMP<FPR64>;
+}
} // Predicates = [HasStdExtD]
//===----------------------------------------------------------------------===//
@@ -231,13 +226,34 @@ def : PatFpr64Fpr64<fminnum, FMIN_D>;
def : PatFpr64Fpr64<fmaxnum, FMAX_D>;
/// Setcc
-
-def : PatFpr64Fpr64<seteq, FEQ_D>;
-def : PatFpr64Fpr64<setoeq, FEQ_D>;
-def : PatFpr64Fpr64<setlt, FLT_D>;
-def : PatFpr64Fpr64<setolt, FLT_D>;
-def : PatFpr64Fpr64<setle, FLE_D>;
-def : PatFpr64Fpr64<setole, FLE_D>;
+// FIXME: SETEQ/SETLT/SETLE imply nonans, so can we pick better instructions
+// for the strict versions of those?
+
+// Match non-signaling FEQ_D
+def : PatSetCC<FPR64, any_fsetcc, SETEQ, FEQ_D>;
+def : PatSetCC<FPR64, any_fsetcc, SETOEQ, FEQ_D>;
+def : PatSetCC<FPR64, strict_fsetcc, SETLT, PseudoQuietFLT_D>;
+def : PatSetCC<FPR64, strict_fsetcc, SETOLT, PseudoQuietFLT_D>;
+def : PatSetCC<FPR64, strict_fsetcc, SETLE, PseudoQuietFLE_D>;
+def : PatSetCC<FPR64, strict_fsetcc, SETOLE, PseudoQuietFLE_D>;
+
+// Match signaling FEQ_D
+def : Pat<(strict_fsetccs FPR64:$rs1, FPR64:$rs2, SETEQ),
+ (AND (FLE_D $rs1, $rs2),
+ (FLE_D $rs2, $rs1))>;
+def : Pat<(strict_fsetccs FPR64:$rs1, FPR64:$rs2, SETOEQ),
+ (AND (FLE_D $rs1, $rs2),
+ (FLE_D $rs2, $rs1))>;
+// If both operands are the same, use a single FLE.
+def : Pat<(strict_fsetccs FPR64:$rs1, FPR64:$rs1, SETEQ),
+ (FLE_D $rs1, $rs1)>;
+def : Pat<(strict_fsetccs FPR64:$rs1, FPR64:$rs1, SETOEQ),
+ (FLE_D $rs1, $rs1)>;
+
+def : PatSetCC<FPR64, any_fsetccs, SETLT, FLT_D>;
+def : PatSetCC<FPR64, any_fsetccs, SETOLT, FLT_D>;
+def : PatSetCC<FPR64, any_fsetccs, SETLE, FLE_D>;
+def : PatSetCC<FPR64, any_fsetccs, SETOLE, FLE_D>;
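For context (a hedged, host-side sketch, not the backend code): FEQ is a quiet comparison while FLE/FLT are signaling, so a signaling equality is synthesized as (fle a, b) & (fle b, a), which agrees with a == b for non-NaN inputs and raises the invalid flag when either input is a NaN. The program below illustrates the intended flag behaviour; whether the host compiler preserves FP exception flags across these comparisons is itself an assumption.

#include <cfenv>
#include <cmath>
#include <cstdio>

// Host-side illustration: for ordinary values, (a <= b) && (b <= a) is the
// same predicate as a == b, which is why two FLE results are ANDed to form a
// signaling equality. With a quiet NaN operand, the ordered <= comparison is
// expected to raise FE_INVALID while == stays quiet. (Assumes the host
// compiler does not fold away the FP environment access.)
int main() {
  volatile double A = 1.0, B = 1.0, QNaN = std::nan("");

  std::feclearexcept(FE_ALL_EXCEPT);
  bool EqViaLE = (A <= B) && (B <= A); // mirrors AND(FLE a b, FLE b a)
  std::printf("eq_via_le=%d invalid=%d\n", EqViaLE,
              std::fetestexcept(FE_INVALID) != 0);

  std::feclearexcept(FE_ALL_EXCEPT);
  bool QuietEq = (A == QNaN); // quiet compare: false, no invalid expected
  std::printf("quiet_eq=%d invalid=%d\n", QuietEq,
              std::fetestexcept(FE_INVALID) != 0);

  std::feclearexcept(FE_ALL_EXCEPT);
  bool SignalingLE = (A <= QNaN); // ordered compare: false, invalid expected
  std::printf("signaling_le=%d invalid=%d\n", SignalingLE,
              std::fetestexcept(FE_INVALID) != 0);
  return 0;
}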
def Select_FPR64_Using_CC_GPR : SelectCC_rrirr<FPR64, GPR>;
@@ -269,20 +285,22 @@ let Predicates = [HasStdExtD, IsRV32] in {
/// Float constants
def : Pat<(f64 (fpimm0)), (FCVT_D_W (i32 X0))>;
+def : Pat<(f64 (fpimmneg0)), (FSGNJN_D (FCVT_D_W (i32 X0)),
+ (FCVT_D_W (i32 X0)))>;
// double->[u]int. Round-to-zero must be used.
def : Pat<(i32 (any_fp_to_sint FPR64:$rs1)), (FCVT_W_D FPR64:$rs1, 0b001)>;
def : Pat<(i32 (any_fp_to_uint FPR64:$rs1)), (FCVT_WU_D FPR64:$rs1, 0b001)>;
// Saturating double->[u]int32.
-def : Pat<(i32 (riscv_fcvt_x_rtz FPR64:$rs1)), (FCVT_W_D $rs1, 0b001)>;
-def : Pat<(i32 (riscv_fcvt_xu_rtz FPR64:$rs1)), (FCVT_WU_D $rs1, 0b001)>;
+def : Pat<(i32 (riscv_fcvt_x FPR64:$rs1, timm:$frm)), (FCVT_W_D $rs1, timm:$frm)>;
+def : Pat<(i32 (riscv_fcvt_xu FPR64:$rs1, timm:$frm)), (FCVT_WU_D $rs1, timm:$frm)>;
// double->int32 with current rounding mode.
-def : Pat<(i32 (lrint FPR64:$rs1)), (FCVT_W_D $rs1, 0b111)>;
+def : Pat<(i32 (any_lrint FPR64:$rs1)), (FCVT_W_D $rs1, 0b111)>;
// double->int32 rounded to nearest with ties rounded away from zero.
-def : Pat<(i32 (lround FPR64:$rs1)), (FCVT_W_D $rs1, 0b100)>;
+def : Pat<(i32 (any_lround FPR64:$rs1)), (FCVT_W_D $rs1, 0b100)>;
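The rounding-mode immediates appearing in these patterns are the standard RISC-V frm encodings; the constants below are a reference aid written for this note, not something taken from the patch.

#include <cstdio>

// Standard RISC-V frm encodings, as used in the FCVT patterns above:
//   0b001 (RTZ) for fp_to_int, 0b111 (DYN) for lrint-style conversions that
//   honour the current rounding mode, 0b100 (RMM) for lround-style rounding
//   to nearest with ties away from zero.
enum RISCVFRM : unsigned {
  RNE = 0b000, // round to nearest, ties to even
  RTZ = 0b001, // round toward zero
  RDN = 0b010, // round down (toward -inf)
  RUP = 0b011, // round up (toward +inf)
  RMM = 0b100, // round to nearest, ties to max magnitude
  DYN = 0b111  // use the dynamic rounding mode from the frm CSR
};

int main() {
  std::printf("RTZ=%u RMM=%u DYN=%u\n", unsigned(RTZ), unsigned(RMM),
              unsigned(DYN));
  return 0;
}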
// [u]int->double.
def : Pat<(any_sint_to_fp (i32 GPR:$rs1)), (FCVT_D_W GPR:$rs1)>;
@@ -293,6 +311,8 @@ let Predicates = [HasStdExtD, IsRV64] in {
/// Float constants
def : Pat<(f64 (fpimm0)), (FMV_D_X (i64 X0))>;
+def : Pat<(f64 (fpimmneg0)), (FSGNJN_D (FMV_D_X (i64 X0)),
+ (FMV_D_X (i64 X0)))>;
// Moves (no conversion)
def : Pat<(bitconvert (i64 GPR:$rs1)), (FMV_D_X GPR:$rs1)>;
@@ -301,28 +321,28 @@ def : Pat<(i64 (bitconvert FPR64:$rs1)), (FMV_X_D FPR64:$rs1)>;
// Use target specific isd nodes to help us remember the result is sign
// extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be
// duplicated if it has another user that didn't need the sign_extend.
-def : Pat<(riscv_any_fcvt_w_rtz_rv64 FPR64:$rs1), (FCVT_W_D $rs1, 0b001)>;
-def : Pat<(riscv_any_fcvt_wu_rtz_rv64 FPR64:$rs1), (FCVT_WU_D $rs1, 0b001)>;
+def : Pat<(riscv_any_fcvt_w_rv64 FPR64:$rs1, timm:$frm), (FCVT_W_D $rs1, timm:$frm)>;
+def : Pat<(riscv_any_fcvt_wu_rv64 FPR64:$rs1, timm:$frm), (FCVT_WU_D $rs1, timm:$frm)>;
// [u]int32->fp
def : Pat<(any_sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_D_W $rs1)>;
def : Pat<(any_uint_to_fp (i64 (zexti32 (i64 GPR:$rs1)))), (FCVT_D_WU $rs1)>;
// Saturating double->[u]int64.
-def : Pat<(i64 (riscv_fcvt_x_rtz FPR64:$rs1)), (FCVT_L_D $rs1, 0b001)>;
-def : Pat<(i64 (riscv_fcvt_xu_rtz FPR64:$rs1)), (FCVT_LU_D $rs1, 0b001)>;
+def : Pat<(i64 (riscv_fcvt_x FPR64:$rs1, timm:$frm)), (FCVT_L_D $rs1, timm:$frm)>;
+def : Pat<(i64 (riscv_fcvt_xu FPR64:$rs1, timm:$frm)), (FCVT_LU_D $rs1, timm:$frm)>;
// double->[u]int64. Round-to-zero must be used.
def : Pat<(i64 (any_fp_to_sint FPR64:$rs1)), (FCVT_L_D FPR64:$rs1, 0b001)>;
def : Pat<(i64 (any_fp_to_uint FPR64:$rs1)), (FCVT_LU_D FPR64:$rs1, 0b001)>;
// double->int64 with current rounding mode.
-def : Pat<(i64 (lrint FPR64:$rs1)), (FCVT_L_D $rs1, 0b111)>;
-def : Pat<(i64 (llrint FPR64:$rs1)), (FCVT_L_D $rs1, 0b111)>;
+def : Pat<(i64 (any_lrint FPR64:$rs1)), (FCVT_L_D $rs1, 0b111)>;
+def : Pat<(i64 (any_llrint FPR64:$rs1)), (FCVT_L_D $rs1, 0b111)>;
// double->int64 rounded to nearest with ties rounded away from zero.
-def : Pat<(i64 (lround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>;
-def : Pat<(i64 (llround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>;
+def : Pat<(i64 (any_lround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>;
+def : Pat<(i64 (any_llround FPR64:$rs1)), (FCVT_L_D $rs1, 0b100)>;
// [u]int64->fp. Match GCC and default to using dynamic rounding mode.
def : Pat<(any_sint_to_fp (i64 GPR:$rs1)), (FCVT_D_L GPR:$rs1, 0b111)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
index bb45ed859442..a8ac06ba8da3 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -20,36 +20,38 @@ def SDT_RISCVFMV_W_X_RV64
def SDT_RISCVFMV_X_ANYEXTW_RV64
: SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>;
def SDT_RISCVFCVT_W_RV64
- : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisFP<1>]>;
+ : SDTypeProfile<1, 2, [SDTCisVT<0, i64>, SDTCisFP<1>,
+ SDTCisVT<2, i64>]>;
def SDT_RISCVFCVT_X
- : SDTypeProfile<1, 1, [SDTCisVT<0, XLenVT>, SDTCisFP<1>]>;
+ : SDTypeProfile<1, 2, [SDTCisVT<0, XLenVT>, SDTCisFP<1>,
+ SDTCisVT<2, XLenVT>]>;
def riscv_fmv_w_x_rv64
: SDNode<"RISCVISD::FMV_W_X_RV64", SDT_RISCVFMV_W_X_RV64>;
def riscv_fmv_x_anyextw_rv64
: SDNode<"RISCVISD::FMV_X_ANYEXTW_RV64", SDT_RISCVFMV_X_ANYEXTW_RV64>;
-def riscv_fcvt_w_rtz_rv64
- : SDNode<"RISCVISD::FCVT_W_RTZ_RV64", SDT_RISCVFCVT_W_RV64>;
-def riscv_fcvt_wu_rtz_rv64
- : SDNode<"RISCVISD::FCVT_WU_RTZ_RV64", SDT_RISCVFCVT_W_RV64>;
-def riscv_fcvt_x_rtz
- : SDNode<"RISCVISD::FCVT_X_RTZ", SDT_RISCVFCVT_X>;
-def riscv_fcvt_xu_rtz
- : SDNode<"RISCVISD::FCVT_XU_RTZ", SDT_RISCVFCVT_X>;
-
-def riscv_strict_fcvt_w_rtz_rv64
- : SDNode<"RISCVISD::STRICT_FCVT_W_RTZ_RV64", SDT_RISCVFCVT_W_RV64,
+def riscv_fcvt_w_rv64
+ : SDNode<"RISCVISD::FCVT_W_RV64", SDT_RISCVFCVT_W_RV64>;
+def riscv_fcvt_wu_rv64
+ : SDNode<"RISCVISD::FCVT_WU_RV64", SDT_RISCVFCVT_W_RV64>;
+def riscv_fcvt_x
+ : SDNode<"RISCVISD::FCVT_X", SDT_RISCVFCVT_X>;
+def riscv_fcvt_xu
+ : SDNode<"RISCVISD::FCVT_XU", SDT_RISCVFCVT_X>;
+
+def riscv_strict_fcvt_w_rv64
+ : SDNode<"RISCVISD::STRICT_FCVT_W_RV64", SDT_RISCVFCVT_W_RV64,
[SDNPHasChain]>;
-def riscv_strict_fcvt_wu_rtz_rv64
- : SDNode<"RISCVISD::STRICT_FCVT_WU_RTZ_RV64", SDT_RISCVFCVT_W_RV64,
+def riscv_strict_fcvt_wu_rv64
+ : SDNode<"RISCVISD::STRICT_FCVT_WU_RV64", SDT_RISCVFCVT_W_RV64,
[SDNPHasChain]>;
-def riscv_any_fcvt_w_rtz_rv64 : PatFrags<(ops node:$src),
- [(riscv_strict_fcvt_w_rtz_rv64 node:$src),
- (riscv_fcvt_w_rtz_rv64 node:$src)]>;
-def riscv_any_fcvt_wu_rtz_rv64 : PatFrags<(ops node:$src),
- [(riscv_strict_fcvt_wu_rtz_rv64 node:$src),
- (riscv_fcvt_wu_rtz_rv64 node:$src)]>;
+def riscv_any_fcvt_w_rv64 : PatFrags<(ops node:$src, node:$frm),
+ [(riscv_strict_fcvt_w_rv64 node:$src, node:$frm),
+ (riscv_fcvt_w_rv64 node:$src, node:$frm)]>;
+def riscv_any_fcvt_wu_rv64 : PatFrags<(ops node:$src, node:$frm),
+ [(riscv_strict_fcvt_wu_rv64 node:$src, node:$frm),
+ (riscv_fcvt_wu_rv64 node:$src, node:$frm)]>;
//===----------------------------------------------------------------------===//
// Operand and SDNode transformation definitions.
@@ -73,6 +75,22 @@ def frmarg : Operand<XLenVT> {
// Instruction class templates
//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
+class FPLoad_r<bits<3> funct3, string opcodestr, RegisterClass rty,
+ SchedWrite sw>
+ : RVInstI<funct3, OPC_LOAD_FP, (outs rty:$rd),
+ (ins GPR:$rs1, simm12:$imm12),
+ opcodestr, "$rd, ${imm12}(${rs1})">,
+ Sched<[sw, ReadFMemBase]>;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
+class FPStore_r<bits<3> funct3, string opcodestr, RegisterClass rty,
+ SchedWrite sw>
+ : RVInstS<funct3, OPC_STORE_FP, (outs),
+ (ins rty:$rs2, GPR:$rs1, simm12:$imm12),
+ opcodestr, "$rs2, ${imm12}(${rs1})">,
+ Sched<[sw, ReadStoreData, ReadFMemBase]>;
+
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, mayRaiseFPException = 1,
UseNamedOperandTable = 1, hasPostISelHook = 1 in
class FPFMA_rrr_frm<RISCVOpcode opcode, bits<2> funct2, string opcodestr,
@@ -138,20 +156,12 @@ class FPCmp_rr<bits<7> funct7, bits<3> funct3, string opcodestr,
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtF] in {
-let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
-def FLW : RVInstI<0b010, OPC_LOAD_FP, (outs FPR32:$rd),
- (ins GPR:$rs1, simm12:$imm12),
- "flw", "$rd, ${imm12}(${rs1})">,
- Sched<[WriteFLD32, ReadFMemBase]>;
+def FLW : FPLoad_r<0b010, "flw", FPR32, WriteFLD32>;
// Operands for stores are in the order srcreg, base, offset rather than
// reflecting the order these fields are specified in the instruction
// encoding.
-let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
-def FSW : RVInstS<0b010, OPC_STORE_FP, (outs),
- (ins FPR32:$rs2, GPR:$rs1, simm12:$imm12),
- "fsw", "$rs2, ${imm12}(${rs1})">,
- Sched<[WriteFST32, ReadStoreData, ReadFMemBase]>;
+def FSW : FPStore_r<0b010, "fsw", FPR32, WriteFST32>;
let SchedRW = [WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32] in {
def FMADD_S : FPFMA_rrr_frm<OPC_MADD, 0b00, "fmadd.s", FPR32>;
@@ -299,6 +309,10 @@ def : MnemonicAlias<"fmv.x.s", "fmv.x.w">;
def PseudoFLW : PseudoFloatLoad<"flw", FPR32>;
def PseudoFSW : PseudoStore<"fsw", FPR32>;
+let usesCustomInserter = 1 in {
+def PseudoQuietFLE_S : PseudoQuietFCMP<FPR32>;
+def PseudoQuietFLT_S : PseudoQuietFCMP<FPR32>;
+}
} // Predicates = [HasStdExtF]
//===----------------------------------------------------------------------===//
@@ -306,9 +320,13 @@ def PseudoFSW : PseudoStore<"fsw", FPR32>;
//===----------------------------------------------------------------------===//
/// Floating point constants
-def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
+def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
+def fpimmneg0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(-0.0); }]>;
/// Generic pattern classes
+class PatSetCC<RegisterClass Ty, SDPatternOperator OpNode, CondCode Cond, RVInst Inst>
+ : Pat<(OpNode Ty:$rs1, Ty:$rs2, Cond), (Inst $rs1, $rs2)>;
+
class PatFpr32Fpr32<SDPatternOperator OpNode, RVInstR Inst>
: Pat<(OpNode FPR32:$rs1, FPR32:$rs2), (Inst $rs1, $rs2)>;
@@ -319,6 +337,7 @@ let Predicates = [HasStdExtF] in {
/// Float constants
def : Pat<(f32 (fpimm0)), (FMV_W_X X0)>;
+def : Pat<(f32 (fpimmneg0)), (FSGNJN_S (FMV_W_X X0), (FMV_W_X X0))>;
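For context (host-side illustration only): fsgnjn rd, rs1, rs2 writes rs1's magnitude with the inverted sign of rs2, so applying it to two copies of +0.0 materializes -0.0 without a constant pool load. The equivalent sign manipulation on the host:

#include <cassert>
#include <cmath>

// Illustrative only: flipping the sign of +0.0 yields -0.0, which still
// compares equal to 0.0f but has its sign bit set. This is the value the
// FSGNJN-based fpimmneg0 patterns materialize.
int main() {
  float PosZero = 0.0f;
  float NegZero = std::copysign(PosZero, -1.0f); // invert the sign of +0.0
  assert(std::signbit(NegZero) && NegZero == 0.0f);
  return 0;
}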
/// Float conversion operations
@@ -363,13 +382,34 @@ def : PatFpr32Fpr32<fminnum, FMIN_S>;
def : PatFpr32Fpr32<fmaxnum, FMAX_S>;
/// Setcc
-
-def : PatFpr32Fpr32<seteq, FEQ_S>;
-def : PatFpr32Fpr32<setoeq, FEQ_S>;
-def : PatFpr32Fpr32<setlt, FLT_S>;
-def : PatFpr32Fpr32<setolt, FLT_S>;
-def : PatFpr32Fpr32<setle, FLE_S>;
-def : PatFpr32Fpr32<setole, FLE_S>;
+// FIXME: SETEQ/SETLT/SETLE imply nonans, so can we pick better instructions
+// for the strict versions of those?
+
+// Match non-signaling FEQ_S
+def : PatSetCC<FPR32, any_fsetcc, SETEQ, FEQ_S>;
+def : PatSetCC<FPR32, any_fsetcc, SETOEQ, FEQ_S>;
+def : PatSetCC<FPR32, strict_fsetcc, SETLT, PseudoQuietFLT_S>;
+def : PatSetCC<FPR32, strict_fsetcc, SETOLT, PseudoQuietFLT_S>;
+def : PatSetCC<FPR32, strict_fsetcc, SETLE, PseudoQuietFLE_S>;
+def : PatSetCC<FPR32, strict_fsetcc, SETOLE, PseudoQuietFLE_S>;
+
+// Match signaling FEQ_S
+def : Pat<(strict_fsetccs FPR32:$rs1, FPR32:$rs2, SETEQ),
+ (AND (FLE_S $rs1, $rs2),
+ (FLE_S $rs2, $rs1))>;
+def : Pat<(strict_fsetccs FPR32:$rs1, FPR32:$rs2, SETOEQ),
+ (AND (FLE_S $rs1, $rs2),
+ (FLE_S $rs2, $rs1))>;
+// If both operands are the same, use a single FLE.
+def : Pat<(strict_fsetccs FPR32:$rs1, FPR32:$rs1, SETEQ),
+ (FLE_S $rs1, $rs1)>;
+def : Pat<(strict_fsetccs FPR32:$rs1, FPR32:$rs1, SETOEQ),
+ (FLE_S $rs1, $rs1)>;
+
+def : PatSetCC<FPR32, any_fsetccs, SETLT, FLT_S>;
+def : PatSetCC<FPR32, any_fsetccs, SETOLT, FLT_S>;
+def : PatSetCC<FPR32, any_fsetccs, SETLE, FLE_S>;
+def : PatSetCC<FPR32, any_fsetccs, SETOLE, FLE_S>;
def Select_FPR32_Using_CC_GPR : SelectCC_rrirr<FPR32, GPR>;
@@ -393,14 +433,14 @@ def : Pat<(i32 (any_fp_to_sint FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>;
def : Pat<(i32 (any_fp_to_uint FPR32:$rs1)), (FCVT_WU_S $rs1, 0b001)>;
// Saturating float->[u]int32.
-def : Pat<(i32 (riscv_fcvt_x_rtz FPR32:$rs1)), (FCVT_W_S $rs1, 0b001)>;
-def : Pat<(i32 (riscv_fcvt_xu_rtz FPR32:$rs1)), (FCVT_WU_S $rs1, 0b001)>;
+def : Pat<(i32 (riscv_fcvt_x FPR32:$rs1, timm:$frm)), (FCVT_W_S $rs1, timm:$frm)>;
+def : Pat<(i32 (riscv_fcvt_xu FPR32:$rs1, timm:$frm)), (FCVT_WU_S $rs1, timm:$frm)>;
// float->int32 with current rounding mode.
-def : Pat<(i32 (lrint FPR32:$rs1)), (FCVT_W_S $rs1, 0b111)>;
+def : Pat<(i32 (any_lrint FPR32:$rs1)), (FCVT_W_S $rs1, 0b111)>;
// float->int32 rounded to nearest with ties rounded away from zero.
-def : Pat<(i32 (lround FPR32:$rs1)), (FCVT_W_S $rs1, 0b100)>;
+def : Pat<(i32 (any_lround FPR32:$rs1)), (FCVT_W_S $rs1, 0b100)>;
// [u]int->float. Match GCC and default to using dynamic rounding mode.
def : Pat<(any_sint_to_fp (i32 GPR:$rs1)), (FCVT_S_W $rs1, 0b111)>;
@@ -417,24 +457,24 @@ def : Pat<(sext_inreg (riscv_fmv_x_anyextw_rv64 FPR32:$src), i32),
// Use target specific isd nodes to help us remember the result is sign
// extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be
// duplicated if it has another user that didn't need the sign_extend.
-def : Pat<(riscv_any_fcvt_w_rtz_rv64 FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>;
-def : Pat<(riscv_any_fcvt_wu_rtz_rv64 FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>;
+def : Pat<(riscv_any_fcvt_w_rv64 FPR32:$rs1, timm:$frm), (FCVT_W_S $rs1, timm:$frm)>;
+def : Pat<(riscv_any_fcvt_wu_rv64 FPR32:$rs1, timm:$frm), (FCVT_WU_S $rs1, timm:$frm)>;
// float->[u]int64. Round-to-zero must be used.
def : Pat<(i64 (any_fp_to_sint FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>;
def : Pat<(i64 (any_fp_to_uint FPR32:$rs1)), (FCVT_LU_S $rs1, 0b001)>;
// Saturating float->[u]int64.
-def : Pat<(i64 (riscv_fcvt_x_rtz FPR32:$rs1)), (FCVT_L_S $rs1, 0b001)>;
-def : Pat<(i64 (riscv_fcvt_xu_rtz FPR32:$rs1)), (FCVT_LU_S $rs1, 0b001)>;
+def : Pat<(i64 (riscv_fcvt_x FPR32:$rs1, timm:$frm)), (FCVT_L_S $rs1, timm:$frm)>;
+def : Pat<(i64 (riscv_fcvt_xu FPR32:$rs1, timm:$frm)), (FCVT_LU_S $rs1, timm:$frm)>;
// float->int64 with current rounding mode.
-def : Pat<(i64 (lrint FPR32:$rs1)), (FCVT_L_S $rs1, 0b111)>;
-def : Pat<(i64 (llrint FPR32:$rs1)), (FCVT_L_S $rs1, 0b111)>;
+def : Pat<(i64 (any_lrint FPR32:$rs1)), (FCVT_L_S $rs1, 0b111)>;
+def : Pat<(i64 (any_llrint FPR32:$rs1)), (FCVT_L_S $rs1, 0b111)>;
// float->int64 rounded to nearest with ties rounded away from zero.
-def : Pat<(i64 (lround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>;
-def : Pat<(i64 (llround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>;
+def : Pat<(i64 (any_lround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>;
+def : Pat<(i64 (any_llround FPR32:$rs1)), (FCVT_L_S $rs1, 0b100)>;
// [u]int->fp. Match GCC and default to using dynamic rounding mode.
def : Pat<(any_sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_S_W $rs1, 0b111)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 173ae43a08d6..306024a3e4fd 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -19,18 +19,22 @@ include "RISCVInstrFormatsV.td"
// Operand and SDNode transformation definitions.
//===----------------------------------------------------------------------===//
-def VTypeIAsmOperand : AsmOperandClass {
- let Name = "VTypeI";
+class VTypeIAsmOperand<int VTypeINum> : AsmOperandClass {
+ let Name = "VTypeI" # VTypeINum;
let ParserMethod = "parseVTypeI";
let DiagnosticType = "InvalidVTypeI";
+ let RenderMethod = "addVTypeIOperands";
}
-def VTypeIOp : Operand<XLenVT> {
- let ParserMatchClass = VTypeIAsmOperand;
+class VTypeIOp<int VTypeINum> : Operand<XLenVT> {
+ let ParserMatchClass = VTypeIAsmOperand<VTypeINum>;
let PrintMethod = "printVTypeI";
- let DecoderMethod = "decodeUImmOperand<11>";
+ let DecoderMethod = "decodeUImmOperand<"#VTypeINum#">";
}
+def VTypeIOp10 : VTypeIOp<10>;
+def VTypeIOp11 : VTypeIOp<11>;
+
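As a hedged sketch of why two operand widths are needed (the packing helper below is illustrative; the authoritative layout lives in the in-tree RISCVVType helpers): vsetvli carries an 11-bit vtype immediate while vsetivli only has room for 10 bits, and the encoded fields are vlmul[2:0], vsew[2:0], vta, and vma.

#include <cstdio>

// Illustrative vtype packing, mirroring the layout described in the V spec:
// vlmul in bits 2:0, vsew in bits 5:3, vta in bit 6, vma in bit 7. The
// result fits in both the 10-bit vsetivli field and the 11-bit vsetvli field.
static unsigned encodeVType(unsigned VLMul, unsigned VSEW, bool TA, bool MA) {
  return (VLMul & 0x7) | ((VSEW & 0x7) << 3) | (unsigned(TA) << 6) |
         (unsigned(MA) << 7);
}

int main() {
  // e32 (vsew=0b010), m1 (vlmul=0b000), tail agnostic, mask undisturbed.
  std::printf("vtype = 0x%x\n", encodeVType(/*VLMul=*/0b000, /*VSEW=*/0b010,
                                            /*TA=*/true, /*MA=*/false));
  return 0;
}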
def VMaskAsmOperand : AsmOperandClass {
let Name = "RVVMaskRegOpOperand";
let RenderMethod = "addRegOperands";
@@ -77,6 +81,9 @@ def simm5_plus1 : Operand<XLenVT>, ImmLeaf<XLenVT,
}];
}
+def simm5_plus1_nonzero : ImmLeaf<XLenVT,
+ [{return Imm != 0 && ((isInt<5>(Imm) && Imm != -16) || Imm == 16);}]>;
+
//===----------------------------------------------------------------------===//
// Scheduling definitions.
//===----------------------------------------------------------------------===//
@@ -342,6 +349,27 @@ class VALUVs2<bits<6> funct6, bits<5> vs1, RISCVVFormat opv, string opcodestr>
// Combination of instruction classes.
// Use these multiclasses to define instructions more easily.
//===----------------------------------------------------------------------===//
+
+multiclass VIndexLoadStore<list<int> EEWList> {
+ foreach n = EEWList in {
+ defvar w = !cast<RISCVWidth>("LSWidth" # n);
+
+ def VLUXEI # n # _V :
+ VIndexedLoad<MOPLDIndexedUnord, w, "vluxei" # n # ".v">,
+ VLXSched<n, "U">;
+ def VLOXEI # n # _V :
+ VIndexedLoad<MOPLDIndexedOrder, w, "vloxei" # n # ".v">,
+ VLXSched<n, "O">;
+
+ def VSUXEI # n # _V :
+ VIndexedStore<MOPSTIndexedUnord, w, "vsuxei" # n # ".v">,
+ VSXSched<n, "U">;
+ def VSOXEI # n # _V :
+ VIndexedStore<MOPSTIndexedOrder, w, "vsoxei" # n # ".v">,
+ VSXSched<n, "O">;
+ }
+}
+
multiclass VALU_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">,
Sched<[WriteVIALUV, ReadVIALUV, ReadVIALUV, ReadVMask]>;
@@ -757,7 +785,7 @@ multiclass VCPR_MV_Mask<string opcodestr, bits<6> funct6, string vm = "v"> {
}
multiclass VWholeLoadN<bits<3> nf, string opcodestr, RegisterClass VRC> {
- foreach l = [8, 16, 32, 64] in {
+ foreach l = [8, 16, 32] in {
defvar w = !cast<RISCVWidth>("LSWidth" # l);
defvar s = !cast<SchedWrite>("WriteVLD" # !add(nf, 1) # "R" # l);
@@ -765,23 +793,27 @@ multiclass VWholeLoadN<bits<3> nf, string opcodestr, RegisterClass VRC> {
Sched<[s, ReadVLDX]>;
}
}
+multiclass VWholeLoadEEW64<bits<3> nf, string opcodestr, RegisterClass VRC, SchedReadWrite schedrw> {
+ def E64_V : VWholeLoad<nf, LSWidth64, opcodestr # "e64.v", VRC>,
+ Sched<[schedrw, ReadVLDX]>;
+}
//===----------------------------------------------------------------------===//
// Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
-def VSETVLI : RVInstSetVLi<(outs GPR:$rd), (ins GPR:$rs1, VTypeIOp:$vtypei),
+def VSETVLI : RVInstSetVLi<(outs GPR:$rd), (ins GPR:$rs1, VTypeIOp11:$vtypei),
"vsetvli", "$rd, $rs1, $vtypei">;
-def VSETIVLI : RVInstSetiVLi<(outs GPR:$rd), (ins uimm5:$uimm, VTypeIOp:$vtypei),
+def VSETIVLI : RVInstSetiVLi<(outs GPR:$rd), (ins uimm5:$uimm, VTypeIOp10:$vtypei),
"vsetivli", "$rd, $uimm, $vtypei">;
def VSETVL : RVInstSetVL<(outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
"vsetvl", "$rd, $rs1, $rs2">;
} // hasSideEffects = 1, mayLoad = 0, mayStore = 0
-foreach eew = [8, 16, 32, 64] in {
+foreach eew = [8, 16, 32] in {
defvar w = !cast<RISCVWidth>("LSWidth" # eew);
// Vector Unit-Stride Instructions
@@ -794,18 +826,12 @@ foreach eew = [8, 16, 32, 64] in {
// Vector Strided Instructions
def VLSE#eew#_V : VStridedLoad<w, "vlse"#eew#".v">, VLSSched<eew>;
def VSSE#eew#_V : VStridedStore<w, "vsse"#eew#".v">, VSSSched<eew>;
-
- // Vector Indexed Instructions
- def VLUXEI#eew#_V :
- VIndexedLoad<MOPLDIndexedUnord, w, "vluxei"#eew#".v">, VLXSched<eew, "U">;
- def VLOXEI#eew#_V :
- VIndexedLoad<MOPLDIndexedOrder, w, "vloxei"#eew#".v">, VLXSched<eew, "O">;
- def VSUXEI#eew#_V :
- VIndexedStore<MOPSTIndexedUnord, w, "vsuxei"#eew#".v">, VSXSched<eew, "U">;
- def VSOXEI#eew#_V :
- VIndexedStore<MOPSTIndexedOrder, w, "vsoxei"#eew#".v">, VSXSched<eew, "O">;
}
+defm "" : VIndexLoadStore<[8, 16, 32]>;
+} // Predicates = [HasVInstructions]
+
+let Predicates = [HasVInstructions] in {
def VLM_V : VUnitStrideLoadMask<"vlm.v">,
Sched<[WriteVLDM, ReadVLDX]>;
def VSM_V : VUnitStrideStoreMask<"vsm.v">,
@@ -820,11 +846,6 @@ defm VL2R : VWholeLoadN<1, "vl2r", VRM2>;
defm VL4R : VWholeLoadN<3, "vl4r", VRM4>;
defm VL8R : VWholeLoadN<7, "vl8r", VRM8>;
-def : InstAlias<"vl1r.v $vd, (${rs1})", (VL1RE8_V VR:$vd, GPR:$rs1)>;
-def : InstAlias<"vl2r.v $vd, (${rs1})", (VL2RE8_V VRM2:$vd, GPR:$rs1)>;
-def : InstAlias<"vl4r.v $vd, (${rs1})", (VL4RE8_V VRM4:$vd, GPR:$rs1)>;
-def : InstAlias<"vl8r.v $vd, (${rs1})", (VL8RE8_V VRM8:$vd, GPR:$rs1)>;
-
def VS1R_V : VWholeStore<0, "vs1r.v", VR>,
Sched<[WriteVST1R, ReadVST1R, ReadVSTX]>;
def VS2R_V : VWholeStore<1, "vs2r.v", VRM2>,
@@ -834,6 +855,40 @@ def VS4R_V : VWholeStore<3, "vs4r.v", VRM4>,
def VS8R_V : VWholeStore<7, "vs8r.v", VRM8>,
Sched<[WriteVST8R, ReadVST8R, ReadVSTX]>;
+def : InstAlias<"vl1r.v $vd, (${rs1})", (VL1RE8_V VR:$vd, GPR:$rs1)>;
+def : InstAlias<"vl2r.v $vd, (${rs1})", (VL2RE8_V VRM2:$vd, GPR:$rs1)>;
+def : InstAlias<"vl4r.v $vd, (${rs1})", (VL4RE8_V VRM4:$vd, GPR:$rs1)>;
+def : InstAlias<"vl8r.v $vd, (${rs1})", (VL8RE8_V VRM8:$vd, GPR:$rs1)>;
+} // Predicates = [HasVInstructions]
+
+let Predicates = [HasVInstructionsI64] in {
+// Vector Unit-Stride Instructions
+def VLE64_V : VUnitStrideLoad<LSWidth64, "vle64.v">,
+ VLESched<64>;
+
+def VLE64FF_V : VUnitStrideLoadFF<LSWidth64, "vle64ff.v">,
+ VLFSched<64>;
+
+def VSE64_V : VUnitStrideStore<LSWidth64, "vse64.v">,
+ VSESched<64>;
+// Vector Strided Instructions
+def VLSE64_V : VStridedLoad<LSWidth64, "vlse64.v">,
+ VLSSched<64>;
+
+def VSSE64_V : VStridedStore<LSWidth64, "vsse64.v">,
+ VSSSched<64>;
+
+defm VL1R: VWholeLoadEEW64<0, "vl1r", VR, WriteVLD1R64>;
+defm VL2R: VWholeLoadEEW64<1, "vl2r", VRM2, WriteVLD2R64>;
+defm VL4R: VWholeLoadEEW64<3, "vl4r", VRM4, WriteVLD4R64>;
+defm VL8R: VWholeLoadEEW64<7, "vl8r", VRM8, WriteVLD8R64>;
+} // Predicates = [HasVInstructionsI64]
+let Predicates = [IsRV64, HasVInstructionsI64] in {
+ // Vector Indexed Instructions
+ defm "" : VIndexLoadStore<[64]>;
+} // [IsRV64, HasVInstructionsI64]
+
+let Predicates = [HasVInstructions] in {
// Vector Single-Width Integer Add and Subtract
defm VADD_V : VALU_IV_V_X_I<"vadd", 0b000000>;
defm VSUB_V : VALU_IV_V_X<"vsub", 0b000010>;
@@ -1065,9 +1120,9 @@ let Constraints = "@earlyclobber $vd" in {
defm VNCLIPU_W : VNCLP_IV_V_X_I<"vnclipu", 0b101110, uimm5, "w">;
defm VNCLIP_W : VNCLP_IV_V_X_I<"vnclip", 0b101111, uimm5, "w">;
} // Constraints = "@earlyclobber $vd"
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
// Vector Single-Width Floating-Point Add/Subtract Instructions
defm VFADD_V : VALU_FV_V_F<"vfadd", 0b000000>;
defm VFSUB_V : VALU_FV_V_F<"vfsub", 0b000010>;
@@ -1202,9 +1257,9 @@ defm VFNCVT_F_X_W : VNCVTF_IV_VS2<"vfncvt.f.x.w", 0b010010, 0b10011>;
defm VFNCVT_F_F_W : VNCVTF_FV_VS2<"vfncvt.f.f.w", 0b010010, 0b10100>;
defm VFNCVT_ROD_F_F_W : VNCVTF_FV_VS2<"vfncvt.rod.f.f.w", 0b010010, 0b10101>;
} // Constraints = "@earlyclobber $vd"
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
// Vector Single-Width Integer Reduction Instructions
let RVVConstraint = NoConstraint in {
@@ -1228,9 +1283,9 @@ defm VWREDSUMU : VWRED_IV_V<"vwredsumu", 0b110000>;
defm VWREDSUM : VWRED_IV_V<"vwredsum", 0b110001>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = NoConstraint
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
// Vector Single-Width Floating-Point Reduction Instructions
let RVVConstraint = NoConstraint in {
defm VFREDOSUM : VREDO_FV_V<"vfredosum", 0b000011>;
@@ -1254,9 +1309,9 @@ defm VFWREDUSUM : VWRED_FV_V<"vfwredusum", 0b110001>;
def : InstAlias<"vfwredsum.vs $vd, $vs2, $vs1$vm",
(VFWREDUSUM_VS VR:$vd, VR:$vs2, VR:$vs1, VMaskOp:$vm), 0>;
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
// Vector Mask-Register Logical Instructions
let RVVConstraint = NoConstraint in {
defm VMAND_M : VMALU_MV_Mask<"vmand", 0b011001, "m">;
@@ -1337,9 +1392,9 @@ def VMV_S_X : RVInstV2<0b010000, 0b00000, OPMVX, (outs VR:$vd_wb),
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1,
RVVConstraint = NoConstraint in {
@@ -1354,9 +1409,9 @@ def VFMV_S_F : RVInstV2<0b010000, 0b00000, OPFVF, (outs VR:$vd_wb),
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
// Vector Slide Instructions
let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in {
defm VSLIDEUP_V : VSLD_IV_X_I<"vslideup", 0b001110, uimm5>;
@@ -1364,16 +1419,16 @@ defm VSLIDE1UP_V : VSLD1_MV_X<"vslide1up", 0b001110>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp
defm VSLIDEDOWN_V : VSLD_IV_X_I<"vslidedown", 0b001111, uimm5>;
defm VSLIDE1DOWN_V : VSLD1_MV_X<"vslide1down", 0b001111>;
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtV, HasStdExtF] in {
+let Predicates = [HasVInstructionsAnyF] in {
let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in {
defm VFSLIDE1UP_V : VSLD1_FV_F<"vfslide1up", 0b001110>;
} // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp
defm VFSLIDE1DOWN_V : VSLD1_FV_F<"vfslide1down", 0b001111>;
-} // Predicates = [HasStdExtV, HasStdExtF]
+} // Predicates = [HasVInstructionsAnyF]
-let Predicates = [HasStdExtV] in {
+let Predicates = [HasVInstructions] in {
// Vector Register Gather Instruction
let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in {
defm VRGATHER_V : VGTR_IV_V_X_I<"vrgather", 0b001100, uimm5>;
@@ -1404,11 +1459,11 @@ foreach n = [2, 4, 8] in {
}
}
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
-} // Predicates = [HasStdExtV]
+} // Predicates = [HasVInstructions]
-let Predicates = [HasStdExtZvlsseg] in {
+let Predicates = [HasVInstructions] in {
foreach nf=2-8 in {
- foreach eew = [8, 16, 32, 64] in {
+ foreach eew = [8, 16, 32] in {
defvar w = !cast<RISCVWidth>("LSWidth"#eew);
def VLSEG#nf#E#eew#_V :
@@ -1439,6 +1494,41 @@ let Predicates = [HasStdExtZvlsseg] in {
"vsoxseg"#nf#"ei"#eew#".v">;
}
}
-} // Predicates = [HasStdExtZvlsseg]
+} // Predicates = [HasVInstructions]
+
+let Predicates = [HasVInstructionsI64] in {
+ foreach nf=2-8 in {
+ // Vector Unit-strided Segment Instructions
+ def VLSEG#nf#E64_V :
+ VUnitStrideSegmentLoad<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64.v">;
+ def VLSEG#nf#E64FF_V :
+ VUnitStrideSegmentLoadFF<!add(nf, -1), LSWidth64, "vlseg"#nf#"e64ff.v">;
+ def VSSEG#nf#E64_V :
+ VUnitStrideSegmentStore<!add(nf, -1), LSWidth64, "vsseg"#nf#"e64.v">;
+
+ // Vector Strided Segment Instructions
+ def VLSSEG#nf#E64_V :
+ VStridedSegmentLoad<!add(nf, -1), LSWidth64, "vlsseg"#nf#"e64.v">;
+ def VSSSEG#nf#E64_V :
+ VStridedSegmentStore<!add(nf, -1), LSWidth64, "vssseg"#nf#"e64.v">;
+ }
+} // Predicates = [HasVInstructionsI64]
+let Predicates = [HasVInstructionsI64, IsRV64] in {
+ foreach nf=2-8 in {
+ // Vector Indexed Segment Instructions
+ def VLUXSEG#nf#EI64_V :
+ VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedUnord, LSWidth64,
+ "vluxseg"#nf#"ei64.v">;
+ def VLOXSEG#nf#EI64_V :
+ VIndexedSegmentLoad<!add(nf, -1), MOPLDIndexedOrder, LSWidth64,
+ "vloxseg"#nf#"ei64.v">;
+ def VSUXSEG#nf#EI64_V :
+ VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedUnord, LSWidth64,
+ "vsuxseg"#nf#"ei64.v">;
+ def VSOXSEG#nf#EI64_V :
+ VIndexedSegmentStore<!add(nf, -1), MOPSTIndexedOrder, LSWidth64,
+ "vsoxseg"#nf#"ei64.v">;
+ }
+} // Predicates = [HasVInstructionsI64, IsRV64]
include "RISCVInstrInfoVPseudos.td"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
index 073fa605e0fb..4e7e251bc412 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td
@@ -71,49 +71,45 @@ def V_MF4 : LMULInfo<0b110, 2, VR, VR, VR,/*NoVReg*/VR,/*NoVReg*/VR, "M
def V_MF2 : LMULInfo<0b111, 4, VR, VR, VR, VR,/*NoVReg*/VR, "MF2">;
// Used to iterate over all possible LMULs.
-def MxList {
- list<LMULInfo> m = [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8];
-}
+defvar MxList = [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8];
+// For floating point, which doesn't need MF8.
+defvar MxListF = [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8];
+
// Used for widening and narrowing instructions as it doesn't contain M8.
-def MxListW {
- list<LMULInfo> m = [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4];
-}
+defvar MxListW = [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4];
+// For floating point, which doesn't need MF8.
+defvar MxListFW = [V_MF4, V_MF2, V_M1, V_M2, V_M4];
+
// Use for zext/sext.vf2
-def MxListVF2 {
- list<LMULInfo> m = [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8];
-}
+defvar MxListVF2 = [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8];
+
// Use for zext/sext.vf4
-def MxListVF4 {
- list<LMULInfo> m = [V_MF2, V_M1, V_M2, V_M4, V_M8];
-}
+defvar MxListVF4 = [V_MF2, V_M1, V_M2, V_M4, V_M8];
+
// Use for zext/sext.vf8
-def MxListVF8 {
- list<LMULInfo> m = [V_M1, V_M2, V_M4, V_M8];
+defvar MxListVF8 = [V_M1, V_M2, V_M4, V_M8];
+
+class MxSet<int eew> {
+ list<LMULInfo> m = !cond(!eq(eew, 8) : [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8],
+ !eq(eew, 16) : [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8],
+ !eq(eew, 32) : [V_MF2, V_M1, V_M2, V_M4, V_M8],
+ !eq(eew, 64) : [V_M1, V_M2, V_M4, V_M8]);
}
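The MxSet mapping above encodes the usual RVV rule that a fractional LMUL is only usable when LMUL * ELEN >= SEW; with ELEN = 64 that leaves MF8 only for EEW 8, MF4 and larger for EEW 16, and so on. A minimal C++ sketch of that rule, with the ELEN = 64 assumption and the eighths encoding chosen for illustration:

    #include <cstdio>
    #include <vector>

    // Legal LMUL values expressed in eighths of a register (MF8=1 ... M1=8 ... M8=64).
    static const int AllLMULEighths[] = {1, 2, 4, 8, 16, 32, 64};

    // A fractional LMUL is only usable when LMUL * ELEN >= EEW (assuming ELEN = 64),
    // which is exactly the set MxSet<eew>.m enumerates.
    std::vector<int> legalLMULsForEEW(int EEW, int ELEN = 64) {
      std::vector<int> Result;
      for (int L8 : AllLMULEighths)
        if (L8 * ELEN >= 8 * EEW) // (L8/8) * ELEN >= EEW, kept in integers
          Result.push_back(L8);
      return Result;
    }

    int main() {
      for (int EEW : {8, 16, 32, 64}) {
        std::printf("EEW %2d:", EEW);
        for (int L8 : legalLMULsForEEW(EEW))
          std::printf(" %d/8", L8);
        std::printf("\n");
      }
    }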
-class FPR_Info<RegisterClass regclass, string fx> {
+class FPR_Info<RegisterClass regclass, string fx, list<LMULInfo> mxlist> {
RegisterClass fprclass = regclass;
string FX = fx;
+ list<LMULInfo> MxList = mxlist;
}
-def SCALAR_F16 : FPR_Info<FPR16, "F16">;
-def SCALAR_F32 : FPR_Info<FPR32, "F32">;
-def SCALAR_F64 : FPR_Info<FPR64, "F64">;
+def SCALAR_F16 : FPR_Info<FPR16, "F16", MxSet<16>.m>;
+def SCALAR_F32 : FPR_Info<FPR32, "F32", MxSet<32>.m>;
+def SCALAR_F64 : FPR_Info<FPR64, "F64", MxSet<64>.m>;
-def FPList {
- list<FPR_Info> fpinfo = [SCALAR_F16, SCALAR_F32, SCALAR_F64];
-}
-// Used for widening instructions. It excludes F64.
-def FPListW {
- list<FPR_Info> fpinfo = [SCALAR_F16, SCALAR_F32];
-}
+defvar FPList = [SCALAR_F16, SCALAR_F32, SCALAR_F64];
-class MxSet<int eew> {
- list<LMULInfo> m = !cond(!eq(eew, 8) : [V_MF8, V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8],
- !eq(eew, 16) : [V_MF4, V_MF2, V_M1, V_M2, V_M4, V_M8],
- !eq(eew, 32) : [V_MF2, V_M1, V_M2, V_M4, V_M8],
- !eq(eew, 64) : [V_M1, V_M2, V_M4, V_M8]);
-}
+// Used for widening instructions. It excludes F64.
+defvar FPListW = [SCALAR_F16, SCALAR_F32];
class NFSet<LMULInfo m> {
list<int> L = !cond(!eq(m.value, V_M8.value): [],
@@ -236,25 +232,25 @@ defset list<VTypeInfo> AllVectors = {
defset list<GroupVTypeInfo> GroupFloatVectors = {
def VF16M2: GroupVTypeInfo<vfloat16m2_t, vfloat16m1_t, vbool8_t, 16,
- VRM2, V_M2, f16, FPR16>;
+ VRM2, V_M2, f16, FPR16>;
def VF16M4: GroupVTypeInfo<vfloat16m4_t, vfloat16m1_t, vbool4_t, 16,
- VRM4, V_M4, f16, FPR16>;
+ VRM4, V_M4, f16, FPR16>;
def VF16M8: GroupVTypeInfo<vfloat16m8_t, vfloat16m1_t, vbool2_t, 16,
- VRM8, V_M8, f16, FPR16>;
+ VRM8, V_M8, f16, FPR16>;
def VF32M2: GroupVTypeInfo<vfloat32m2_t, vfloat32m1_t, vbool16_t, 32,
- VRM2, V_M2, f32, FPR32>;
+ VRM2, V_M2, f32, FPR32>;
def VF32M4: GroupVTypeInfo<vfloat32m4_t, vfloat32m1_t, vbool8_t, 32,
- VRM4, V_M4, f32, FPR32>;
+ VRM4, V_M4, f32, FPR32>;
def VF32M8: GroupVTypeInfo<vfloat32m8_t, vfloat32m1_t, vbool4_t, 32,
- VRM8, V_M8, f32, FPR32>;
+ VRM8, V_M8, f32, FPR32>;
def VF64M2: GroupVTypeInfo<vfloat64m2_t, vfloat64m1_t, vbool32_t, 64,
- VRM2, V_M2, f64, FPR64>;
+ VRM2, V_M2, f64, FPR64>;
def VF64M4: GroupVTypeInfo<vfloat64m4_t, vfloat64m1_t, vbool16_t, 64,
- VRM4, V_M4, f64, FPR64>;
+ VRM4, V_M4, f64, FPR64>;
def VF64M8: GroupVTypeInfo<vfloat64m8_t, vfloat64m1_t, vbool8_t, 64,
- VRM8, V_M8, f64, FPR64>;
+ VRM8, V_M8, f64, FPR64>;
}
}
}
@@ -423,13 +419,14 @@ def RISCVVPseudosTable : GenericTable {
def RISCVVIntrinsicsTable : GenericTable {
let FilterClass = "RISCVVIntrinsic";
let CppTypeName = "RISCVVIntrinsicInfo";
- let Fields = ["IntrinsicID", "SplatOperand"];
+ let Fields = ["IntrinsicID", "SplatOperand", "VLOperand"];
let PrimaryKey = ["IntrinsicID"];
let PrimaryKeyName = "getRISCVVIntrinsicInfo";
}
-class RISCVVLE<bit M, bit Str, bit F, bits<3> S, bits<3> L> {
+class RISCVVLE<bit M, bit TU, bit Str, bit F, bits<3> S, bits<3> L> {
bits<1> Masked = M;
+ bits<1> IsTU = TU;
bits<1> Strided = Str;
bits<1> FF = F;
bits<3> Log2SEW = S;
@@ -440,8 +437,8 @@ class RISCVVLE<bit M, bit Str, bit F, bits<3> S, bits<3> L> {
def RISCVVLETable : GenericTable {
let FilterClass = "RISCVVLE";
let CppTypeName = "VLEPseudo";
- let Fields = ["Masked", "Strided", "FF", "Log2SEW", "LMUL", "Pseudo"];
- let PrimaryKey = ["Masked", "Strided", "FF", "Log2SEW", "LMUL"];
+ let Fields = ["Masked", "IsTU", "Strided", "FF", "Log2SEW", "LMUL", "Pseudo"];
+ let PrimaryKey = ["Masked", "IsTU", "Strided", "FF", "Log2SEW", "LMUL"];
let PrimaryKeyName = "getVLEPseudo";
}
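RISCVVLETable is a SearchableTable, so TableGen emits one row per RISCVVLE record plus a lookup keyed on (Masked, IsTU, Strided, FF, Log2SEW, LMUL); instruction selection can then recover the right pseudo, including the new _TU variants, from those properties. A rough, self-contained C++ sketch of that keyed-lookup shape; the struct, rows, and accessor below are stand-ins rather than the generated API:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <tuple>

    // Illustrative stand-in for the rows TableGen emits from RISCVVLE records;
    // the field order mirrors the PrimaryKey above.
    struct VLEPseudoRow {
      uint8_t Masked, IsTU, Strided, FF, Log2SEW, LMUL;
      unsigned PseudoOpc; // stand-in for the Pseudo field
      std::tuple<int, int, int, int, int, int> key() const {
        return std::make_tuple(Masked, IsTU, Strided, FF, Log2SEW, LMUL);
      }
    };

    // A few made-up rows, kept sorted by key as the generated table would be.
    static const VLEPseudoRow Table[] = {
        {0, 0, 0, 0, /*Log2SEW=*/3, /*LMUL code=*/0, 100}, // plain load
        {0, 1, 0, 0, 3, 0, 101},                           // _TU variant
        {1, 1, 0, 0, 3, 0, 102},                           // _MASK variant
    };

    // Binary search on the primary key, roughly the shape of the generated
    // accessor used during instruction selection.
    const VLEPseudoRow *lookupVLEPseudo(int Masked, int IsTU, int Strided,
                                        int FF, int Log2SEW, int LMUL) {
      auto Key = std::make_tuple(Masked, IsTU, Strided, FF, Log2SEW, LMUL);
      auto *It = std::lower_bound(std::begin(Table), std::end(Table), Key,
                                  [](const VLEPseudoRow &Row, const auto &K) {
                                    return Row.key() < K;
                                  });
      return (It != std::end(Table) && It->key() == Key) ? It : nullptr;
    }

    int main() {
      if (const VLEPseudoRow *R = lookupVLEPseudo(0, /*IsTU=*/1, 0, 0, 3, 0))
        std::printf("selected pseudo opcode %u\n", R->PseudoOpc);
    }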
@@ -461,8 +458,9 @@ def RISCVVSETable : GenericTable {
let PrimaryKeyName = "getVSEPseudo";
}
-class RISCVVLX_VSX<bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> {
+class RISCVVLX_VSX<bit M, bit TU, bit O, bits<3> S, bits<3> L, bits<3> IL> {
bits<1> Masked = M;
+ bits<1> IsTU = TU;
bits<1> Ordered = O;
bits<3> Log2SEW = S;
bits<3> LMUL = L;
@@ -470,15 +468,15 @@ class RISCVVLX_VSX<bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> {
Pseudo Pseudo = !cast<Pseudo>(NAME);
}
-class RISCVVLX<bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> :
- RISCVVLX_VSX<M, O, S, L, IL>;
+class RISCVVLX<bit M, bit TU, bit O, bits<3> S, bits<3> L, bits<3> IL> :
+ RISCVVLX_VSX<M, TU, O, S, L, IL>;
class RISCVVSX<bit M, bit O, bits<3> S, bits<3> L, bits<3> IL> :
- RISCVVLX_VSX<M, O, S, L, IL>;
+ RISCVVLX_VSX<M, /*TU*/0, O, S, L, IL>;
class RISCVVLX_VSXTable : GenericTable {
let CppTypeName = "VLX_VSXPseudo";
- let Fields = ["Masked", "Ordered", "Log2SEW", "LMUL", "IndexLMUL", "Pseudo"];
- let PrimaryKey = ["Masked", "Ordered", "Log2SEW", "LMUL", "IndexLMUL"];
+ let Fields = ["Masked", "IsTU", "Ordered", "Log2SEW", "LMUL", "IndexLMUL", "Pseudo"];
+ let PrimaryKey = ["Masked", "IsTU", "Ordered", "Log2SEW", "LMUL", "IndexLMUL"];
}
def RISCVVLXTable : RISCVVLX_VSXTable {
@@ -583,10 +581,11 @@ class PseudoToVInst<string PseudoInst> {
!subst("_B64", "",
!subst("_MASK", "",
!subst("_TIED", "",
+ !subst("_TU", "",
!subst("F16", "F",
!subst("F32", "F",
!subst("F64", "F",
- !subst("Pseudo", "", PseudoInst))))))))))))))))))));
+ !subst("Pseudo", "", PseudoInst)))))))))))))))))))));
}
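PseudoToVInst is a pure string rewrite: it strips the Pseudo prefix and the LMUL, type, mask, tie and (now) _TU suffixes so a pseudo's name collapses back to the base instruction record used for BaseInstr. A hedged C++ rendition with an abbreviated suffix list:

    #include <cstdio>
    #include <string>

    // Rough equivalent of the !subst chain in PseudoToVInst: strip the "Pseudo"
    // prefix and the LMUL/mask/tie/policy suffixes so only the base instruction
    // name is left. The suffix list here is abbreviated for illustration.
    std::string pseudoToVInst(std::string Name) {
      for (const char *Sub : {"Pseudo", "_M1", "_M2", "_M4", "_M8", "_MF2",
                              "_MF4", "_MF8", "_MASK", "_TIED", "_TU"}) {
        for (size_t Pos; (Pos = Name.find(Sub)) != std::string::npos;)
          Name.erase(Pos, std::string(Sub).size());
      }
      return Name;
    }

    int main() {
      // A name like "PseudoVADD_VV_M1_MASK" collapses back to "VADD_VV", the
      // real instruction record the pseudo is based on.
      std::printf("%s\n", pseudoToVInst("PseudoVADD_VV_M1_MASK").c_str());
    }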
// The destination vector register group for a masked vector instruction cannot
@@ -632,7 +631,7 @@ class VPseudoUSLoadNoMask<VReg RetClass, int EEW, bit isFF> :
Pseudo<(outs RetClass:$rd),
(ins GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVVLE</*Masked*/0, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> {
+ RISCVVLE</*Masked*/0, /*TU*/0, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
@@ -642,13 +641,29 @@ class VPseudoUSLoadNoMask<VReg RetClass, int EEW, bit isFF> :
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
+class VPseudoUSLoadNoMaskTU<VReg RetClass, int EEW, bit isFF> :
+ Pseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest, GPR:$rs1, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVLE</*Masked*/0, /*TU*/1, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let HasMergeOp = 1;
+ let Constraints = "$rd = $dest";
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
class VPseudoUSLoadMask<VReg RetClass, int EEW, bit isFF> :
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge,
GPR:$rs1,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>,
RISCVVPseudo,
- RISCVVLE</*Masked*/1, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> {
+ RISCVVLE</*Masked*/1, /*TU*/1, /*Strided*/0, /*FF*/isFF, log2<EEW>.val, VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
@@ -664,7 +679,7 @@ class VPseudoSLoadNoMask<VReg RetClass, int EEW>:
Pseudo<(outs RetClass:$rd),
(ins GPR:$rs1, GPR:$rs2, AVL:$vl, ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVVLE</*Masked*/0, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> {
+ RISCVVLE</*Masked*/0, /*TU*/0, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
@@ -674,13 +689,29 @@ class VPseudoSLoadNoMask<VReg RetClass, int EEW>:
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
+class VPseudoSLoadNoMaskTU<VReg RetClass, int EEW>:
+ Pseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest, GPR:$rs1, GPR:$rs2, AVL:$vl, ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVLE</*Masked*/0, /*TU*/1, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let HasMergeOp = 1;
+ let Constraints = "$rd = $dest";
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
class VPseudoSLoadMask<VReg RetClass, int EEW>:
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
(ins GetVRegNoV0<RetClass>.R:$merge,
GPR:$rs1, GPR:$rs2,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>,
RISCVVPseudo,
- RISCVVLE</*Masked*/1, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> {
+ RISCVVLE</*Masked*/1, /*TU*/1, /*Strided*/1, /*FF*/0, log2<EEW>.val, VLMul> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
@@ -695,9 +726,10 @@ class VPseudoSLoadMask<VReg RetClass, int EEW>:
class VPseudoILoadNoMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
bit Ordered, bit EarlyClobber>:
Pseudo<(outs RetClass:$rd),
- (ins GPR:$rs1, IdxClass:$rs2, AVL:$vl, ixlenimm:$sew),[]>,
+ (ins GPR:$rs1, IdxClass:$rs2, AVL:$vl,
+ ixlenimm:$sew),[]>,
RISCVVPseudo,
- RISCVVLX</*Masked*/0, Ordered, log2<EEW>.val, VLMul, LMUL> {
+ RISCVVLX</*Masked*/0, /*TU*/0, Ordered, log2<EEW>.val, VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
@@ -708,6 +740,24 @@ class VPseudoILoadNoMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
+class VPseudoILoadNoMaskTU<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
+ bit Ordered, bit EarlyClobber>:
+ Pseudo<(outs RetClass:$rd),
+ (ins RetClass:$dest, GPR:$rs1, IdxClass:$rs2, AVL:$vl,
+ ixlenimm:$sew),[]>,
+ RISCVVPseudo,
+ RISCVVLX</*Masked*/0, /*TU*/1, Ordered, log2<EEW>.val, VLMul, LMUL> {
+ let mayLoad = 1;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasDummyMask = 1;
+ let HasMergeOp = 1;
+ let Constraints = !if(!eq(EarlyClobber, 1), "@earlyclobber $rd, $rd = $dest", "$rd = $dest");
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+}
+
class VPseudoILoadMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
bit Ordered, bit EarlyClobber>:
Pseudo<(outs GetVRegNoV0<RetClass>.R:$rd),
@@ -715,7 +765,7 @@ class VPseudoILoadMask<VReg RetClass, VReg IdxClass, int EEW, bits<3> LMUL,
GPR:$rs1, IdxClass:$rs2,
VMaskOp:$vm, AVL:$vl, ixlenimm:$sew, ixlenimm:$policy),[]>,
RISCVVPseudo,
- RISCVVLX</*Masked*/1, Ordered, log2<EEW>.val, VLMul, LMUL> {
+ RISCVVLX</*Masked*/1, /*TU*/1, Ordered, log2<EEW>.val, VLMul, LMUL> {
let mayLoad = 1;
let mayStore = 0;
let hasSideEffects = 0;
@@ -932,6 +982,9 @@ class VPseudoBinaryNoMask<VReg RetClass,
let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
}
+// Special version of VPseudoBinaryNoMask where we pretend the first source is
+// tied to the destination.
+// This allows maskedoff and rs2 to be the same register.
class VPseudoTiedBinaryNoMask<VReg RetClass,
DAGOperand Op2Class,
string Constraint> :
@@ -1083,6 +1136,30 @@ class VPseudoBinaryCarryIn<VReg RetClass,
let VLMul = MInfo.value;
}
+class VPseudoTiedBinaryCarryIn<VReg RetClass,
+ VReg Op1Class,
+ DAGOperand Op2Class,
+ LMULInfo MInfo,
+ bit CarryIn,
+ string Constraint> :
+ Pseudo<(outs RetClass:$rd),
+ !if(CarryIn,
+ (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1, VMV0:$carry, AVL:$vl,
+ ixlenimm:$sew),
+ (ins RetClass:$merge, Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew)), []>,
+ RISCVVPseudo {
+ let mayLoad = 0;
+ let mayStore = 0;
+ let hasSideEffects = 0;
+ let Constraints = Join<[Constraint, "$rd = $merge"], ",">.ret;
+ let HasVLOp = 1;
+ let HasSEWOp = 1;
+ let HasMergeOp = 1;
+ let HasVecPolicyOp = 0;
+ let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
+ let VLMul = MInfo.value;
+}
+
class VPseudoTernaryNoMask<VReg RetClass,
RegisterClass Op1Class,
DAGOperand Op2Class,
@@ -1323,6 +1400,9 @@ multiclass VPseudoUSLoad {
def "E" # eew # "_V_" # LInfo :
VPseudoUSLoadNoMask<vreg, eew, false>,
VLESched<eew>;
+ def "E" # eew # "_V_" # LInfo # "_TU":
+ VPseudoUSLoadNoMaskTU<vreg, eew, false>,
+ VLESched<eew>;
def "E" # eew # "_V_" # LInfo # "_MASK" :
VPseudoUSLoadMask<vreg, eew, false>,
VLESched<eew>;
@@ -1340,6 +1420,9 @@ multiclass VPseudoFFLoad {
def "E" # eew # "FF_V_" # LInfo :
VPseudoUSLoadNoMask<vreg, eew, true>,
VLFSched<eew>;
+ def "E" # eew # "FF_V_" # LInfo # "_TU":
+ VPseudoUSLoadNoMaskTU<vreg, eew, true>,
+ VLFSched<eew>;
def "E" # eew # "FF_V_" # LInfo # "_MASK" :
VPseudoUSLoadMask<vreg, eew, true>,
VLFSched<eew>;
@@ -1364,6 +1447,8 @@ multiclass VPseudoSLoad {
let VLMul = lmul.value in {
def "E" # eew # "_V_" # LInfo : VPseudoSLoadNoMask<vreg, eew>,
VLSSched<eew>;
+ def "E" # eew # "_V_" # LInfo # "_TU": VPseudoSLoadNoMaskTU<vreg, eew>,
+ VLSSched<eew>;
def "E" # eew # "_V_" # LInfo # "_MASK" : VPseudoSLoadMask<vreg, eew>,
VLSSched<eew>;
}
@@ -1390,6 +1475,9 @@ multiclass VPseudoILoad<bit Ordered> {
def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo :
VPseudoILoadNoMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>,
VLXSched<eew, Order>;
+ def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo # "_TU":
+ VPseudoILoadNoMaskTU<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>,
+ VLXSched<eew, Order>;
def "EI" # eew # "_V_" # IdxLInfo # "_" # LInfo # "_MASK" :
VPseudoILoadMask<Vreg, IdxVreg, eew, idx_lmul.value, Ordered, HasConstraint>,
VLXSched<eew, Order>;
@@ -1504,7 +1592,7 @@ multiclass VPseudoVSFS_M {
}
multiclass VPseudoVID_V {
- foreach m = MxList.m in {
+ foreach m = MxList in {
let VLMul = m.value in {
def "_V_" # m.MX : VPseudoNullaryNoMask<m.vrclass>,
Sched<[WriteVMIdxV, ReadVMask]>;
@@ -1524,7 +1612,7 @@ multiclass VPseudoNullaryPseudoM <string BaseInst> {
multiclass VPseudoVIOT_M {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxList.m in {
+ foreach m = MxList in {
let VLMul = m.value in {
def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, VR, constraint>,
Sched<[WriteVMIotV, ReadVMIotV, ReadVMask]>;
@@ -1535,7 +1623,7 @@ multiclass VPseudoVIOT_M {
}
multiclass VPseudoVCPR_V {
- foreach m = MxList.m in {
+ foreach m = MxList in {
let VLMul = m.value in
def _VM # "_" # m.MX : VPseudoUnaryAnyMask<m.vrclass, m.vrclass>,
Sched<[WriteVCompressV, ReadVCompressV, ReadVCompressV]>;
@@ -1596,12 +1684,18 @@ multiclass VPseudoTiedBinary<VReg RetClass,
}
multiclass VPseudoBinaryV_VV<string Constraint = ""> {
- foreach m = MxList.m in
+ foreach m = MxList in
+ defm _VV : VPseudoBinary<m.vrclass, m.vrclass, m.vrclass, m, Constraint>;
+}
+
+// Similar to VPseudoBinaryV_VV, but uses MxListF.
+multiclass VPseudoBinaryFV_VV<string Constraint = ""> {
+ foreach m = MxListF in
defm _VV : VPseudoBinary<m.vrclass, m.vrclass, m.vrclass, m, Constraint>;
}
multiclass VPseudoVGTR_VV_EEW<int eew, string Constraint = ""> {
- foreach m = MxList.m in {
+ foreach m = MxList in {
foreach sew = EEWList in {
defvar octuple_lmul = m.octuple;
// emul = lmul * eew / sew
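Working in octuples keeps that ratio exact for fractional LMULs: emul = lmul * eew / sew becomes an integer multiply and divide on eighths of a register. A small sketch of the arithmetic, with a range guard added for illustration:

    #include <cstdio>

    // emul = lmul * eew / sew, computed in eighths of a register so fractional
    // LMULs stay exact. Returns 0 for combinations that have no legal EMUL
    // (an illustrative guard).
    int emulInEighths(int OctupleLMUL, int EEW, int SEW) {
      int OctupleEMUL = OctupleLMUL * EEW / SEW;
      return (OctupleEMUL < 1 || OctupleEMUL > 64) ? 0 : OctupleEMUL;
    }

    int main() {
      // vrgatherei16 with SEW=8 data at LMUL=1 (8 eighths) needs EMUL=2
      // (16 eighths) for its 16-bit index operand.
      std::printf("%d\n", emulInEighths(/*LMUL=M1*/ 8, /*EEW=*/16, /*SEW=*/8));
    }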
@@ -1617,38 +1711,38 @@ multiclass VPseudoVGTR_VV_EEW<int eew, string Constraint = ""> {
}
multiclass VPseudoBinaryV_VX<string Constraint = ""> {
- foreach m = MxList.m in
+ foreach m = MxList in
defm "_VX" : VPseudoBinary<m.vrclass, m.vrclass, GPR, m, Constraint>;
}
multiclass VPseudoVSLD1_VX<string Constraint = ""> {
- foreach m = MxList.m in
+ foreach m = MxList in
defm "_VX" : VPseudoBinary<m.vrclass, m.vrclass, GPR, m, Constraint>,
Sched<[WriteVISlide1X, ReadVISlideV, ReadVISlideX, ReadVMask]>;
}
multiclass VPseudoBinaryV_VF<string Constraint = ""> {
- foreach m = MxList.m in
- foreach f = FPList.fpinfo in
+ foreach f = FPList in
+ foreach m = f.MxList in
defm "_V" # f.FX : VPseudoBinary<m.vrclass, m.vrclass,
f.fprclass, m, Constraint>;
}
multiclass VPseudoVSLD1_VF<string Constraint = ""> {
- foreach m = MxList.m in
- foreach f = FPList.fpinfo in
+ foreach f = FPList in
+ foreach m = f.MxList in
defm "_V" # f.FX :
VPseudoBinary<m.vrclass, m.vrclass, f.fprclass, m, Constraint>,
Sched<[WriteVFSlide1F, ReadVFSlideV, ReadVFSlideF, ReadVMask]>;
}
multiclass VPseudoBinaryV_VI<Operand ImmType = simm5, string Constraint = ""> {
- foreach m = MxList.m in
+ foreach m = MxList in
defm _VI : VPseudoBinary<m.vrclass, m.vrclass, ImmType, m, Constraint>;
}
multiclass VPseudoVALU_MM {
- foreach m = MxList.m in
+ foreach m = MxList in
let VLMul = m.value in {
def "_MM_" # m.MX : VPseudoBinaryNoMask<VR, VR, VR, "">,
Sched<[WriteVMALUV, ReadVMALUV, ReadVMALUV]>;
@@ -1662,28 +1756,28 @@ multiclass VPseudoVALU_MM {
// * The destination EEW is greater than the source EEW, the source EMUL is
// at least 1, and the overlap is in the highest-numbered part of the
// destination register group is legal. Otherwise, it is illegal.
-multiclass VPseudoBinaryW_VV {
- foreach m = MxListW.m in
+multiclass VPseudoBinaryW_VV<list<LMULInfo> mxlist = MxListW> {
+ foreach m = mxlist in
defm _VV : VPseudoBinary<m.wvrclass, m.vrclass, m.vrclass, m,
"@earlyclobber $rd">;
}
multiclass VPseudoBinaryW_VX {
- foreach m = MxListW.m in
+ foreach m = MxListW in
defm "_VX" : VPseudoBinary<m.wvrclass, m.vrclass, GPR, m,
"@earlyclobber $rd">;
}
multiclass VPseudoBinaryW_VF {
- foreach m = MxListW.m in
- foreach f = FPListW.fpinfo in
+ foreach f = FPListW in
+ foreach m = f.MxList in
defm "_V" # f.FX : VPseudoBinary<m.wvrclass, m.vrclass,
f.fprclass, m,
"@earlyclobber $rd">;
}
-multiclass VPseudoBinaryW_WV {
- foreach m = MxListW.m in {
+multiclass VPseudoBinaryW_WV<list<LMULInfo> mxlist = MxListW> {
+ foreach m = mxlist in {
defm _WV : VPseudoBinary<m.wvrclass, m.wvrclass, m.vrclass, m,
"@earlyclobber $rd">;
defm _WV : VPseudoTiedBinary<m.wvrclass, m.vrclass, m,
@@ -1692,13 +1786,13 @@ multiclass VPseudoBinaryW_WV {
}
multiclass VPseudoBinaryW_WX {
- foreach m = MxListW.m in
+ foreach m = MxListW in
defm "_WX" : VPseudoBinary<m.wvrclass, m.wvrclass, GPR, m>;
}
multiclass VPseudoBinaryW_WF {
- foreach m = MxListW.m in
- foreach f = FPListW.fpinfo in
+ foreach f = FPListW in
+ foreach m = f.MxList in
defm "_W" # f.FX : VPseudoBinary<m.wvrclass, m.wvrclass,
f.fprclass, m>;
}
@@ -1709,19 +1803,19 @@ multiclass VPseudoBinaryW_WF {
// "The destination EEW is smaller than the source EEW and the overlap is in the
// lowest-numbered part of the source register group."
multiclass VPseudoBinaryV_WV {
- foreach m = MxListW.m in
+ foreach m = MxListW in
defm _WV : VPseudoBinary<m.vrclass, m.wvrclass, m.vrclass, m,
!if(!ge(m.octuple, 8), "@earlyclobber $rd", "")>;
}
multiclass VPseudoBinaryV_WX {
- foreach m = MxListW.m in
+ foreach m = MxListW in
defm _WX : VPseudoBinary<m.vrclass, m.wvrclass, GPR, m,
!if(!ge(m.octuple, 8), "@earlyclobber $rd", "")>;
}
multiclass VPseudoBinaryV_WI {
- foreach m = MxListW.m in
+ foreach m = MxListW in
defm _WI : VPseudoBinary<m.vrclass, m.wvrclass, uimm5, m,
!if(!ge(m.octuple, 8), "@earlyclobber $rd", "")>;
}
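The !ge(m.octuple, 8) tests above key off LMUL expressed in eighths of a register: the wide source of these narrowing forms occupies 2*LMUL registers, so once LMUL reaches 1 the source spans more than one register and the conservative @earlyclobber constraint is attached; below that, any overlap is necessarily with the single lowest-numbered source register, which the overlap rule permits. A brief sketch of that predicate:

    #include <cstdio>

    // LMUL is tracked in eighths of a register: MF8=1, MF4=2, MF2=4, M1=8, up to
    // M8=64. The wide source of the narrowing forms uses 2*LMUL, so from LMUL=1
    // upward it spans more than one register and @earlyclobber is required.
    const char *narrowingConstraint(int OctupleLMUL) {
      return OctupleLMUL >= 8 ? "@earlyclobber $rd" : "";
    }

    int main() {
      for (int L8 : {1, 2, 4, 8, 16, 32}) // MF8..M4; MxListW has no M8 entry
        std::printf("octuple %2d -> \"%s\"\n", L8, narrowingConstraint(L8));
    }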
@@ -1731,7 +1825,7 @@ multiclass VPseudoBinaryV_WI {
// For vadc and vsbc, CarryIn == 1 and CarryOut == 0
multiclass VPseudoBinaryV_VM<bit CarryOut = 0, bit CarryIn = 1,
string Constraint = ""> {
- foreach m = MxList.m in
+ foreach m = MxList in
def "_VV" # !if(CarryIn, "M", "") # "_" # m.MX :
VPseudoBinaryCarryIn<!if(CarryOut, VR,
!if(!and(CarryIn, !not(CarryOut)),
@@ -1739,9 +1833,19 @@ multiclass VPseudoBinaryV_VM<bit CarryOut = 0, bit CarryIn = 1,
m.vrclass, m.vrclass, m, CarryIn, Constraint>;
}
+multiclass VPseudoTiedBinaryV_VM<bit CarryOut = 0, bit CarryIn = 1,
+ string Constraint = ""> {
+ foreach m = MxList in
+ def "_VV" # !if(CarryIn, "M", "") # "_" # m.MX # "_TU" :
+ VPseudoTiedBinaryCarryIn<!if(CarryOut, VR,
+ !if(!and(CarryIn, !not(CarryOut)),
+ GetVRegNoV0<m.vrclass>.R, m.vrclass)),
+ m.vrclass, m.vrclass, m, CarryIn, Constraint>;
+}
+
multiclass VPseudoBinaryV_XM<bit CarryOut = 0, bit CarryIn = 1,
string Constraint = ""> {
- foreach m = MxList.m in
+ foreach m = MxList in
def "_VX" # !if(CarryIn, "M", "") # "_" # m.MX :
VPseudoBinaryCarryIn<!if(CarryOut, VR,
!if(!and(CarryIn, !not(CarryOut)),
@@ -1749,18 +1853,34 @@ multiclass VPseudoBinaryV_XM<bit CarryOut = 0, bit CarryIn = 1,
m.vrclass, GPR, m, CarryIn, Constraint>;
}
+multiclass VPseudoTiedBinaryV_XM<bit CarryOut = 0, bit CarryIn = 1,
+ string Constraint = ""> {
+ foreach m = MxList in
+ def "_VX" # !if(CarryIn, "M", "") # "_" # m.MX # "_TU":
+ VPseudoTiedBinaryCarryIn<!if(CarryOut, VR,
+ !if(!and(CarryIn, !not(CarryOut)),
+ GetVRegNoV0<m.vrclass>.R, m.vrclass)),
+ m.vrclass, GPR, m, CarryIn, Constraint>;
+}
+
multiclass VPseudoVMRG_FM {
- foreach m = MxList.m in
- foreach f = FPList.fpinfo in
+ foreach f = FPList in
+ foreach m = f.MxList in {
def "_V" # f.FX # "M_" # m.MX :
VPseudoBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
m.vrclass, f.fprclass, m, /*CarryIn=*/1, "">,
Sched<[WriteVFMergeV, ReadVFMergeV, ReadVFMergeF, ReadVMask]>;
+ // Tied version to allow codegen control over the tail elements
+ def "_V" # f.FX # "M_" # m.MX # "_TU":
+ VPseudoTiedBinaryCarryIn<GetVRegNoV0<m.vrclass>.R,
+ m.vrclass, f.fprclass, m, /*CarryIn=*/1, "">,
+ Sched<[WriteVFMergeV, ReadVFMergeV, ReadVFMergeF, ReadVMask]>;
+ }
}
multiclass VPseudoBinaryV_IM<bit CarryOut = 0, bit CarryIn = 1,
string Constraint = ""> {
- foreach m = MxList.m in
+ foreach m = MxList in
def "_VI" # !if(CarryIn, "M", "") # "_" # m.MX :
VPseudoBinaryCarryIn<!if(CarryOut, VR,
!if(!and(CarryIn, !not(CarryOut)),
@@ -1768,8 +1888,18 @@ multiclass VPseudoBinaryV_IM<bit CarryOut = 0, bit CarryIn = 1,
m.vrclass, simm5, m, CarryIn, Constraint>;
}
+multiclass VPseudoTiedBinaryV_IM<bit CarryOut = 0, bit CarryIn = 1,
+ string Constraint = ""> {
+ foreach m = MxList in
+ def "_VI" # !if(CarryIn, "M", "") # "_" # m.MX # "_TU":
+ VPseudoTiedBinaryCarryIn<!if(CarryOut, VR,
+ !if(!and(CarryIn, !not(CarryOut)),
+ GetVRegNoV0<m.vrclass>.R, m.vrclass)),
+ m.vrclass, simm5, m, CarryIn, Constraint>;
+}
+
multiclass VPseudoUnaryVMV_V_X_I {
- foreach m = MxList.m in {
+ foreach m = MxList in {
let VLMul = m.value in {
def "_V_" # m.MX : VPseudoUnaryNoDummyMask<m.vrclass, m.vrclass>,
Sched<[WriteVIMovV, ReadVIMovV]>;
@@ -1782,8 +1912,8 @@ multiclass VPseudoUnaryVMV_V_X_I {
}
multiclass VPseudoVMV_F {
- foreach m = MxList.m in {
- foreach f = FPList.fpinfo in {
+ foreach f = FPList in {
+ foreach m = f.MxList in {
let VLMul = m.value in {
def "_" # f.FX # "_" # m.MX :
VPseudoUnaryNoDummyMask<m.vrclass, f.fprclass>,
@@ -1794,7 +1924,7 @@ multiclass VPseudoVMV_F {
}
multiclass VPseudoVCLS_V {
- foreach m = MxList.m in {
+ foreach m = MxListF in {
let VLMul = m.value in {
def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
Sched<[WriteVFClassV, ReadVFClassV, ReadVMask]>;
@@ -1805,7 +1935,7 @@ multiclass VPseudoVCLS_V {
}
multiclass VPseudoVSQR_V {
- foreach m = MxList.m in {
+ foreach m = MxListF in {
let VLMul = m.value in {
def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
Sched<[WriteVFSqrtV, ReadVFSqrtV, ReadVMask]>;
@@ -1816,7 +1946,7 @@ multiclass VPseudoVSQR_V {
}
multiclass VPseudoVRCP_V {
- foreach m = MxList.m in {
+ foreach m = MxListF in {
let VLMul = m.value in {
def "_V_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.vrclass>,
Sched<[WriteVFRecpV, ReadVFRecpV, ReadVMask]>;
@@ -1828,7 +1958,7 @@ multiclass VPseudoVRCP_V {
multiclass PseudoVEXT_VF2 {
defvar constraints = "@earlyclobber $rd";
- foreach m = MxListVF2.m in
+ foreach m = MxListVF2 in
{
let VLMul = m.value in {
def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f2vrclass, constraints>,
@@ -1842,7 +1972,7 @@ multiclass PseudoVEXT_VF2 {
multiclass PseudoVEXT_VF4 {
defvar constraints = "@earlyclobber $rd";
- foreach m = MxListVF4.m in
+ foreach m = MxListVF4 in
{
let VLMul = m.value in {
def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f4vrclass, constraints>,
@@ -1856,7 +1986,7 @@ multiclass PseudoVEXT_VF4 {
multiclass PseudoVEXT_VF8 {
defvar constraints = "@earlyclobber $rd";
- foreach m = MxListVF8.m in
+ foreach m = MxListVF8 in
{
let VLMul = m.value in {
def "_" # m.MX : VPseudoUnaryNoMask<m.vrclass, m.f8vrclass, constraints>,
@@ -1879,29 +2009,29 @@ multiclass PseudoVEXT_VF8 {
// lowest-numbered part of the source register group".
// With LMUL<=1 the source and dest occupy a single register so any overlap
// is in the lowest-numbered part.
-multiclass VPseudoBinaryM_VV {
- foreach m = MxList.m in
+multiclass VPseudoBinaryM_VV<list<LMULInfo> mxlist = MxList> {
+ foreach m = mxlist in
defm _VV : VPseudoBinaryM<VR, m.vrclass, m.vrclass, m,
!if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>;
}
multiclass VPseudoBinaryM_VX {
- foreach m = MxList.m in
+ foreach m = MxList in
defm "_VX" :
VPseudoBinaryM<VR, m.vrclass, GPR, m,
!if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>;
}
multiclass VPseudoBinaryM_VF {
- foreach m = MxList.m in
- foreach f = FPList.fpinfo in
+ foreach f = FPList in
+ foreach m = f.MxList in
defm "_V" # f.FX :
VPseudoBinaryM<VR, m.vrclass, f.fprclass, m,
!if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>;
}
multiclass VPseudoBinaryM_VI {
- foreach m = MxList.m in
+ foreach m = MxList in
defm _VI : VPseudoBinaryM<VR, m.vrclass, simm5, m,
!if(!ge(m.octuple, 16), "@earlyclobber $rd", "")>;
}
@@ -1995,14 +2125,14 @@ multiclass VPseudoVDIV_VV_VX {
}
multiclass VPseudoVFMUL_VV_VF {
- defm "" : VPseudoBinaryV_VV,
+ defm "" : VPseudoBinaryFV_VV,
Sched<[WriteVFMulV, ReadVFMulV, ReadVFMulV, ReadVMask]>;
defm "" : VPseudoBinaryV_VF,
Sched<[WriteVFMulF, ReadVFMulV, ReadVFMulF, ReadVMask]>;
}
multiclass VPseudoVFDIV_VV_VF {
- defm "" : VPseudoBinaryV_VV,
+ defm "" : VPseudoBinaryFV_VV,
Sched<[WriteVFDivV, ReadVFDivV, ReadVFDivV, ReadVMask]>;
defm "" : VPseudoBinaryV_VF,
Sched<[WriteVFDivF, ReadVFDivV, ReadVFDivF, ReadVMask]>;
@@ -2021,21 +2151,21 @@ multiclass VPseudoVALU_VV_VX {
}
multiclass VPseudoVSGNJ_VV_VF {
- defm "" : VPseudoBinaryV_VV,
+ defm "" : VPseudoBinaryFV_VV,
Sched<[WriteVFSgnjV, ReadVFSgnjV, ReadVFSgnjV, ReadVMask]>;
defm "" : VPseudoBinaryV_VF,
Sched<[WriteVFSgnjF, ReadVFSgnjV, ReadVFSgnjF, ReadVMask]>;
}
multiclass VPseudoVMAX_VV_VF {
- defm "" : VPseudoBinaryV_VV,
+ defm "" : VPseudoBinaryFV_VV,
Sched<[WriteVFCmpV, ReadVFCmpV, ReadVFCmpV, ReadVMask]>;
defm "" : VPseudoBinaryV_VF,
Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>;
}
multiclass VPseudoVALU_VV_VF {
- defm "" : VPseudoBinaryV_VV,
+ defm "" : VPseudoBinaryFV_VV,
Sched<[WriteVFALUV, ReadVFALUV, ReadVFALUV, ReadVMask]>;
defm "" : VPseudoBinaryV_VF,
Sched<[WriteVFALUF, ReadVFALUV, ReadVFALUF, ReadVMask]>;
@@ -2068,17 +2198,12 @@ multiclass VPseudoVWMUL_VV_VX {
}
multiclass VPseudoVWMUL_VV_VF {
- defm "" : VPseudoBinaryW_VV,
+ defm "" : VPseudoBinaryW_VV<MxListFW>,
Sched<[WriteVFWMulV, ReadVFWMulV, ReadVFWMulV, ReadVMask]>;
defm "" : VPseudoBinaryW_VF,
Sched<[WriteVFWMulF, ReadVFWMulV, ReadVFWMulF, ReadVMask]>;
}
-multiclass VPseudoBinaryW_VV_VF {
- defm "" : VPseudoBinaryW_VV;
- defm "" : VPseudoBinaryW_VF;
-}
-
multiclass VPseudoVWALU_WV_WX {
defm "" : VPseudoBinaryW_WV,
Sched<[WriteVIWALUV, ReadVIWALUV, ReadVIWALUV, ReadVMask]>;
@@ -2087,14 +2212,14 @@ multiclass VPseudoVWALU_WV_WX {
}
multiclass VPseudoVFWALU_VV_VF {
- defm "" : VPseudoBinaryW_VV,
+ defm "" : VPseudoBinaryW_VV<MxListFW>,
Sched<[WriteVFWALUV, ReadVFWALUV, ReadVFWALUV, ReadVMask]>;
defm "" : VPseudoBinaryW_VF,
Sched<[WriteVFWALUF, ReadVFWALUV, ReadVFWALUF, ReadVMask]>;
}
multiclass VPseudoVFWALU_WV_WF {
- defm "" : VPseudoBinaryW_WV,
+ defm "" : VPseudoBinaryW_WV<MxListFW>,
Sched<[WriteVFWALUV, ReadVFWALUV, ReadVFWALUV, ReadVMask]>;
defm "" : VPseudoBinaryW_WF,
Sched<[WriteVFWALUF, ReadVFWALUV, ReadVFWALUF, ReadVMask]>;
@@ -2107,6 +2232,13 @@ multiclass VPseudoVMRG_VM_XM_IM {
Sched<[WriteVIMergeX, ReadVIMergeV, ReadVIMergeX, ReadVMask]>;
defm "" : VPseudoBinaryV_IM,
Sched<[WriteVIMergeI, ReadVIMergeV, ReadVMask]>;
+ // Tied versions to allow codegen control over the tail elements
+ defm "" : VPseudoTiedBinaryV_VM,
+ Sched<[WriteVIMergeV, ReadVIMergeV, ReadVIMergeV, ReadVMask]>;
+ defm "" : VPseudoTiedBinaryV_XM,
+ Sched<[WriteVIMergeX, ReadVIMergeV, ReadVIMergeX, ReadVMask]>;
+ defm "" : VPseudoTiedBinaryV_IM,
+ Sched<[WriteVIMergeI, ReadVIMergeV, ReadVMask]>;
}
multiclass VPseudoVCALU_VM_XM_IM {
@@ -2199,56 +2331,57 @@ multiclass VPseudoTernaryWithPolicy<VReg RetClass,
}
}
-multiclass VPseudoTernaryV_VV_AAXA<string Constraint = ""> {
- foreach m = MxList.m in {
+multiclass VPseudoTernaryV_VV_AAXA<string Constraint = "",
+ list<LMULInfo> mxlist = MxList> {
+ foreach m = mxlist in {
defm _VV : VPseudoTernaryWithPolicy<m.vrclass, m.vrclass, m.vrclass, m,
Constraint, /*Commutable*/1>;
}
}
multiclass VPseudoTernaryV_VX<string Constraint = ""> {
- foreach m = MxList.m in
+ foreach m = MxList in
defm _VX : VPseudoTernary<m.vrclass, m.vrclass, GPR, m, Constraint>;
}
multiclass VPseudoTernaryV_VX_AAXA<string Constraint = ""> {
- foreach m = MxList.m in
+ foreach m = MxList in
defm "_VX" : VPseudoTernaryWithPolicy<m.vrclass, GPR, m.vrclass, m,
Constraint, /*Commutable*/1>;
}
multiclass VPseudoTernaryV_VF_AAXA<string Constraint = ""> {
- foreach m = MxList.m in
- foreach f = FPList.fpinfo in
+ foreach f = FPList in
+ foreach m = f.MxList in
defm "_V" # f.FX : VPseudoTernaryWithPolicy<m.vrclass, f.fprclass,
m.vrclass, m, Constraint,
/*Commutable*/1>;
}
-multiclass VPseudoTernaryW_VV {
+multiclass VPseudoTernaryW_VV<list<LMULInfo> mxlist = MxListW> {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxListW.m in
+ foreach m = mxlist in
defm _VV : VPseudoTernaryWithPolicy<m.wvrclass, m.vrclass, m.vrclass, m,
constraint>;
}
multiclass VPseudoTernaryW_VX {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxListW.m in
+ foreach m = MxListW in
defm "_VX" : VPseudoTernaryWithPolicy<m.wvrclass, GPR, m.vrclass, m,
constraint>;
}
multiclass VPseudoTernaryW_VF {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxListW.m in
- foreach f = FPListW.fpinfo in
+ foreach f = FPListW in
+ foreach m = f.MxList in
defm "_V" # f.FX : VPseudoTernaryWithPolicy<m.wvrclass, f.fprclass,
m.vrclass, m, constraint>;
}
multiclass VPseudoTernaryV_VI<Operand ImmType = simm5, string Constraint = ""> {
- foreach m = MxList.m in
+ foreach m = MxList in
defm _VI : VPseudoTernary<m.vrclass, m.vrclass, ImmType, m, Constraint>;
}
@@ -2260,7 +2393,7 @@ multiclass VPseudoVMAC_VV_VX_AAXA<string Constraint = ""> {
}
multiclass VPseudoVMAC_VV_VF_AAXA<string Constraint = ""> {
- defm "" : VPseudoTernaryV_VV_AAXA<Constraint>,
+ defm "" : VPseudoTernaryV_VV_AAXA<Constraint, MxListF>,
Sched<[WriteVFMulAddV, ReadVFMulAddV, ReadVFMulAddV, ReadVFMulAddV, ReadVMask]>;
defm "" : VPseudoTernaryV_VF_AAXA<Constraint>,
Sched<[WriteVFMulAddF, ReadVFMulAddV, ReadVFMulAddV, ReadVFMulAddF, ReadVMask]>;
@@ -2286,7 +2419,7 @@ multiclass VPseudoVWMAC_VX {
}
multiclass VPseudoVWMAC_VV_VF {
- defm "" : VPseudoTernaryW_VV,
+ defm "" : VPseudoTernaryW_VV<MxListFW>,
Sched<[WriteVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddV, ReadVMask]>;
defm "" : VPseudoTernaryW_VF,
Sched<[WriteVFWMulAddF, ReadVFWMulAddV, ReadVFWMulAddV, ReadVFWMulAddF, ReadVMask]>;
@@ -2309,7 +2442,7 @@ multiclass VPseudoVCMPM_VV_VX {
}
multiclass VPseudoVCMPM_VV_VF {
- defm "" : VPseudoBinaryM_VV,
+ defm "" : VPseudoBinaryM_VV<MxListF>,
Sched<[WriteVFCmpV, ReadVFCmpV, ReadVFCmpV, ReadVMask]>;
defm "" : VPseudoBinaryM_VF,
Sched<[WriteVFCmpF, ReadVFCmpV, ReadVFCmpF, ReadVMask]>;
@@ -2328,35 +2461,35 @@ multiclass VPseudoVCMPM_VX_VI {
}
multiclass VPseudoVRED_VS {
- foreach m = MxList.m in {
+ foreach m = MxList in {
defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>,
Sched<[WriteVIRedV, ReadVIRedV, ReadVIRedV, ReadVIRedV, ReadVMask]>;
}
}
multiclass VPseudoVWRED_VS {
- foreach m = MxList.m in {
+ foreach m = MxList in {
defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>,
Sched<[WriteVIWRedV, ReadVIWRedV, ReadVIWRedV, ReadVIWRedV, ReadVMask]>;
}
}
multiclass VPseudoVFRED_VS {
- foreach m = MxList.m in {
+ foreach m = MxListF in {
defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>,
Sched<[WriteVFRedV, ReadVFRedV, ReadVFRedV, ReadVFRedV, ReadVMask]>;
}
}
multiclass VPseudoVFREDO_VS {
- foreach m = MxList.m in {
+ foreach m = MxListF in {
defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>,
Sched<[WriteVFRedOV, ReadVFRedOV, ReadVFRedOV, ReadVFRedOV, ReadVMask]>;
}
}
multiclass VPseudoVFWRED_VS {
- foreach m = MxList.m in {
+ foreach m = MxListF in {
defm _VS : VPseudoTernary<V_M1.vrclass, m.vrclass, V_M1.vrclass, m>,
Sched<[WriteVFWRedV, ReadVFWRedV, ReadVFWRedV, ReadVFWRedV, ReadVMask]>;
}
@@ -2374,61 +2507,61 @@ multiclass VPseudoConversion<VReg RetClass,
}
multiclass VPseudoVCVTI_V {
- foreach m = MxList.m in
+ foreach m = MxListF in
defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>,
Sched<[WriteVFCvtFToIV, ReadVFCvtFToIV, ReadVMask]>;
}
multiclass VPseudoVCVTF_V {
- foreach m = MxList.m in
+ foreach m = MxListF in
defm _V : VPseudoConversion<m.vrclass, m.vrclass, m>,
Sched<[WriteVFCvtIToFV, ReadVFCvtIToFV, ReadVMask]>;
}
multiclass VPseudoConversionW_V {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxListW.m in
+ foreach m = MxListW in
defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>;
}
multiclass VPseudoVWCVTI_V {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxList.m[0-5] in
+ foreach m = MxListFW in
defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>,
Sched<[WriteVFWCvtFToIV, ReadVFWCvtFToIV, ReadVMask]>;
}
multiclass VPseudoVWCVTF_V {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxList.m[0-5] in
+ foreach m = MxListW in
defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>,
Sched<[WriteVFWCvtIToFV, ReadVFWCvtIToFV, ReadVMask]>;
}
multiclass VPseudoVWCVTD_V {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxList.m[0-5] in
+ foreach m = MxListFW in
defm _V : VPseudoConversion<m.wvrclass, m.vrclass, m, constraint>,
Sched<[WriteVFWCvtFToFV, ReadVFWCvtFToFV, ReadVMask]>;
}
multiclass VPseudoVNCVTI_W {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxList.m[0-5] in
+ foreach m = MxListW in
defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>,
Sched<[WriteVFNCvtFToIV, ReadVFNCvtFToIV, ReadVMask]>;
}
multiclass VPseudoVNCVTF_W {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxList.m[0-5] in
+ foreach m = MxListFW in
defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>,
Sched<[WriteVFNCvtIToFV, ReadVFNCvtIToFV, ReadVMask]>;
}
multiclass VPseudoVNCVTD_W {
defvar constraint = "@earlyclobber $rd";
- foreach m = MxListW.m in
+ foreach m = MxListFW in
defm _W : VPseudoConversion<m.vrclass, m.wvrclass, m, constraint>,
Sched<[WriteVFNCvtFToFV, ReadVFNCvtFToFV, ReadVMask]>;
}
@@ -3702,6 +3835,28 @@ multiclass VPatConversionVF_WF <string intrinsic, string instruction> {
}
}
+multiclass VPatCompare_VI<string intrinsic, string inst,
+ ImmLeaf ImmType = simm5_plus1> {
+ foreach vti = AllIntegerVectors in {
+ defvar Intr = !cast<Intrinsic>(intrinsic);
+ defvar Pseudo = !cast<Instruction>(inst#"_VI_"#vti.LMul.MX);
+ def : Pat<(vti.Mask (Intr (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar ImmType:$rs2),
+ VLOpFrag)),
+ (Pseudo vti.RegClass:$rs1, (DecImm ImmType:$rs2),
+ GPR:$vl, vti.Log2SEW)>;
+ defvar IntrMask = !cast<Intrinsic>(intrinsic # "_mask");
+ defvar PseudoMask = !cast<Instruction>(inst#"_VI_"#vti.LMul.MX#"_MASK");
+ def : Pat<(vti.Mask (IntrMask (vti.Mask VR:$merge),
+ (vti.Vector vti.RegClass:$rs1),
+ (vti.Scalar ImmType:$rs2),
+ (vti.Mask V0),
+ VLOpFrag)),
+ (PseudoMask VR:$merge, vti.RegClass:$rs1, (DecImm ImmType:$rs2),
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+ }
+}
+
//===----------------------------------------------------------------------===//
// Pseudo instructions
//===----------------------------------------------------------------------===//
@@ -3741,7 +3896,7 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 1 in {
def PseudoVRELOAD_M8 : VPseudo<VL8RE8_V, V_M8, (outs VRM8:$rs1), (ins GPR:$rs2)>;
}
-foreach lmul = MxList.m in {
+foreach lmul = MxList in {
foreach nf = NFSet<lmul>.L in {
defvar vreg = SegRegClass<lmul, nf>.RC;
let hasSideEffects = 0, mayLoad = 0, mayStore = 1, isCodeGenOnly = 1 in {
@@ -3765,9 +3920,9 @@ let hasSideEffects = 1, mayLoad = 0, mayStore = 0, Defs = [VL, VTYPE] in {
// when we aren't using one of the special X0 encodings. Otherwise it could
// accidentally be made X0 by MachineIR optimizations. To satisfy the
// verifier, we also need a GPRX0 instruction for the special encodings.
-def PseudoVSETVLI : Pseudo<(outs GPR:$rd), (ins GPRNoX0:$rs1, VTypeIOp:$vtypei), []>;
-def PseudoVSETVLIX0 : Pseudo<(outs GPR:$rd), (ins GPRX0:$rs1, VTypeIOp:$vtypei), []>;
-def PseudoVSETIVLI : Pseudo<(outs GPR:$rd), (ins uimm5:$rs1, VTypeIOp:$vtypei), []>;
+def PseudoVSETVLI : Pseudo<(outs GPR:$rd), (ins GPRNoX0:$rs1, VTypeIOp11:$vtypei), []>;
+def PseudoVSETVLIX0 : Pseudo<(outs GPR:$rd), (ins GPRX0:$rs1, VTypeIOp11:$vtypei), []>;
+def PseudoVSETIVLI : Pseudo<(outs GPR:$rd), (ins uimm5:$rs1, VTypeIOp10:$vtypei), []>;
}
//===----------------------------------------------------------------------===//
@@ -4304,7 +4459,7 @@ defm PseudoVID : VPseudoVID_V;
let Predicates = [HasVInstructions] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- foreach m = MxList.m in {
+ foreach m = MxList in {
let VLMul = m.value in {
let HasSEWOp = 1, BaseInstr = VMV_X_S in
def PseudoVMV_X_S # "_" # m.MX:
@@ -4330,8 +4485,8 @@ let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
let Predicates = [HasVInstructionsAnyF] in {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- foreach m = MxList.m in {
- foreach f = FPList.fpinfo in {
+ foreach f = FPList in {
+ foreach m = f.MxList in {
let VLMul = m.value in {
let HasSEWOp = 1, BaseInstr = VFMV_F_S in
def "PseudoVFMV_" # f.FX # "_S_" # m.MX :
@@ -4452,6 +4607,30 @@ defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsrl", "PseudoVSRL", AllIntegerVectors,
defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsra", "PseudoVSRA", AllIntegerVectors,
uimm5>;
+foreach vti = AllIntegerVectors in {
+ // Emit shift by 1 as an add since it might be faster.
+ def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector vti.RegClass:$rs1),
+ (XLenVT 1), VLOpFrag)),
+ (!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX) vti.RegClass:$rs1,
+ vti.RegClass:$rs1,
+ GPR:$vl,
+ vti.Log2SEW)>;
+ def : Pat<(vti.Vector (int_riscv_vsll_mask (vti.Vector vti.RegClass:$merge),
+ (vti.Vector vti.RegClass:$rs1),
+ (XLenVT 1),
+ (vti.Mask V0),
+ VLOpFrag,
+ (XLenVT timm:$policy))),
+ (!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX#"_MASK")
+ vti.RegClass:$merge,
+ vti.RegClass:$rs1,
+ vti.RegClass:$rs1,
+ (vti.Mask V0),
+ GPR:$vl,
+ vti.Log2SEW,
+ (XLenVT timm:$policy))>;
+}
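These patterns rely on the identity x << 1 == x + x for every element value, so a vector shift left by one can be emitted as vadd.vv with both sources equal where an add is expected to be at least as cheap. A trivial scalar check of the identity:

    #include <cassert>
    #include <cstdint>

    int main() {
      // x << 1 and x + x agree for every element value, including when the top
      // bit is set, which is why a shift by 1 can become vadd.vv v, v, v.
      for (uint32_t X : {0u, 1u, 0x7fffffffu, 0x80000000u, 0xffffffffu})
        assert(static_cast<uint32_t>(X << 1) == static_cast<uint32_t>(X + X));
      return 0;
    }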
+
//===----------------------------------------------------------------------===//
// 12.7. Vector Narrowing Integer Right Shift Instructions
//===----------------------------------------------------------------------===//
@@ -4481,129 +4660,11 @@ defm : VPatBinarySwappedM_VV<"int_riscv_vmsge", "PseudoVMSLE", AllIntegerVectors
// Match vmslt(u).vx intrinsics to vmsle(u).vi if the scalar is -15 to 16. This
// avoids the user needing to know that there is no vmslt(u).vi instruction.
// Similar for vmsge(u).vx intrinsics using vmsgt(u).vi.
-foreach vti = AllIntegerVectors in {
- def : Pat<(vti.Mask (int_riscv_vmslt (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar simm5_plus1:$rs2),
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMSLE_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
- (DecImm simm5_plus1:$rs2),
- GPR:$vl,
- vti.Log2SEW)>;
- def : Pat<(vti.Mask (int_riscv_vmslt_mask (vti.Mask VR:$merge),
- (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar simm5_plus1:$rs2),
- (vti.Mask V0),
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMSLE_VI_"#vti.LMul.MX#"_MASK")
- VR:$merge,
- vti.RegClass:$rs1,
- (DecImm simm5_plus1:$rs2),
- (vti.Mask V0),
- GPR:$vl,
- vti.Log2SEW)>;
+defm : VPatCompare_VI<"int_riscv_vmslt", "PseudoVMSLE">;
+defm : VPatCompare_VI<"int_riscv_vmsltu", "PseudoVMSLEU", simm5_plus1_nonzero>;
- def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar simm5_plus1:$rs2),
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMSLEU_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
- (DecImm simm5_plus1:$rs2),
- GPR:$vl,
- vti.Log2SEW)>;
- def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask VR:$merge),
- (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar simm5_plus1:$rs2),
- (vti.Mask V0),
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMSLEU_VI_"#vti.LMul.MX#"_MASK")
- VR:$merge,
- vti.RegClass:$rs1,
- (DecImm simm5_plus1:$rs2),
- (vti.Mask V0),
- GPR:$vl,
- vti.Log2SEW)>;
-
- // Special cases to avoid matching vmsltu.vi 0 (always false) to
- // vmsleu.vi -1 (always true). Instead match to vmsne.vv.
- def : Pat<(vti.Mask (int_riscv_vmsltu (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar 0), VLOpFrag)),
- (!cast<Instruction>("PseudoVMSNE_VV_"#vti.LMul.MX) vti.RegClass:$rs1,
- vti.RegClass:$rs1,
- GPR:$vl,
- vti.Log2SEW)>;
- def : Pat<(vti.Mask (int_riscv_vmsltu_mask (vti.Mask VR:$merge),
- (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar 0),
- (vti.Mask V0),
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMSNE_VV_"#vti.LMul.MX#"_MASK")
- VR:$merge,
- vti.RegClass:$rs1,
- vti.RegClass:$rs1,
- (vti.Mask V0),
- GPR:$vl,
- vti.Log2SEW)>;
-
- def : Pat<(vti.Mask (int_riscv_vmsge (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar simm5_plus1:$rs2),
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMSGT_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
- (DecImm simm5_plus1:$rs2),
- GPR:$vl,
- vti.Log2SEW)>;
- def : Pat<(vti.Mask (int_riscv_vmsge_mask (vti.Mask VR:$merge),
- (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar simm5_plus1:$rs2),
- (vti.Mask V0),
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMSGT_VI_"#vti.LMul.MX#"_MASK")
- VR:$merge,
- vti.RegClass:$rs1,
- (DecImm simm5_plus1:$rs2),
- (vti.Mask V0),
- GPR:$vl,
- vti.Log2SEW)>;
-
- def : Pat<(vti.Mask (int_riscv_vmsgeu (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar simm5_plus1:$rs2),
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMSGTU_VI_"#vti.LMul.MX) vti.RegClass:$rs1,
- (DecImm simm5_plus1:$rs2),
- GPR:$vl,
- vti.Log2SEW)>;
- def : Pat<(vti.Mask (int_riscv_vmsgeu_mask (vti.Mask VR:$merge),
- (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar simm5_plus1:$rs2),
- (vti.Mask V0),
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMSGTU_VI_"#vti.LMul.MX#"_MASK")
- VR:$merge,
- vti.RegClass:$rs1,
- (DecImm simm5_plus1:$rs2),
- (vti.Mask V0),
- GPR:$vl,
- vti.Log2SEW)>;
-
- // Special cases to avoid matching vmsgeu.vi 0 (always true) to
- // vmsgtu.vi -1 (always false). Instead match to vmsne.vv.
- def : Pat<(vti.Mask (int_riscv_vmsgeu (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar 0), VLOpFrag)),
- (!cast<Instruction>("PseudoVMSEQ_VV_"#vti.LMul.MX) vti.RegClass:$rs1,
- vti.RegClass:$rs1,
- GPR:$vl,
- vti.Log2SEW)>;
- def : Pat<(vti.Mask (int_riscv_vmsgeu_mask (vti.Mask VR:$merge),
- (vti.Vector vti.RegClass:$rs1),
- (vti.Scalar 0),
- (vti.Mask V0),
- VLOpFrag)),
- (!cast<Instruction>("PseudoVMSEQ_VV_"#vti.LMul.MX#"_MASK")
- VR:$merge,
- vti.RegClass:$rs1,
- vti.RegClass:$rs1,
- (vti.Mask V0),
- GPR:$vl,
- vti.Log2SEW)>;
-}
+defm : VPatCompare_VI<"int_riscv_vmsge", "PseudoVMSGT">;
+defm : VPatCompare_VI<"int_riscv_vmsgeu", "PseudoVMSGTU", simm5_plus1_nonzero>;
//===----------------------------------------------------------------------===//
// 12.9. Vector Integer Min/Max Instructions
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
index 711ad4335ece..e452a84a9a6f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td
@@ -363,6 +363,91 @@ multiclass VPatNConvertFP2ISDNode_V<SDNode vop, string instruction_name> {
}
}
+multiclass VPatWidenBinarySDNode_VV_VX_WV_WX<SDNode op, PatFrags extop, string instruction_name> {
+ foreach vti = AllWidenableIntVectors in {
+ def : Pat<(op (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs2))),
+ (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs1)))),
+ (!cast<Instruction>(instruction_name#"_VV_"#vti.Vti.LMul.MX)
+ vti.Vti.RegClass:$rs2, vti.Vti.RegClass:$rs1,
+ vti.Vti.AVL, vti.Vti.Log2SEW)>;
+ def : Pat<(op (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs2))),
+ (vti.Wti.Vector (extop (vti.Vti.Vector (SplatPat GPR:$rs1))))),
+ (!cast<Instruction>(instruction_name#"_VX_"#vti.Vti.LMul.MX)
+ vti.Vti.RegClass:$rs2, GPR:$rs1,
+ vti.Vti.AVL, vti.Vti.Log2SEW)>;
+ def : Pat<(op (vti.Wti.Vector vti.Wti.RegClass:$rs2),
+ (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs1)))),
+ (!cast<Instruction>(instruction_name#"_WV_"#vti.Vti.LMul.MX)
+ vti.Wti.RegClass:$rs2, vti.Vti.RegClass:$rs1,
+ vti.Vti.AVL, vti.Vti.Log2SEW)>;
+ def : Pat<(op (vti.Wti.Vector vti.Wti.RegClass:$rs2),
+ (vti.Wti.Vector (extop (vti.Vti.Vector (SplatPat GPR:$rs1))))),
+ (!cast<Instruction>(instruction_name#"_WX_"#vti.Vti.LMul.MX)
+ vti.Wti.RegClass:$rs2, GPR:$rs1,
+ vti.Vti.AVL, vti.Vti.Log2SEW)>;
+ }
+}
+
+multiclass VPatWidenMulAddSDNode_VV<PatFrags extop1, PatFrags extop2, string instruction_name> {
+ foreach vti = AllWidenableIntVectors in {
+ def : Pat<
+ (add (vti.Wti.Vector vti.Wti.RegClass:$rd),
+ (mul_oneuse (vti.Wti.Vector (extop1 (vti.Vti.Vector vti.Vti.RegClass:$rs1))),
+ (vti.Wti.Vector (extop2 (vti.Vti.Vector vti.Vti.RegClass:$rs2))))),
+ (!cast<Instruction>(instruction_name#"_VV_"#vti.Vti.LMul.MX)
+ vti.Wti.RegClass:$rd, vti.Vti.RegClass:$rs1, vti.Vti.RegClass:$rs2,
+ vti.Vti.AVL, vti.Vti.Log2SEW, TAIL_AGNOSTIC
+ )>;
+ }
+}
+multiclass VPatWidenMulAddSDNode_VX<PatFrags extop1, PatFrags extop2, string instruction_name> {
+ foreach vti = AllWidenableIntVectors in {
+ def : Pat<
+ (add (vti.Wti.Vector vti.Wti.RegClass:$rd),
+ (mul_oneuse (vti.Wti.Vector (extop1 (vti.Vti.Vector (SplatPat GPR:$rs1)))),
+ (vti.Wti.Vector (extop2 (vti.Vti.Vector vti.Vti.RegClass:$rs2))))),
+ (!cast<Instruction>(instruction_name#"_VX_"#vti.Vti.LMul.MX)
+ vti.Wti.RegClass:$rd, GPR:$rs1, vti.Vti.RegClass:$rs2,
+ vti.Vti.AVL, vti.Vti.Log2SEW, TAIL_AGNOSTIC
+ )>;
+ }
+}
+
+multiclass VPatWidenBinaryFPSDNode_VV_VF<SDNode op, string instruction_name> {
+ foreach vti = AllWidenableFloatVectors in {
+ def : Pat<(op (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector vti.Vti.RegClass:$rs2))),
+ (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector vti.Vti.RegClass:$rs1)))),
+ (!cast<Instruction>(instruction_name#"_VV_"#vti.Vti.LMul.MX)
+ vti.Vti.RegClass:$rs2, vti.Vti.RegClass:$rs1,
+ vti.Vti.AVL, vti.Vti.Log2SEW)>;
+ def : Pat<(op (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector vti.Vti.RegClass:$rs2))),
+ (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector (SplatPat vti.Vti.ScalarRegClass:$rs1))))),
+ (!cast<Instruction>(instruction_name#"_V"#vti.Vti.ScalarSuffix#"_"#vti.Vti.LMul.MX)
+ vti.Vti.RegClass:$rs2, vti.Vti.ScalarRegClass:$rs1,
+ vti.Vti.AVL, vti.Vti.Log2SEW)>;
+ }
+}
+
+multiclass VPatWidenBinaryFPSDNode_WV_WF<SDNode op, string instruction_name> {
+ foreach vti = AllWidenableFloatVectors in {
+ def : Pat<(op (vti.Wti.Vector vti.Wti.RegClass:$rs2),
+ (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector vti.Vti.RegClass:$rs1)))),
+ (!cast<Instruction>(instruction_name#"_WV_"#vti.Vti.LMul.MX)
+ vti.Wti.RegClass:$rs2, vti.Vti.RegClass:$rs1,
+ vti.Vti.AVL, vti.Vti.Log2SEW)>;
+ def : Pat<(op (vti.Wti.Vector vti.Wti.RegClass:$rs2),
+ (vti.Wti.Vector (fpext_oneuse (vti.Vti.Vector (SplatPat vti.Vti.ScalarRegClass:$rs1))))),
+ (!cast<Instruction>(instruction_name#"_W"#vti.Vti.ScalarSuffix#"_"#vti.Vti.LMul.MX)
+ vti.Wti.RegClass:$rs2, vti.Vti.ScalarRegClass:$rs1,
+ vti.Vti.AVL, vti.Vti.Log2SEW)>;
+ }
+}
+
+multiclass VPatWidenBinaryFPSDNode_VV_VF_WV_WF<SDNode op, string instruction_name> {
+ defm : VPatWidenBinaryFPSDNode_VV_VF<op, instruction_name>;
+ defm : VPatWidenBinaryFPSDNode_WV_WF<op, instruction_name>;
+}
+
//===----------------------------------------------------------------------===//
// Patterns.
//===----------------------------------------------------------------------===//
@@ -399,6 +484,15 @@ foreach vti = AllIntegerVectors in {
vti.RegClass:$rs1, simm5:$rs2, vti.AVL, vti.Log2SEW)>;
}
+// 12.2. Vector Widening Integer Add and Subtract
+defm : VPatWidenBinarySDNode_VV_VX_WV_WX<add, sext_oneuse, "PseudoVWADD">;
+defm : VPatWidenBinarySDNode_VV_VX_WV_WX<add, zext_oneuse, "PseudoVWADDU">;
+defm : VPatWidenBinarySDNode_VV_VX_WV_WX<add, anyext_oneuse, "PseudoVWADDU">;
+
+defm : VPatWidenBinarySDNode_VV_VX_WV_WX<sub, sext_oneuse, "PseudoVWSUB">;
+defm : VPatWidenBinarySDNode_VV_VX_WV_WX<sub, zext_oneuse, "PseudoVWSUBU">;
+defm : VPatWidenBinarySDNode_VV_VX_WV_WX<sub, anyext_oneuse, "PseudoVWSUBU">;
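These defms fold an explicit extend feeding an add or sub into the widening instruction: the .vv/.vx forms extend both narrow sources to 2*SEW, the .wv/.wx forms take the first operand already widened, and any-extended sources are treated as the unsigned flavour. A scalar sketch of the shapes, assuming 8-bit narrow and 16-bit wide elements:

    #include <cassert>
    #include <cstdint>

    int main() {
      int8_t  a = -3, b = 100;
      uint8_t ua = 200, ub = 100;

      // vwadd.vv  : wide = sext(a) + sext(b)
      int16_t w1 = int16_t(a) + int16_t(b);
      // vwaddu.vv : wide = zext(a) + zext(b)   (also used for anyext sources)
      uint16_t w2 = uint16_t(ua) + uint16_t(ub);
      // vwadd.wv  : wide = wide_src + sext(b)
      int16_t wideSrc = 1000;
      int16_t w3 = wideSrc + int16_t(b);
      // vwsubu.wx : wide = wide_src - zext(scalar)
      uint16_t w4 = uint16_t(5000) - uint16_t(ub);

      assert(w1 == 97 && w2 == 300 && w3 == 1100 && w4 == 4900);
      return 0;
    }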
+
// 12.3. Vector Integer Extension
defm : VPatExtendSDNode_V<[zext, anyext], "PseudoVZEXT", "VF2",
AllFractionableVF2IntVectors>;
@@ -513,6 +607,15 @@ foreach vti = AllIntegerVectors in {
vti.AVL, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
+// 12.14 Vector Widening Integer Multiply-Add Instructions
+defm : VPatWidenMulAddSDNode_VV<sext_oneuse, sext_oneuse, "PseudoVWMACC">;
+defm : VPatWidenMulAddSDNode_VX<sext_oneuse, sext_oneuse, "PseudoVWMACC">;
+defm : VPatWidenMulAddSDNode_VV<zext_oneuse, zext_oneuse, "PseudoVWMACCU">;
+defm : VPatWidenMulAddSDNode_VX<zext_oneuse, zext_oneuse, "PseudoVWMACCU">;
+defm : VPatWidenMulAddSDNode_VV<sext_oneuse, zext_oneuse, "PseudoVWMACCSU">;
+defm : VPatWidenMulAddSDNode_VX<sext_oneuse, zext_oneuse, "PseudoVWMACCSU">;
+defm : VPatWidenMulAddSDNode_VX<zext_oneuse, sext_oneuse, "PseudoVWMACCUS">;
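The extend pairing selects among the four widening multiply-add forms: sext with sext gives vwmacc, zext with zext vwmaccu, sext with zext vwmaccsu, and zext with sext vwmaccus, which as above only has a .vx pattern. A minimal scalar rendition, assuming 8-bit sources accumulating into 16 bits:

    #include <cassert>
    #include <cstdint>

    // acc += ext1(a) * ext2(b), where the extension kinds pick the instruction:
    //   sext,sext -> vwmacc     zext,zext -> vwmaccu
    //   sext,zext -> vwmaccsu   zext,sext -> vwmaccus (vx form only, as above)
    int main() {
      int8_t a = -4; uint8_t b = 200;
      int16_t accSU = 10;
      accSU += int16_t(a) * int16_t(uint16_t(b)); // vwmaccsu: -4 * 200 = -800
      assert(accSU == -790);

      uint8_t ua = 50, ub = 200;
      uint16_t accU = 1;
      accU += uint16_t(ua) * uint16_t(ub);        // vwmaccu: 50 * 200 = 10000
      assert(accU == 10001);
      return 0;
    }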
+
// 12.15. Vector Integer Merge Instructions
foreach vti = AllIntegerVectors in {
def : Pat<(vti.Vector (vselect (vti.Mask V0), vti.RegClass:$rs1,
@@ -582,11 +685,18 @@ defm : VPatBinaryFPSDNode_VV_VF<fadd, "PseudoVFADD">;
defm : VPatBinaryFPSDNode_VV_VF<fsub, "PseudoVFSUB">;
defm : VPatBinaryFPSDNode_R_VF<fsub, "PseudoVFRSUB">;
+// 14.3. Vector Widening Floating-Point Add/Subtract Instructions
+defm : VPatWidenBinaryFPSDNode_VV_VF_WV_WF<fadd, "PseudoVFWADD">;
+defm : VPatWidenBinaryFPSDNode_VV_VF_WV_WF<fsub, "PseudoVFWSUB">;
+
// 14.4. Vector Single-Width Floating-Point Multiply/Divide Instructions
defm : VPatBinaryFPSDNode_VV_VF<fmul, "PseudoVFMUL">;
defm : VPatBinaryFPSDNode_VV_VF<fdiv, "PseudoVFDIV">;
defm : VPatBinaryFPSDNode_R_VF<fdiv, "PseudoVFRDIV">;
+// 14.5. Vector Widening Floating-Point Multiply Instructions
+defm : VPatWidenBinaryFPSDNode_VV_VF<fmul, "PseudoVFWMUL">;
+
// 14.6 Vector Single-Width Floating-Point Fused Multiply-Add Instructions.
foreach fvti = AllFloatVectors in {
// NOTE: We choose VFMADD because it has the most commuting freedom. So it
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 73b97e1c3675..964f0fa54512 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -177,14 +177,13 @@ def riscv_vrgatherei16_vv_vl : SDNode<"RISCVISD::VRGATHEREI16_VV_VL",
SDTCisSameNumEltsAs<0, 3>,
SDTCisVT<4, XLenVT>]>>;
-def riscv_vselect_vl : SDNode<"RISCVISD::VSELECT_VL",
- SDTypeProfile<1, 4, [SDTCisVec<0>,
- SDTCisVec<1>,
- SDTCisSameNumEltsAs<0, 1>,
- SDTCVecEltisVT<1, i1>,
- SDTCisSameAs<0, 2>,
- SDTCisSameAs<2, 3>,
- SDTCisVT<4, XLenVT>]>>;
+def SDT_RISCVSelect_VL : SDTypeProfile<1, 4, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, SDTCVecEltisVT<1, i1>,
+ SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisVT<4, XLenVT>
+]>;
+
+def riscv_vselect_vl : SDNode<"RISCVISD::VSELECT_VL", SDT_RISCVSelect_VL>;
+def riscv_vp_merge_vl : SDNode<"RISCVISD::VP_MERGE_VL", SDT_RISCVSelect_VL>;
def SDT_RISCVMaskBinOp_VL : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>,
@@ -216,19 +215,20 @@ def riscv_zext_vl : SDNode<"RISCVISD::VZEXT_VL", SDT_RISCVVEXTEND_VL>;
def riscv_trunc_vector_vl : SDNode<"RISCVISD::TRUNCATE_VECTOR_VL",
SDTypeProfile<1, 3, [SDTCisVec<0>,
- SDTCisVec<1>,
+ SDTCisSameNumEltsAs<0, 1>,
SDTCisSameNumEltsAs<0, 2>,
SDTCVecEltisVT<2, i1>,
SDTCisVT<3, XLenVT>]>>;
-def SDT_RISCVVWMUL_VL : SDTypeProfile<1, 4, [SDTCisVec<0>,
- SDTCisSameNumEltsAs<0, 1>,
- SDTCisSameAs<1, 2>,
- SDTCisSameNumEltsAs<1, 3>,
- SDTCVecEltisVT<3, i1>,
- SDTCisVT<4, XLenVT>]>;
-def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>;
-def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWMUL_VL, [SDNPCommutative]>;
+def SDT_RISCVVWBinOp_VL : SDTypeProfile<1, 4, [SDTCisVec<0>,
+ SDTCisSameNumEltsAs<0, 1>,
+ SDTCisSameAs<1, 2>,
+ SDTCisSameNumEltsAs<1, 3>,
+ SDTCVecEltisVT<3, i1>,
+ SDTCisVT<4, XLenVT>]>;
+def riscv_vwmul_vl : SDNode<"RISCVISD::VWMUL_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
+def riscv_vwmulu_vl : SDNode<"RISCVISD::VWMULU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
+def riscv_vwaddu_vl : SDNode<"RISCVISD::VWADDU_VL", SDT_RISCVVWBinOp_VL, [SDNPCommutative]>;
def SDTRVVVecReduce : SDTypeProfile<1, 5, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisSameAs<0, 3>,
@@ -363,37 +363,47 @@ multiclass VPatBinaryWVL_VV_VX<SDNode vop, string instruction_name> {
}
}
-class VPatBinaryVL_VF<SDNode vop,
- string instruction_name,
- ValueType result_type,
- ValueType vop_type,
- ValueType mask_type,
- int sew,
- LMULInfo vlmul,
- VReg vop_reg_class,
- RegisterClass scalar_reg_class> :
- Pat<(result_type (vop (vop_type vop_reg_class:$rs1),
- (vop_type (SplatFPOp scalar_reg_class:$rs2)),
- (mask_type true_mask),
- VLOpFrag)),
+multiclass VPatBinaryVL_VF<SDNode vop,
+ string instruction_name,
+ ValueType result_type,
+ ValueType vop_type,
+ ValueType mask_type,
+ int sew,
+ LMULInfo vlmul,
+ VReg vop_reg_class,
+ RegisterClass scalar_reg_class> {
+ def : Pat<(result_type (vop (vop_type vop_reg_class:$rs1),
+ (vop_type (SplatFPOp scalar_reg_class:$rs2)),
+ (mask_type true_mask),
+ VLOpFrag)),
(!cast<Instruction>(instruction_name#"_"#vlmul.MX)
vop_reg_class:$rs1,
scalar_reg_class:$rs2,
GPR:$vl, sew)>;
+ def : Pat<(result_type (vop (vop_type vop_reg_class:$rs1),
+ (vop_type (SplatFPOp scalar_reg_class:$rs2)),
+ (mask_type V0),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_"#vlmul.MX#"_MASK")
+ (result_type (IMPLICIT_DEF)),
+ vop_reg_class:$rs1,
+ scalar_reg_class:$rs2,
+ (mask_type V0), GPR:$vl, sew, TAIL_AGNOSTIC)>;
+}
multiclass VPatBinaryFPVL_VV_VF<SDNode vop, string instruction_name> {
foreach vti = AllFloatVectors in {
defm : VPatBinaryVL_VV<vop, instruction_name,
vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
vti.LMul, vti.RegClass>;
- def : VPatBinaryVL_VF<vop, instruction_name#"_V"#vti.ScalarSuffix,
- vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
- vti.LMul, vti.RegClass, vti.ScalarRegClass>;
+ defm : VPatBinaryVL_VF<vop, instruction_name#"_V"#vti.ScalarSuffix,
+ vti.Vector, vti.Vector, vti.Mask, vti.Log2SEW,
+ vti.LMul, vti.RegClass, vti.ScalarRegClass>;
}
}
multiclass VPatBinaryFPVL_R_VF<SDNode vop, string instruction_name> {
- foreach fvti = AllFloatVectors in
+ foreach fvti = AllFloatVectors in {
def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2),
fvti.RegClass:$rs1,
(fvti.Mask true_mask),
@@ -401,6 +411,15 @@ multiclass VPatBinaryFPVL_R_VF<SDNode vop, string instruction_name> {
(!cast<Instruction>(instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX)
fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
GPR:$vl, fvti.Log2SEW)>;
+ def : Pat<(fvti.Vector (vop (SplatFPOp fvti.ScalarRegClass:$rs2),
+ fvti.RegClass:$rs1,
+ (fvti.Mask V0),
+ VLOpFrag)),
+ (!cast<Instruction>(instruction_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK")
+ (fvti.Vector (IMPLICIT_DEF)),
+ fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2,
+ (fvti.Mask V0), GPR:$vl, fvti.Log2SEW, TAIL_AGNOSTIC)>;
+ }
}
multiclass VPatIntegerSetCCVL_VV<VTypeInfo vti, string instruction_name,
@@ -602,6 +621,47 @@ multiclass VPatReductionVL<SDNode vop, string instruction_name, bit is_float> {
}
}
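+
+// The following multiclasses match a wide-element shift, truncated back to the
+// narrow element type, where the shift amount is a sign- or zero-extended
+// narrow value; these select the narrowing shift instructions (vnsrl/vnsra).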
+multiclass VPatBinarySDNodeExt_V_WV<SDNode op, PatFrags extop, string instruction_name> {
+ foreach vti = AllWidenableIntVectors in {
+ def : Pat<
+ (vti.Vti.Vector
+ (riscv_trunc_vector_vl
+ (op (vti.Wti.Vector vti.Wti.RegClass:$rs2),
+ (vti.Wti.Vector (extop (vti.Vti.Vector vti.Vti.RegClass:$rs1)))),
+ (riscv_vmset_vl VLMax),
+ VLMax)),
+ (!cast<Instruction>(instruction_name#"_WV_"#vti.Vti.LMul.MX)
+ vti.Wti.RegClass:$rs2, vti.Vti.RegClass:$rs1,
+ vti.Vti.AVL, vti.Vti.Log2SEW)>;
+ }
+}
+
+multiclass VPatBinarySDNodeExt_V_WX<SDNode op, PatFrags extop, string instruction_name> {
+ foreach vti = AllWidenableIntVectors in {
+ def : Pat<
+ (vti.Vti.Vector
+ (riscv_trunc_vector_vl
+ (op (vti.Wti.Vector vti.Wti.RegClass:$rs2),
+ (vti.Wti.Vector (extop (vti.Vti.Vector (SplatPat GPR:$rs1))))),
+ (riscv_vmset_vl VLMax),
+ VLMax)),
+ (!cast<Instruction>(instruction_name#"_WX_"#vti.Vti.LMul.MX)
+ vti.Wti.RegClass:$rs2, GPR:$rs1,
+ vti.Vti.AVL, vti.Vti.Log2SEW)>;
+ }
+}
+
+multiclass VPatBinarySDNode_V_WV<SDNode op, string instruction_name> {
+ defm : VPatBinarySDNodeExt_V_WV<op, sext_oneuse, instruction_name>;
+ defm : VPatBinarySDNodeExt_V_WV<op, zext_oneuse, instruction_name>;
+}
+
+multiclass VPatBinarySDNode_V_WX<SDNode op, string instruction_name> {
+ defm : VPatBinarySDNodeExt_V_WX<op, sext_oneuse, instruction_name>;
+ defm : VPatBinarySDNodeExt_V_WX<op, zext_oneuse, instruction_name>;
+}
+
//===----------------------------------------------------------------------===//
// Patterns.
//===----------------------------------------------------------------------===//
@@ -661,6 +721,9 @@ foreach vti = AllIntegerVectors in {
(vti.Mask V0), GPR:$vl, vti.Log2SEW, TAIL_AGNOSTIC)>;
}
+// 12.2. Vector Widening Integer Add/Subtract
+defm : VPatBinaryWVL_VV_VX<riscv_vwaddu_vl, "PseudoVWADDU">;
+
// 12.3. Vector Integer Extension
defm : VPatExtendSDNode_V_VL<riscv_zext_vl, "PseudoVZEXT", "VF2",
AllFractionableVF2IntVectors>;
@@ -696,14 +759,19 @@ foreach vti = AllIntegerVectors in {
}
// 12.7. Vector Narrowing Integer Right Shift Instructions
+defm : VPatBinarySDNode_V_WV<srl, "PseudoVNSRL">;
+defm : VPatBinarySDNode_V_WX<srl, "PseudoVNSRL">;
+defm : VPatBinarySDNode_V_WV<sra, "PseudoVNSRA">;
+defm : VPatBinarySDNode_V_WX<sra, "PseudoVNSRA">;
+
foreach vtiTowti = AllWidenableIntVectors in {
defvar vti = vtiTowti.Vti;
defvar wti = vtiTowti.Wti;
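  // The truncate itself is selected below as a narrowing logical shift right
  // by zero (vnsrl.wx with x0 as the shift-amount register).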
def : Pat<(vti.Vector (riscv_trunc_vector_vl (wti.Vector wti.RegClass:$rs1),
(vti.Mask true_mask),
VLOpFrag)),
- (!cast<Instruction>("PseudoVNSRL_WI_"#vti.LMul.MX)
- wti.RegClass:$rs1, 0, GPR:$vl, vti.Log2SEW)>;
+ (!cast<Instruction>("PseudoVNSRL_WX_"#vti.LMul.MX)
+ wti.RegClass:$rs1, X0, GPR:$vl, vti.Log2SEW)>;
def : Pat<(vti.Vector
(riscv_trunc_vector_vl
@@ -760,6 +828,8 @@ foreach vti = AllIntegerVectors in {
defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSNE", SETNE, SETNE>;
defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSLE", SETLE, SETGE>;
defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSLEU", SETULE, SETUGE>;
+ defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSGT", SETGT, SETLT>;
+ defm : VPatIntegerSetCCVL_VI_Swappable<vti, "PseudoVMSGTU", SETUGT, SETULT>;
defm : VPatIntegerSetCCVL_VIPlus1<vti, "PseudoVMSLE", SETLT,
SplatPat_simm5_plus1>;
@@ -905,6 +975,30 @@ foreach vti = AllIntegerVectors in {
VLOpFrag)),
(!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX)
vti.RegClass:$rs2, simm5:$rs1, (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+
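+  // For vp.merge, lanes at or past VL take the false operand ($rs2); the
+  // tail-undisturbed (_TU) pseudo with $rs2 as the passthru operand preserves
+  // this (assuming the usual vp.merge semantics for lanes beyond EVL).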
+ def : Pat<(vti.Vector (riscv_vp_merge_vl (vti.Mask V0),
+ vti.RegClass:$rs1,
+ vti.RegClass:$rs2,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VVM_"#vti.LMul.MX#"_TU")
+ vti.RegClass:$rs2, vti.RegClass:$rs2, vti.RegClass:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_vp_merge_vl (vti.Mask V0),
+ (SplatPat XLenVT:$rs1),
+ vti.RegClass:$rs2,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VXM_"#vti.LMul.MX#"_TU")
+ vti.RegClass:$rs2, vti.RegClass:$rs2, GPR:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
+
+ def : Pat<(vti.Vector (riscv_vp_merge_vl (vti.Mask V0),
+ (SplatPat_simm5 simm5:$rs1),
+ vti.RegClass:$rs2,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VIM_"#vti.LMul.MX#"_TU")
+ vti.RegClass:$rs2, vti.RegClass:$rs2, simm5:$rs1,
+ (vti.Mask V0), GPR:$vl, vti.Log2SEW)>;
}
// 12.16. Vector Integer Move Instructions
@@ -1152,6 +1246,31 @@ foreach fvti = AllFloatVectors in {
(!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX)
fvti.RegClass:$rs2, 0, (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
+ def : Pat<(fvti.Vector (riscv_vp_merge_vl (fvti.Mask V0),
+ fvti.RegClass:$rs1,
+ fvti.RegClass:$rs2,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VVM_"#fvti.LMul.MX#"_TU")
+ fvti.RegClass:$rs2, fvti.RegClass:$rs2, fvti.RegClass:$rs1, (fvti.Mask V0),
+ GPR:$vl, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (riscv_vp_merge_vl (fvti.Mask V0),
+ (SplatFPOp fvti.ScalarRegClass:$rs1),
+ fvti.RegClass:$rs2,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVFMERGE_V"#fvti.ScalarSuffix#"M_"#fvti.LMul.MX#"_TU")
+ fvti.RegClass:$rs2, fvti.RegClass:$rs2,
+ (fvti.Scalar fvti.ScalarRegClass:$rs1),
+ (fvti.Mask V0), GPR:$vl, fvti.Log2SEW)>;
+
+ def : Pat<(fvti.Vector (riscv_vp_merge_vl (fvti.Mask V0),
+ (SplatFPOp (fvti.Scalar fpimm0)),
+ fvti.RegClass:$rs2,
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMERGE_VIM_"#fvti.LMul.MX#"_TU")
+ fvti.RegClass:$rs2, fvti.RegClass:$rs2, 0, (fvti.Mask V0),
+ GPR:$vl, fvti.Log2SEW)>;
+
// 14.16. Vector Floating-Point Move Instruction
// If we're splatting fpimm0, use vmv.v.x vd, x0.
def : Pat<(fvti.Vector (riscv_vfmv_v_f_vl
@@ -1368,6 +1487,11 @@ let Predicates = [HasVInstructionsAnyF] in {
// 17.2. Floating-Point Scalar Move Instructions
foreach vti = AllFloatVectors in {
def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge),
+ (vti.Scalar (fpimm0)),
+ VLOpFrag)),
+ (!cast<Instruction>("PseudoVMV_S_X_"#vti.LMul.MX)
+ vti.RegClass:$merge, X0, GPR:$vl, vti.Log2SEW)>;
+ def : Pat<(vti.Vector (riscv_vfmv_s_f_vl (vti.Vector vti.RegClass:$merge),
vti.ScalarRegClass:$rs1,
VLOpFrag)),
(!cast<Instruction>("PseudoVFMV_S_"#vti.ScalarSuffix#"_"#vti.LMul.MX)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
index 7eb8ae7d4193..db3f5851879a 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZb.td
@@ -12,14 +12,22 @@
// Zbb - 1.0
// Zbc - 1.0
// Zbs - 1.0
-// Zbe - 0.93
-// Zbf - 0.93
-// Zbm - 0.93
-// Zbp - 0.93
-// Zbr - 0.93
-// Zbt - 0.93
-// This version is still experimental as the Bitmanip extensions haven't been
-// ratified yet.
+// Zbe - 0.93 *experimental
+// Zbf - 0.93 *experimental
+// Zbm - 0.93 *experimental
+// Zbp - 0.93 *experimental
+// Zbr - 0.93 *experimental
+// Zbt - 0.93 *experimental
+//
+// The experimental extensions appeared in an earlier draft of the Bitmanip
+// extensions. They are not ratified and subject to change.
+//
+// This file also describes RISC-V instructions from the Zbk* extensions in
+// Cryptography Extensions Volume I: Scalar & Entropy Source Instructions,
+// versions:
+// Zbkb - 1.0
+// Zbkc - 1.0
+// Zbkx - 1.0
//
//===----------------------------------------------------------------------===//
@@ -43,6 +51,8 @@ def riscv_shfl : SDNode<"RISCVISD::SHFL", SDTIntBinOp>;
def riscv_shflw : SDNode<"RISCVISD::SHFLW", SDT_RISCVIntBinOpW>;
def riscv_unshfl : SDNode<"RISCVISD::UNSHFL", SDTIntBinOp>;
def riscv_unshflw: SDNode<"RISCVISD::UNSHFLW",SDT_RISCVIntBinOpW>;
+def riscv_bfp : SDNode<"RISCVISD::BFP", SDTIntBinOp>;
+def riscv_bfpw : SDNode<"RISCVISD::BFPW", SDT_RISCVIntBinOpW>;
def riscv_bcompress : SDNode<"RISCVISD::BCOMPRESS", SDTIntBinOp>;
def riscv_bcompressw : SDNode<"RISCVISD::BCOMPRESSW", SDT_RISCVIntBinOpW>;
def riscv_bdecompress : SDNode<"RISCVISD::BDECOMPRESS", SDTIntBinOp>;
@@ -309,14 +319,14 @@ class RVBTernaryImm5<bits<2> funct2, bits<3> funct3, RISCVOpcode opcode,
// Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZbbOrZbp] in {
+let Predicates = [HasStdExtZbbOrZbpOrZbkb] in {
def ANDN : ALU_rr<0b0100000, 0b111, "andn">,
Sched<[WriteIALU, ReadIALU, ReadIALU]>;
def ORN : ALU_rr<0b0100000, 0b110, "orn">,
Sched<[WriteIALU, ReadIALU, ReadIALU]>;
def XNOR : ALU_rr<0b0100000, 0b100, "xnor">,
Sched<[WriteIALU, ReadIALU, ReadIALU]>;
-} // Predicates = [HasStdExtZbbOrZbp]
+} // Predicates = [HasStdExtZbbOrZbpOrZbkb]
let Predicates = [HasStdExtZba] in {
def SH1ADD : ALU_rr<0b0010000, 0b010, "sh1add">,
@@ -327,18 +337,22 @@ def SH3ADD : ALU_rr<0b0010000, 0b110, "sh3add">,
Sched<[WriteSHXADD, ReadSHXADD, ReadSHXADD]>;
} // Predicates = [HasStdExtZba]
-let Predicates = [HasStdExtZbbOrZbp] in {
+let Predicates = [HasStdExtZbbOrZbpOrZbkb] in {
def ROL : ALU_rr<0b0110000, 0b001, "rol">,
Sched<[WriteRotateReg, ReadRotateReg, ReadRotateReg]>;
def ROR : ALU_rr<0b0110000, 0b101, "ror">,
Sched<[WriteRotateReg, ReadRotateReg, ReadRotateReg]>;
-} // Predicates = [HasStdExtZbbOrZbp]
+} // Predicates = [HasStdExtZbbOrZbpOrZbkb]
let Predicates = [HasStdExtZbs] in {
-def BCLR : ALU_rr<0b0100100, 0b001, "bclr">, Sched<[]>;
-def BSET : ALU_rr<0b0010100, 0b001, "bset">, Sched<[]>;
-def BINV : ALU_rr<0b0110100, 0b001, "binv">, Sched<[]>;
-def BEXT : ALU_rr<0b0100100, 0b101, "bext">, Sched<[]>;
+def BCLR : ALU_rr<0b0100100, 0b001, "bclr">,
+ Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>;
+def BSET : ALU_rr<0b0010100, 0b001, "bset">,
+ Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>;
+def BINV : ALU_rr<0b0110100, 0b001, "binv">,
+ Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>;
+def BEXT : ALU_rr<0b0100100, 0b101, "bext">,
+ Sched<[WriteSingleBit, ReadSingleBit, ReadSingleBit]>;
} // Predicates = [HasStdExtZbs]
let Predicates = [HasStdExtZbp] in {
@@ -346,21 +360,28 @@ def GORC : ALU_rr<0b0010100, 0b101, "gorc">, Sched<[]>;
def GREV : ALU_rr<0b0110100, 0b101, "grev">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
+let Predicates = [HasStdExtZbpOrZbkx] in {
+def XPERMN : ALU_rr<0b0010100, 0b010, "xperm4">, Sched<[]>;
+def XPERMB : ALU_rr<0b0010100, 0b100, "xperm8">, Sched<[]>;
+} // Predicates = [HasStdExtZbpOrZbkx]
+
let Predicates = [HasStdExtZbp] in {
-def XPERMN : ALU_rr<0b0010100, 0b010, "xperm.n">, Sched<[]>;
-def XPERMB : ALU_rr<0b0010100, 0b100, "xperm.b">, Sched<[]>;
def XPERMH : ALU_rr<0b0010100, 0b110, "xperm.h">, Sched<[]>;
} // Predicates = [HasStdExtZbp]
-let Predicates = [HasStdExtZbbOrZbp] in
+let Predicates = [HasStdExtZbbOrZbpOrZbkb] in
def RORI : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">,
Sched<[WriteRotateImm, ReadRotateImm]>;
let Predicates = [HasStdExtZbs] in {
-def BCLRI : RVBShift_ri<0b01001, 0b001, OPC_OP_IMM, "bclri">, Sched<[]>;
-def BSETI : RVBShift_ri<0b00101, 0b001, OPC_OP_IMM, "bseti">, Sched<[]>;
-def BINVI : RVBShift_ri<0b01101, 0b001, OPC_OP_IMM, "binvi">, Sched<[]>;
-def BEXTI : RVBShift_ri<0b01001, 0b101, OPC_OP_IMM, "bexti">, Sched<[]>;
+def BCLRI : RVBShift_ri<0b01001, 0b001, OPC_OP_IMM, "bclri">,
+ Sched<[WriteSingleBitImm, ReadSingleBitImm]>;
+def BSETI : RVBShift_ri<0b00101, 0b001, OPC_OP_IMM, "bseti">,
+ Sched<[WriteSingleBitImm, ReadSingleBitImm]>;
+def BINVI : RVBShift_ri<0b01101, 0b001, OPC_OP_IMM, "binvi">,
+ Sched<[WriteSingleBitImm, ReadSingleBitImm]>;
+def BEXTI : RVBShift_ri<0b01001, 0b101, OPC_OP_IMM, "bexti">,
+ Sched<[WriteSingleBitImm, ReadSingleBitImm]>;
} // Predicates = [HasStdExtZbs]
let Predicates = [HasStdExtZbp] in {
@@ -428,11 +449,17 @@ def CRC32CD : RVBUnary<0b0110000, 0b11011, 0b001, OPC_OP_IMM, "crc32c.d">,
Sched<[]>;
let Predicates = [HasStdExtZbc] in {
-def CLMUL : ALU_rr<0b0000101, 0b001, "clmul">, Sched<[]>;
-def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr">, Sched<[]>;
-def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh">, Sched<[]>;
+def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr">,
+ Sched<[WriteCLMUL, ReadCLMUL, ReadCLMUL]>;
} // Predicates = [HasStdExtZbc]
+let Predicates = [HasStdExtZbcOrZbkc] in {
+def CLMUL : ALU_rr<0b0000101, 0b001, "clmul">,
+ Sched<[WriteCLMUL, ReadCLMUL, ReadCLMUL]>;
+def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh">,
+ Sched<[WriteCLMUL, ReadCLMUL, ReadCLMUL]>;
+} // Predicates = [HasStdExtZbcOrZbkc]
+
let Predicates = [HasStdExtZbb] in {
def MIN : ALU_rr<0b0000101, 0b100, "min">,
Sched<[WriteIALU, ReadIALU, ReadIALU]>;
@@ -456,11 +483,13 @@ def BDECOMPRESS : ALU_rr<0b0100100, 0b110, "bdecompress">, Sched<[]>;
def BCOMPRESS : ALU_rr<0b0000100, 0b110, "bcompress">, Sched<[]>;
} // Predicates = [HasStdExtZbe]
-let Predicates = [HasStdExtZbp] in {
+let Predicates = [HasStdExtZbpOrZbkb] in {
def PACK : ALU_rr<0b0000100, 0b100, "pack">, Sched<[]>;
-def PACKU : ALU_rr<0b0100100, 0b100, "packu">, Sched<[]>;
def PACKH : ALU_rr<0b0000100, 0b111, "packh">, Sched<[]>;
-} // Predicates = [HasStdExtZbp]
+} // Predicates = [HasStdExtZbpOrZbkb]
+
+let Predicates = [HasStdExtZbp] in
+def PACKU : ALU_rr<0b0100100, 0b100, "packu">, Sched<[]>;
let Predicates = [HasStdExtZbm, IsRV64] in {
def BMATOR : ALU_rr<0b0000100, 0b011, "bmator">, Sched<[]>;
@@ -468,7 +497,8 @@ def BMATXOR : ALU_rr<0b0100100, 0b011, "bmatxor">, Sched<[]>;
} // Predicates = [HasStdExtZbm, IsRV64]
let Predicates = [HasStdExtZbf] in
-def BFP : ALU_rr<0b0100100, 0b111, "bfp">, Sched<[]>;
+def BFP : ALU_rr<0b0100100, 0b111, "bfp">,
+ Sched<[WriteBFP, ReadBFP, ReadBFP]>;
let Predicates = [HasStdExtZbp] in {
def SHFLI : RVBShfl_ri<0b0000100, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>;
@@ -488,7 +518,7 @@ def SH3ADDUW : ALUW_rr<0b0010000, 0b110, "sh3add.uw">,
Sched<[WriteSHXADD32, ReadSHXADD32, ReadSHXADD32]>;
} // Predicates = [HasStdExtZbb, IsRV64]
-let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
+let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in {
def ROLW : ALUW_rr<0b0110000, 0b001, "rolw">,
Sched<[WriteRotateReg32, ReadRotateReg32, ReadRotateReg32]>;
def RORW : ALUW_rr<0b0110000, 0b101, "rorw">,
@@ -504,7 +534,7 @@ let Predicates = [HasStdExtZbp, IsRV64] in {
def XPERMW : ALU_rr<0b0010100, 0b000, "xperm.w">, Sched<[]>;
} // Predicates = [HasStdExtZbp, IsRV64]
-let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
+let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in
def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">,
Sched<[WriteRotateImm32, ReadRotateImm32]>;
@@ -543,13 +573,15 @@ def BDECOMPRESSW : ALUW_rr<0b0100100, 0b110, "bdecompressw">, Sched<[]>;
def BCOMPRESSW : ALUW_rr<0b0000100, 0b110, "bcompressw">, Sched<[]>;
} // Predicates = [HasStdExtZbe, IsRV64]
-let Predicates = [HasStdExtZbp, IsRV64] in {
+let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in
def PACKW : ALUW_rr<0b0000100, 0b100, "packw">, Sched<[]>;
+
+let Predicates = [HasStdExtZbp, IsRV64] in
def PACKUW : ALUW_rr<0b0100100, 0b100, "packuw">, Sched<[]>;
-} // Predicates = [HasStdExtZbp, IsRV64]
let Predicates = [HasStdExtZbf, IsRV64] in
-def BFPW : ALUW_rr<0b0100100, 0b111, "bfpw">, Sched<[]>;
+def BFPW : ALUW_rr<0b0100100, 0b111, "bfpw">,
+ Sched<[WriteBFP32, ReadBFP32, ReadBFP32]>;
let Predicates = [HasStdExtZbbOrZbp, IsRV32] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
@@ -576,30 +608,30 @@ def ZEXTH_RV64 : RVInstR<0b0000100, 0b100, OPC_OP_32, (outs GPR:$rd),
// causes diagnostics to suggest that Zbp rather than Zbb is required for rev8
// or gorci. Since Zbb is closer to being finalized than Zbp this will be
// misleading to users.
-let Predicates = [HasStdExtZbbOrZbp, IsRV32] in {
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def REV8_RV32 : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1),
- "rev8", "$rd, $rs1">, Sched<[WriteREV8, ReadREV8]> {
- let imm12 = { 0b01101, 0b0011000 };
-}
-} // Predicates = [HasStdExtZbbOrZbp, IsRV32]
+let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV32] in {
+def REV8_RV32 : RVBUnary<0b0110100, 0b11000, 0b101, OPC_OP_IMM, "rev8">,
+ Sched<[WriteREV8, ReadREV8]>;
+} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV32]
-let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def REV8_RV64 : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1),
- "rev8", "$rd, $rs1">, Sched<[WriteREV8, ReadREV8]> {
- let imm12 = { 0b01101, 0b0111000 };
-}
-} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
+let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in {
+def REV8_RV64 : RVBUnary<0b0110101, 0b11000, 0b101, OPC_OP_IMM, "rev8">,
+ Sched<[WriteREV8, ReadREV8]>;
+} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64]
let Predicates = [HasStdExtZbbOrZbp] in {
-let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def ORCB : RVInstI<0b101, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1),
- "orc.b", "$rd, $rs1">, Sched<[WriteORCB, ReadORCB]> {
- let imm12 = { 0b00101, 0b0000111 };
-}
+def ORCB : RVBUnary<0b0010100, 0b00111, 0b101, OPC_OP_IMM, "orc.b">,
+ Sched<[WriteORCB, ReadORCB]>;
} // Predicates = [HasStdExtZbbOrZbp]
+let Predicates = [HasStdExtZbpOrZbkb] in
+def BREV8 : RVBUnary<0b0110100, 0b00111, 0b101, OPC_OP_IMM, "brev8">;
+
+let Predicates = [HasStdExtZbpOrZbkb, IsRV32] in {
+def ZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b001, OPC_OP_IMM, "zip">;
+def UNZIP_RV32 : RVBUnary<0b0000100, 0b01111, 0b101, OPC_OP_IMM, "unzip">;
+} // Predicates = [HasStdExtZbpOrZbkb, IsRV32]
+
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -614,11 +646,11 @@ def : InstAlias<"rev2.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00010)>;
def : InstAlias<"rev.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00011)>;
def : InstAlias<"rev4.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00100)>;
def : InstAlias<"rev2.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00110)>;
-def : InstAlias<"rev.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00111)>;
def : InstAlias<"rev8.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01000)>;
def : InstAlias<"rev4.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01100)>;
def : InstAlias<"rev2.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01110)>;
def : InstAlias<"rev.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01111)>;
+def : InstAlias<"rev.b $rd, $rs", (BREV8 GPR:$rd, GPR:$rs)>;
def : InstAlias<"zip.n $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0001)>;
def : InstAlias<"unzip.n $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0001)>;
@@ -658,8 +690,7 @@ def : InstAlias<"zip4 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1100)>;
def : InstAlias<"unzip4 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1100)>;
def : InstAlias<"zip2 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1110)>;
def : InstAlias<"unzip2 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1110)>;
-def : InstAlias<"zip $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1111)>;
-def : InstAlias<"unzip $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1111)>;
+// zip and unzip are considered instructions rather than aliases.
def : InstAlias<"orc16 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b10000)>;
def : InstAlias<"orc8 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11000)>;
@@ -741,6 +772,13 @@ def : InstAlias<"gorcw $rd, $rs1, $shamt",
(GORCIW GPR:$rd, GPR:$rs1, uimm5:$shamt), 0>;
} // Predicates = [HasStdExtZbp, IsRV64]
+// Zbp is unratified and would likely adopt the already ratified Zbkx names.
+// Thus current Zbp instructions are defined as aliases for Zbkx instructions.
+let Predicates = [HasStdExtZbp] in {
+ def : InstAlias<"xperm.b $rd, $rs1, $rs2", (XPERMB GPR:$rd, GPR:$rs1, GPR:$rs2)>;
+ def : InstAlias<"xperm.n $rd, $rs1, $rs2", (XPERMN GPR:$rd, GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbp]
+
let Predicates = [HasStdExtZbs] in {
def : InstAlias<"bset $rd, $rs1, $shamt",
(BSETI GPR:$rd, GPR:$rs1, uimmlog2xlen:$shamt), 0>;
@@ -756,16 +794,16 @@ def : InstAlias<"bext $rd, $rs1, $shamt",
// Codegen patterns
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtZbbOrZbp] in {
+let Predicates = [HasStdExtZbbOrZbpOrZbkb] in {
def : Pat<(and GPR:$rs1, (not GPR:$rs2)), (ANDN GPR:$rs1, GPR:$rs2)>;
def : Pat<(or GPR:$rs1, (not GPR:$rs2)), (ORN GPR:$rs1, GPR:$rs2)>;
def : Pat<(xor GPR:$rs1, (not GPR:$rs2)), (XNOR GPR:$rs1, GPR:$rs2)>;
-} // Predicates = [HasStdExtZbbOrZbp]
+} // Predicates = [HasStdExtZbbOrZbpOrZbkb]
-let Predicates = [HasStdExtZbbOrZbp] in {
+let Predicates = [HasStdExtZbbOrZbpOrZbkb] in {
def : PatGprGpr<rotl, ROL>;
def : PatGprGpr<rotr, ROR>;
-} // Predicates = [HasStdExtZbbOrZbp]
+} // Predicates = [HasStdExtZbbOrZbpOrZbkb]
let Predicates = [HasStdExtZbs] in {
def : Pat<(and (not (shiftop<shl> 1, GPR:$rs2)), GPR:$rs1),
@@ -816,7 +854,7 @@ def : Pat<(and GPR:$r, BCLRIANDIMask:$i),
// There's no encoding for roli in the 'B' extension as it can be
// implemented with rori by negating the immediate.
-let Predicates = [HasStdExtZbbOrZbp] in {
+let Predicates = [HasStdExtZbbOrZbpOrZbkb] in {
def : PatGprImm<rotr, RORI, uimmlog2xlen>;
def : Pat<(rotl GPR:$rs1, uimmlog2xlen:$shamt),
(RORI GPR:$rs1, (ImmSubFromXLen uimmlog2xlen:$shamt))>;
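// e.g. on RV64 a rotate-left by 5 is selected as "rori rd, rs1, 59" (XLen - 5).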
@@ -834,19 +872,28 @@ def : PatGprGpr<riscv_unshfl, UNSHFL>;
def : PatGprGpr<int_riscv_xperm_n, XPERMN>;
def : PatGprGpr<int_riscv_xperm_b, XPERMB>;
def : PatGprGpr<int_riscv_xperm_h, XPERMH>;
-def : PatGprGpr<int_riscv_xperm_w, XPERMW>;
def : PatGprImm<riscv_shfl, SHFLI, shfl_uimm>;
def : PatGprImm<riscv_unshfl, UNSHFLI, shfl_uimm>;
def : PatGprImm<riscv_grev, GREVI, uimmlog2xlen>;
def : PatGprImm<riscv_gorc, GORCI, uimmlog2xlen>;
+
+// We treat brev8 as a separate instruction, so match it directly.
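+// (A grev with control value 7 reverses the bits within each byte, i.e. brev8.)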
+def : Pat<(riscv_grev GPR:$rs1, 7), (BREV8 GPR:$rs1)>;
} // Predicates = [HasStdExtZbp]
+let Predicates = [HasStdExtZbp, IsRV64] in
+def : PatGprGpr<int_riscv_xperm_w, XPERMW>;
+
let Predicates = [HasStdExtZbp, IsRV32] in {
def : Pat<(i32 (rotr (riscv_grev GPR:$rs1, 24), (i32 16))), (GREVI GPR:$rs1, 8)>;
def : Pat<(i32 (rotl (riscv_grev GPR:$rs1, 24), (i32 16))), (GREVI GPR:$rs1, 8)>;
// We treat rev8 as a separate instruction, so match it directly.
def : Pat<(i32 (riscv_grev GPR:$rs1, 24)), (REV8_RV32 GPR:$rs1)>;
+
+// We treat zip and unzip as separate instructions, so match them directly.
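+// (On RV32, a shfl/unshfl with control value 15 is the full zip/unzip.)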
+def : Pat<(i32 (riscv_shfl GPR:$rs1, 15)), (ZIP_RV32 GPR:$rs1)>;
+def : Pat<(i32 (riscv_unshfl GPR:$rs1, 15)), (UNZIP_RV32 GPR:$rs1)>;
} // Predicates = [HasStdExtZbp, IsRV32]
let Predicates = [HasStdExtZbp, IsRV64] in {
@@ -882,21 +929,16 @@ def : Pat<(select GPR:$rs2, GPR:$rs1, GPR:$rs3),
(CMOV GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
} // Predicates = [HasStdExtZbt]
-// fshl and fshr concatenate their operands in the same order. fsr and fsl
-// instruction use different orders. fshl will return its first operand for
-// shift of zero, fshr will return its second operand. fsl and fsr both return
-// $rs1 so the patterns need to have different operand orders.
let Predicates = [HasStdExtZbt] in {
def : Pat<(riscv_fsl GPR:$rs1, GPR:$rs3, GPR:$rs2),
(FSL GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(riscv_fsr GPR:$rs3, GPR:$rs1, GPR:$rs2),
+def : Pat<(riscv_fsr GPR:$rs1, GPR:$rs3, GPR:$rs2),
(FSR GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-
-def : Pat<(fshr GPR:$rs3, GPR:$rs1, uimmlog2xlen:$shamt),
+def : Pat<(riscv_fsr GPR:$rs1, GPR:$rs3, uimmlog2xlen:$shamt),
(FSRI GPR:$rs1, GPR:$rs3, uimmlog2xlen:$shamt)>;
-// We can use FSRI for fshl by immediate if we subtract the immediate from
+// We can use FSRI for FSL by immediate if we subtract the immediate from
// XLen and swap the operands.
-def : Pat<(fshl GPR:$rs3, GPR:$rs1, uimmlog2xlen:$shamt),
+def : Pat<(riscv_fsl GPR:$rs3, GPR:$rs1, uimmlog2xlen:$shamt),
(FSRI GPR:$rs1, GPR:$rs3, (ImmSubFromXLen uimmlog2xlen:$shamt))>;
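// e.g. on RV64 an FSL by immediate 8 is emitted as an FSRI with shift amount 56.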
} // Predicates = [HasStdExtZbt]
@@ -918,31 +960,38 @@ def : PatGprGpr<umin, MINU>;
def : PatGprGpr<umax, MAXU>;
} // Predicates = [HasStdExtZbb]
-let Predicates = [HasStdExtZbb, IsRV32] in {
+let Predicates = [HasStdExtZbbOrZbkb, IsRV32] in {
def : Pat<(i32 (bswap GPR:$rs1)), (REV8_RV32 GPR:$rs1)>;
-} // Predicates = [HasStdExtZbb, IsRV32]
+} // Predicates = [HasStdExtZbbOrZbkb, IsRV32]
-let Predicates = [HasStdExtZbb, IsRV64] in {
+let Predicates = [HasStdExtZbbOrZbkb, IsRV64] in {
def : Pat<(i64 (bswap GPR:$rs1)), (REV8_RV64 GPR:$rs1)>;
-} // Predicates = [HasStdExtZbb, IsRV64]
+} // Predicates = [HasStdExtZbbOrZbkb, IsRV64]
-let Predicates = [HasStdExtZbp, IsRV32] in {
+let Predicates = [HasStdExtZbpOrZbkb, IsRV32] in
def : Pat<(i32 (or (and GPR:$rs1, 0x0000FFFF), (shl GPR:$rs2, (i32 16)))),
(PACK GPR:$rs1, GPR:$rs2)>;
+
+let Predicates = [HasStdExtZbp, IsRV32] in
def : Pat<(i32 (or (and GPR:$rs2, 0xFFFF0000), (srl GPR:$rs1, (i32 16)))),
(PACKU GPR:$rs1, GPR:$rs2)>;
-}
-let Predicates = [HasStdExtZbp, IsRV64] in {
+let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in
def : Pat<(i64 (or (and GPR:$rs1, 0x00000000FFFFFFFF), (shl GPR:$rs2, (i64 32)))),
(PACK GPR:$rs1, GPR:$rs2)>;
+
+let Predicates = [HasStdExtZbp, IsRV64] in
def : Pat<(i64 (or (and GPR:$rs2, 0xFFFFFFFF00000000), (srl GPR:$rs1, (i64 32)))),
(PACKU GPR:$rs1, GPR:$rs2)>;
-}
-let Predicates = [HasStdExtZbp] in
+
+let Predicates = [HasStdExtZbpOrZbkb] in {
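+// Both forms below compute packh: the low byte of rs1 goes to bits 7:0, the
+// low byte of rs2 to bits 15:8, and the upper bits are zeroed.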
def : Pat<(or (and (shl GPR:$rs2, (XLenVT 8)), 0xFFFF),
(and GPR:$rs1, 0x00FF)),
(PACKH GPR:$rs1, GPR:$rs2)>;
+def : Pat<(or (shl (and GPR:$rs2, 0x00FF), (XLenVT 8)),
+ (and GPR:$rs1, 0x00FF)),
+ (PACKH GPR:$rs1, GPR:$rs2)>;
+} // Predicates = [HasStdExtZbpOrZbkb]
let Predicates = [HasStdExtZbbOrZbp, IsRV32] in
def : Pat<(i32 (and GPR:$rs, 0xFFFF)), (ZEXTH_RV32 GPR:$rs)>;
@@ -1045,13 +1094,13 @@ def : Pat<(i64 (add (and (shl GPR:$rs1, (i64 3)), 0x7FFFFFFFF), non_imm12:$rs2))
(SH3ADDUW GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtZba, IsRV64]
-let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
+let Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64] in {
def : PatGprGpr<riscv_rolw, ROLW>;
def : PatGprGpr<riscv_rorw, RORW>;
def : PatGprImm<riscv_rorw, RORIW, uimm5>;
def : Pat<(riscv_rolw GPR:$rs1, uimm5:$rs2),
(RORIW GPR:$rs1, (ImmSubFrom32 uimm5:$rs2))>;
-} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
+} // Predicates = [HasStdExtZbbOrZbpOrZbkb, IsRV64]
let Predicates = [HasStdExtZbp, IsRV64] in {
def : Pat<(riscv_rorw (riscv_grevw GPR:$rs1, 24), 16), (GREVIW GPR:$rs1, 8)>;
@@ -1067,10 +1116,12 @@ def : PatGprImm<riscv_gorcw, GORCIW, uimm5>;
let Predicates = [HasStdExtZbt, IsRV64] in {
def : Pat<(riscv_fslw GPR:$rs1, GPR:$rs3, GPR:$rs2),
(FSLW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(riscv_fsrw GPR:$rs3, GPR:$rs1, GPR:$rs2),
+def : Pat<(riscv_fsrw GPR:$rs1, GPR:$rs3, GPR:$rs2),
(FSRW GPR:$rs1, GPR:$rs2, GPR:$rs3)>;
-def : Pat<(riscv_fsrw GPR:$rs3, GPR:$rs1, uimm5:$shamt),
+def : Pat<(riscv_fsrw GPR:$rs1, GPR:$rs3, uimm5:$shamt),
(FSRIW GPR:$rs1, GPR:$rs3, uimm5:$shamt)>;
+// We can use FSRIW for FSLW by immediate if we subtract the immediate from
+// 32 and swap the operands.
def : Pat<(riscv_fslw GPR:$rs3, GPR:$rs1, uimm5:$shamt),
(FSRIW GPR:$rs1, GPR:$rs3, (ImmSubFrom32 uimm5:$shamt))>;
} // Predicates = [HasStdExtZbt, IsRV64]
@@ -1081,7 +1132,7 @@ def : PatGpr<riscv_ctzw, CTZW>;
def : Pat<(i64 (ctpop (i64 (zexti32 (i64 GPR:$rs1))))), (CPOPW GPR:$rs1)>;
} // Predicates = [HasStdExtZbb, IsRV64]
-let Predicates = [HasStdExtZbp, IsRV64] in {
+let Predicates = [HasStdExtZbpOrZbkb, IsRV64] in {
def : Pat<(i64 (sext_inreg (or (shl GPR:$rs2, (i64 16)),
(and GPR:$rs1, 0x000000000000FFFF)),
i32)),
@@ -1089,16 +1140,21 @@ def : Pat<(i64 (sext_inreg (or (shl GPR:$rs2, (i64 16)),
def : Pat<(i64 (or (sext_inreg (shl GPR:$rs2, (i64 16)), i32),
(and GPR:$rs1, 0x000000000000FFFF))),
(PACKW GPR:$rs1, GPR:$rs2)>;
+}
+
+let Predicates = [HasStdExtZbp, IsRV64] in
def : Pat<(i64 (or (and (assertsexti32 GPR:$rs2), 0xFFFFFFFFFFFF0000),
(srl (and GPR:$rs1, 0xFFFFFFFF), (i64 16)))),
(PACKUW GPR:$rs1, GPR:$rs2)>;
-} // Predicates = [HasStdExtZbp, IsRV64]
-let Predicates = [HasStdExtZbc] in {
+
+let Predicates = [HasStdExtZbcOrZbkc] in {
def : PatGprGpr<int_riscv_clmul, CLMUL>;
def : PatGprGpr<int_riscv_clmulh, CLMULH>;
+} // Predicates = [HasStdExtZbcOrZbkc]
+
+let Predicates = [HasStdExtZbc] in
def : PatGprGpr<int_riscv_clmulr, CLMULR>;
-} // Predicates = [HasStdExtZbc]
let Predicates = [HasStdExtZbe] in {
def : PatGprGpr<riscv_bcompress, BCOMPRESS>;
@@ -1123,3 +1179,23 @@ let Predicates = [HasStdExtZbr, IsRV64] in {
def : PatGpr<int_riscv_crc32_d, CRC32D>;
def : PatGpr<int_riscv_crc32c_d, CRC32CD>;
} // Predicates = [HasStdExtZbr, IsRV64]
+
+let Predicates = [HasStdExtZbf] in
+def : PatGprGpr<riscv_bfp, BFP>;
+
+let Predicates = [HasStdExtZbf, IsRV64] in
+def : PatGprGpr<riscv_bfpw, BFPW>;
+
+let Predicates = [HasStdExtZbkb] in {
+def : PatGpr<int_riscv_brev8, BREV8>;
+} // Predicates = [HasStdExtZbkb]
+
+let Predicates = [HasStdExtZbkb, IsRV32] in {
+def : PatGpr<int_riscv_zip, ZIP_RV32>;
+def : PatGpr<int_riscv_unzip, UNZIP_RV32>;
+} // Predicates = [HasStdExtZbkb, IsRV32]
+
+let Predicates = [HasStdExtZbkx] in {
+def : PatGprGpr<int_riscv_xperm4, XPERMN>;
+def : PatGprGpr<int_riscv_xperm8, XPERMB>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
index 663e44813899..dfd0c74ee26c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td
@@ -1,4 +1,4 @@
-//===-- RISCVInstrInfoFH.td - RISC-V 'FH' instructions -----*- tablegen -*-===//
+//===-- RISCVInstrInfoZfh.td - RISC-V 'Zfh' instructions ---*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -7,9 +7,7 @@
//===----------------------------------------------------------------------===//
//
// This file describes the RISC-V instructions from the standard 'Zfh'
-// half-precision floating-point extension, version 0.1.
-// This version is still experimental as the 'Zfh' extension hasn't been
-// ratified yet.
+// half-precision floating-point extension, version 1.0.
//
//===----------------------------------------------------------------------===//
@@ -32,20 +30,12 @@ def riscv_fmv_x_anyexth
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtZfhmin] in {
-let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
-def FLH : RVInstI<0b001, OPC_LOAD_FP, (outs FPR16:$rd),
- (ins GPR:$rs1, simm12:$imm12),
- "flh", "$rd, ${imm12}(${rs1})">,
- Sched<[WriteFLD16, ReadFMemBase]>;
+def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>;
// Operands for stores are in the order srcreg, base, offset rather than
// reflecting the order these fields are specified in the instruction
// encoding.
-let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
-def FSH : RVInstS<0b001, OPC_STORE_FP, (outs),
- (ins FPR16:$rs2, GPR:$rs1, simm12:$imm12),
- "fsh", "$rs2, ${imm12}(${rs1})">,
- Sched<[WriteFST16, ReadStoreData, ReadFMemBase]>;
+def FSH : FPStore_r<0b001, "fsh", FPR16, WriteFST16>;
} // Predicates = [HasStdExtZfhmin]
let Predicates = [HasStdExtZfh] in {
@@ -190,6 +180,10 @@ def : InstAlias<"fge.h $rd, $rs, $rt",
let Predicates = [HasStdExtZfhmin] in {
def PseudoFLH : PseudoFloatLoad<"flh", FPR16>;
def PseudoFSH : PseudoStore<"fsh", FPR16>;
+let usesCustomInserter = 1 in {
+def PseudoQuietFLE_H : PseudoQuietFCMP<FPR16>;
+def PseudoQuietFLT_H : PseudoQuietFCMP<FPR16>;
+}
} // Predicates = [HasStdExtZfhmin]
//===----------------------------------------------------------------------===//
@@ -207,6 +201,7 @@ let Predicates = [HasStdExtZfh] in {
/// Float constants
def : Pat<(f16 (fpimm0)), (FMV_H_X X0)>;
+def : Pat<(f16 (fpimmneg0)), (FSGNJN_H (FMV_H_X X0), (FMV_H_X X0))>;
/// Float conversion operations
@@ -254,13 +249,34 @@ def : PatFpr16Fpr16<fminnum, FMIN_H>;
def : PatFpr16Fpr16<fmaxnum, FMAX_H>;
/// Setcc
-
-def : PatFpr16Fpr16<seteq, FEQ_H>;
-def : PatFpr16Fpr16<setoeq, FEQ_H>;
-def : PatFpr16Fpr16<setlt, FLT_H>;
-def : PatFpr16Fpr16<setolt, FLT_H>;
-def : PatFpr16Fpr16<setle, FLE_H>;
-def : PatFpr16Fpr16<setole, FLE_H>;
+// FIXME: SETEQ/SETLT/SETLE imply nonans; can we pick better instructions for
+// the strict versions of those?
+
+// Match non-signaling FEQ_H
+def : PatSetCC<FPR16, any_fsetcc, SETEQ, FEQ_H>;
+def : PatSetCC<FPR16, any_fsetcc, SETOEQ, FEQ_H>;
+def : PatSetCC<FPR16, strict_fsetcc, SETLT, PseudoQuietFLT_H>;
+def : PatSetCC<FPR16, strict_fsetcc, SETOLT, PseudoQuietFLT_H>;
+def : PatSetCC<FPR16, strict_fsetcc, SETLE, PseudoQuietFLE_H>;
+def : PatSetCC<FPR16, strict_fsetcc, SETOLE, PseudoQuietFLE_H>;
+
+// Match signaling FEQ_H
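+// FEQ_H is a quiet comparison, while FLE_H signals an invalid operation on any
+// NaN input, so a signaling equal is built as (rs1 <= rs2) AND (rs2 <= rs1).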
+def : Pat<(strict_fsetccs FPR16:$rs1, FPR16:$rs2, SETEQ),
+ (AND (FLE_H $rs1, $rs2),
+ (FLE_H $rs2, $rs1))>;
+def : Pat<(strict_fsetccs FPR16:$rs1, FPR16:$rs2, SETOEQ),
+ (AND (FLE_H $rs1, $rs2),
+ (FLE_H $rs2, $rs1))>;
+// If both operands are the same, use a single FLE_H.
+def : Pat<(strict_fsetccs FPR16:$rs1, FPR16:$rs1, SETEQ),
+ (FLE_H $rs1, $rs1)>;
+def : Pat<(strict_fsetccs FPR16:$rs1, FPR16:$rs1, SETOEQ),
+ (FLE_H $rs1, $rs1)>;
+
+def : PatSetCC<FPR16, any_fsetccs, SETLT, FLT_H>;
+def : PatSetCC<FPR16, any_fsetccs, SETOLT, FLT_H>;
+def : PatSetCC<FPR16, any_fsetccs, SETLE, FLE_H>;
+def : PatSetCC<FPR16, any_fsetccs, SETOLE, FLE_H>;
def Select_FPR16_Using_CC_GPR : SelectCC_rrirr<FPR16, GPR>;
} // Predicates = [HasStdExtZfh]
@@ -291,14 +307,14 @@ def : Pat<(i32 (any_fp_to_sint FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>;
def : Pat<(i32 (any_fp_to_uint FPR16:$rs1)), (FCVT_WU_H $rs1, 0b001)>;
// Saturating float->[u]int32.
-def : Pat<(i32 (riscv_fcvt_x_rtz FPR16:$rs1)), (FCVT_W_H $rs1, 0b001)>;
-def : Pat<(i32 (riscv_fcvt_xu_rtz FPR16:$rs1)), (FCVT_WU_H $rs1, 0b001)>;
+def : Pat<(i32 (riscv_fcvt_x FPR16:$rs1, timm:$frm)), (FCVT_W_H $rs1, timm:$frm)>;
+def : Pat<(i32 (riscv_fcvt_xu FPR16:$rs1, timm:$frm)), (FCVT_WU_H $rs1, timm:$frm)>;
// half->int32 with current rounding mode.
-def : Pat<(i32 (lrint FPR16:$rs1)), (FCVT_W_H $rs1, 0b111)>;
+def : Pat<(i32 (any_lrint FPR16:$rs1)), (FCVT_W_H $rs1, 0b111)>;
// half->int32 rounded to nearest with ties rounded away from zero.
-def : Pat<(i32 (lround FPR16:$rs1)), (FCVT_W_H $rs1, 0b100)>;
+def : Pat<(i32 (any_lround FPR16:$rs1)), (FCVT_W_H $rs1, 0b100)>;
// [u]int->half. Match GCC and default to using dynamic rounding mode.
def : Pat<(any_sint_to_fp (i32 GPR:$rs1)), (FCVT_H_W $rs1, 0b111)>;
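// (A rounding-mode operand of 0b111 selects the dynamic rounding mode from frm.)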
@@ -309,24 +325,24 @@ let Predicates = [HasStdExtZfh, IsRV64] in {
// Use target specific isd nodes to help us remember the result is sign
// extended. Matching sext_inreg+fptoui/fptosi may cause the conversion to be
// duplicated if it has another user that didn't need the sign_extend.
-def : Pat<(riscv_any_fcvt_w_rtz_rv64 FPR16:$rs1), (FCVT_W_H $rs1, 0b001)>;
-def : Pat<(riscv_any_fcvt_wu_rtz_rv64 FPR16:$rs1), (FCVT_WU_H $rs1, 0b001)>;
+def : Pat<(riscv_any_fcvt_w_rv64 FPR16:$rs1, timm:$frm), (FCVT_W_H $rs1, timm:$frm)>;
+def : Pat<(riscv_any_fcvt_wu_rv64 FPR16:$rs1, timm:$frm), (FCVT_WU_H $rs1, timm:$frm)>;
// half->[u]int64. Round-to-zero must be used.
def : Pat<(i64 (any_fp_to_sint FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>;
def : Pat<(i64 (any_fp_to_uint FPR16:$rs1)), (FCVT_LU_H $rs1, 0b001)>;
// Saturating float->[u]int64.
-def : Pat<(i64 (riscv_fcvt_x_rtz FPR16:$rs1)), (FCVT_L_H $rs1, 0b001)>;
-def : Pat<(i64 (riscv_fcvt_xu_rtz FPR16:$rs1)), (FCVT_LU_H $rs1, 0b001)>;
+def : Pat<(i64 (riscv_fcvt_x FPR16:$rs1, timm:$frm)), (FCVT_L_H $rs1, timm:$frm)>;
+def : Pat<(i64 (riscv_fcvt_xu FPR16:$rs1, timm:$frm)), (FCVT_LU_H $rs1, timm:$frm)>;
// half->int64 with current rounding mode.
-def : Pat<(i64 (lrint FPR16:$rs1)), (FCVT_L_H $rs1, 0b111)>;
-def : Pat<(i64 (llrint FPR16:$rs1)), (FCVT_L_H $rs1, 0b111)>;
+def : Pat<(i64 (any_lrint FPR16:$rs1)), (FCVT_L_H $rs1, 0b111)>;
+def : Pat<(i64 (any_llrint FPR16:$rs1)), (FCVT_L_H $rs1, 0b111)>;
// half->int64 rounded to nearest with ties rounded away from zero.
-def : Pat<(i64 (lround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>;
-def : Pat<(i64 (llround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>;
+def : Pat<(i64 (any_lround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>;
+def : Pat<(i64 (any_llround FPR16:$rs1)), (FCVT_L_H $rs1, 0b100)>;
// [u]int->fp. Match GCC and default to using dynamic rounding mode.
def : Pat<(any_sint_to_fp (i64 (sexti32 (i64 GPR:$rs1)))), (FCVT_H_W $rs1, 0b111)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td
new file mode 100644
index 000000000000..4a41cddedc71
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZk.td
@@ -0,0 +1,203 @@
+//===- RISCVInstrInfoZk.td - RISC-V Scalar Crypto instructions -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the RISC-V instructions from the standard 'Zk',
+// Scalar Cryptography Instructions extension, version 1.0.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Operand and SDNode transformation definitions.
+//===----------------------------------------------------------------------===//
+
+def RnumArg : AsmOperandClass {
+ let Name = "RnumArg";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidRnumArg";
+}
+
+def rnum : Operand<i32>, TImmLeaf<i32, [{return (Imm >= 0 && Imm <= 10);}]> {
+ let ParserMatchClass = RnumArg;
+ let EncoderMethod = "getImmOpValue";
+ let DecoderMethod = "decodeUImmOperand<4>";
+ let OperandType = "OPERAND_RVKRNUM";
+ let OperandNamespace = "RISCVOp";
+}
+
+def byteselect : Operand<i8>, TImmLeaf<i8, [{return isUInt<2>(Imm);}]> {
+ let ParserMatchClass = UImmAsmOperand<2>;
+ let DecoderMethod = "decodeUImmOperand<2>";
+ let OperandType = "OPERAND_UIMM2";
+ let OperandNamespace = "RISCVOp";
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction class templates
+//===----------------------------------------------------------------------===//
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVKUnary<bits<12> imm12_in, bits<3> funct3, string opcodestr>
+ : RVInstI<funct3, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1),
+ opcodestr, "$rd, $rs1">{
+ let imm12 = imm12_in;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVKByteSelect<bits<5> funct5, string opcodestr>
+ : RVInstR<{0b00, funct5}, 0b000, OPC_OP, (outs GPR:$rd),
+ (ins GPR:$rs1, GPR:$rs2, byteselect:$bs),
+ opcodestr, "$rd, $rs1, $rs2, $bs">{
+ bits<2> bs;
+ let Inst{31-30} = bs;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVKUnary_rnum<bits<7> funct7, bits<3> funct3, string opcodestr>
+ : RVInstI<funct3, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1, rnum:$rnum),
+ opcodestr, "$rd, $rs1, $rnum">{
+ bits<4> rnum;
+ let Inst{31-25} = funct7;
+ let Inst{24} = 1;
+ let Inst{23-20} = rnum;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+let Predicates = [HasStdExtZknd, IsRV32] in {
+def AES32DSI : RVKByteSelect<0b10101, "aes32dsi">;
+def AES32DSMI : RVKByteSelect<0b10111, "aes32dsmi">;
+} // Predicates = [HasStdExtZknd, IsRV32]
+
+let Predicates = [HasStdExtZknd, IsRV64] in {
+def AES64DS : ALU_rr<0b0011101, 0b000, "aes64ds">;
+def AES64DSM : ALU_rr<0b0011111, 0b000, "aes64dsm">;
+
+def AES64IM : RVKUnary<0b001100000000, 0b001, "aes64im">;
+} // Predicates = [HasStdExtZknd, IsRV64]
+
+let Predicates = [HasStdExtZkndOrZkne, IsRV64] in {
+def AES64KS2 : ALU_rr<0b0111111, 0b000, "aes64ks2">;
+
+def AES64KS1I : RVKUnary_rnum<0b0011000, 0b001, "aes64ks1i">;
+} // Predicates = [HasStdExtZkndOrZkne, IsRV64]
+
+let Predicates = [HasStdExtZkne, IsRV32] in {
+def AES32ESI : RVKByteSelect<0b10001, "aes32esi">;
+def AES32ESMI : RVKByteSelect<0b10011, "aes32esmi">;
+} // Predicates = [HasStdExtZkne, IsRV32]
+
+let Predicates = [HasStdExtZkne, IsRV64] in {
+def AES64ES : ALU_rr<0b0011001, 0b000, "aes64es">;
+def AES64ESM : ALU_rr<0b0011011, 0b000, "aes64esm">;
+} // Predicates = [HasStdExtZkne, IsRV64]
+
+let Predicates = [HasStdExtZknh] in {
+def SHA256SIG0 : RVKUnary<0b000100000010, 0b001, "sha256sig0">;
+def SHA256SIG1 : RVKUnary<0b000100000011, 0b001, "sha256sig1">;
+def SHA256SUM0 : RVKUnary<0b000100000000, 0b001, "sha256sum0">;
+def SHA256SUM1 : RVKUnary<0b000100000001, 0b001, "sha256sum1">;
+} // Predicates = [HasStdExtZknh]
+
+let Predicates = [HasStdExtZknh, IsRV32] in {
+def SHA512SIG0H : ALU_rr<0b0101110, 0b000, "sha512sig0h">;
+def SHA512SIG0L : ALU_rr<0b0101010, 0b000, "sha512sig0l">;
+def SHA512SIG1H : ALU_rr<0b0101111, 0b000, "sha512sig1h">;
+def SHA512SIG1L : ALU_rr<0b0101011, 0b000, "sha512sig1l">;
+def SHA512SUM0R : ALU_rr<0b0101000, 0b000, "sha512sum0r">;
+def SHA512SUM1R : ALU_rr<0b0101001, 0b000, "sha512sum1r">;
+} // Predicates = [HasStdExtZknh, IsRV32]
+
+let Predicates = [HasStdExtZknh, IsRV64] in {
+def SHA512SIG0 : RVKUnary<0b000100000110, 0b001, "sha512sig0">;
+def SHA512SIG1 : RVKUnary<0b000100000111, 0b001, "sha512sig1">;
+def SHA512SUM0 : RVKUnary<0b000100000100, 0b001, "sha512sum0">;
+def SHA512SUM1 : RVKUnary<0b000100000101, 0b001, "sha512sum1">;
+} // Predicates = [HasStdExtZknh, IsRV64]
+
+let Predicates = [HasStdExtZksed] in {
+def SM4ED : RVKByteSelect<0b11000, "sm4ed">;
+def SM4KS : RVKByteSelect<0b11010, "sm4ks">;
+} // Predicates = [HasStdExtZksed]
+
+let Predicates = [HasStdExtZksh] in {
+def SM3P0 : RVKUnary<0b000100001000, 0b001, "sm3p0">;
+def SM3P1 : RVKUnary<0b000100001001, 0b001, "sm3p1">;
+} // Predicates = [HasStdExtZksh]
+
+//===----------------------------------------------------------------------===//
+// Codegen patterns
+//===----------------------------------------------------------------------===//
+
+class PatGprGprByteSelect<SDPatternOperator OpNode, RVInst Inst>
+ : Pat<(OpNode GPR:$rs1, GPR:$rs2, i8:$imm),
+ (Inst GPR:$rs1, GPR:$rs2, byteselect:$imm)>;
+
+// Zknd
+let Predicates = [HasStdExtZknd, IsRV32] in {
+def : PatGprGprByteSelect<int_riscv_aes32dsi, AES32DSI>;
+def : PatGprGprByteSelect<int_riscv_aes32dsmi, AES32DSMI>;
+} // Predicates = [HasStdExtZknd, IsRV32]
+
+let Predicates = [HasStdExtZknd, IsRV64] in {
+def : PatGprGpr<int_riscv_aes64ds, AES64DS>;
+def : PatGprGpr<int_riscv_aes64dsm, AES64DSM>;
+def : PatGpr<int_riscv_aes64im, AES64IM>;
+} // Predicates = [HasStdExtZknd, IsRV64]
+
+let Predicates = [HasStdExtZkndOrZkne, IsRV64] in {
+def : PatGprGpr<int_riscv_aes64ks2, AES64KS2>;
+def : Pat<(int_riscv_aes64ks1i GPR:$rs1, i32:$rnum),
+ (AES64KS1I GPR:$rs1, rnum:$rnum)>;
+} // Predicates = [HasStdExtZkndOrZkne, IsRV64]
+
+// Zkne
+let Predicates = [HasStdExtZkne, IsRV32] in {
+def : PatGprGprByteSelect<int_riscv_aes32esi, AES32ESI>;
+def : PatGprGprByteSelect<int_riscv_aes32esmi, AES32ESMI>;
+} // Predicates = [HasStdExtZkne, IsRV32]
+
+let Predicates = [HasStdExtZkne, IsRV64] in {
+def : PatGprGpr<int_riscv_aes64es, AES64ES>;
+def : PatGprGpr<int_riscv_aes64esm, AES64ESM>;
+} // Predicates = [HasStdExtZkne, IsRV64]
+
+// Zknh
+let Predicates = [HasStdExtZknh] in {
+def : PatGpr<int_riscv_sha256sig0, SHA256SIG0>;
+def : PatGpr<int_riscv_sha256sig1, SHA256SIG1>;
+def : PatGpr<int_riscv_sha256sum0, SHA256SUM0>;
+def : PatGpr<int_riscv_sha256sum1, SHA256SUM1>;
+} // Predicates = [HasStdExtZknh]
+
+let Predicates = [HasStdExtZknh, IsRV32] in {
+def : PatGprGpr<int_riscv_sha512sig0l, SHA512SIG0L>;
+def : PatGprGpr<int_riscv_sha512sig0h, SHA512SIG0H>;
+def : PatGprGpr<int_riscv_sha512sig1l, SHA512SIG1L>;
+def : PatGprGpr<int_riscv_sha512sig1h, SHA512SIG1H>;
+def : PatGprGpr<int_riscv_sha512sum0r, SHA512SUM0R>;
+def : PatGprGpr<int_riscv_sha512sum1r, SHA512SUM1R>;
+} // Predicates = [HasStdExtZknh, IsRV32]
+
+let Predicates = [HasStdExtZknh, IsRV64] in {
+def : PatGpr<int_riscv_sha512sig0, SHA512SIG0>;
+def : PatGpr<int_riscv_sha512sig1, SHA512SIG1>;
+def : PatGpr<int_riscv_sha512sum0, SHA512SUM0>;
+def : PatGpr<int_riscv_sha512sum1, SHA512SUM1>;
+} // Predicates = [HasStdExtZknh, IsRV64]
+
+// Zksed
+let Predicates = [HasStdExtZksed] in {
+def : PatGprGprByteSelect<int_riscv_sm4ks, SM4KS>;
+def : PatGprGprByteSelect<int_riscv_sm4ed, SM4ED>;
+} // Predicates = [HasStdExtZksed]
+
+// Zksh
+let Predicates = [HasStdExtZksh] in {
+def : PatGpr<int_riscv_sm3p0, SM3P0>;
+def : PatGpr<int_riscv_sm3p1, SM3P1>;
+} // Predicates = [HasStdExtZksh]
diff --git a/llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp
index 4d1f47da209d..8dfd71ac0b6b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp
@@ -69,8 +69,7 @@ private:
RISCVInstructionSelector::RISCVInstructionSelector(
const RISCVTargetMachine &TM, const RISCVSubtarget &STI,
const RISCVRegisterBankInfo &RBI)
- : InstructionSelector(), STI(STI), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI),
+ : STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "RISCVGenGlobalISel.inc"
diff --git a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
index dd084f53e511..c167c095521a 100644
--- a/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMCInstLower.cpp
@@ -172,7 +172,7 @@ static bool lowerRISCVVMachineInstrToMCInst(const MachineInstr *MI,
default:
llvm_unreachable("Unknown operand type");
case MachineOperand::MO_Register: {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (RISCV::VRM2RegClass.contains(Reg) ||
RISCV::VRM4RegClass.contains(Reg) ||
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 9094dff1dda1..35363bf37c0d 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -347,3 +347,8 @@ void RISCVRegisterInfo::getOffsetOpcodes(const StackOffset &Offset,
Ops.push_back(dwarf::DW_OP_minus);
}
}
+
+unsigned
+RISCVRegisterInfo::getRegisterCostTableIndex(const MachineFunction &MF) const {
+ return MF.getSubtarget<RISCVSubtarget>().hasStdExtC() ? 1 : 0;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 2b2bbdfbdf32..9e0ef7902210 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -66,6 +66,8 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
void getOffsetOpcodes(const StackOffset &Offset,
SmallVectorImpl<uint64_t> &Ops) const override;
+
+ unsigned getRegisterCostTableIndex(const MachineFunction &MF) const override;
};
}
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 20903b317180..8c1c03b51c24 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -73,12 +73,11 @@ def sub_vrm1_7 : ComposedSubRegIndex<sub_vrm2_3, sub_vrm1_1>;
// are not part of GPRC, the most restrictive register class used by the
// compressed instruction set. This will influence the greedy register
// allocator to reduce the use of registers that can't be encoded in 16 bit
-// instructions. This affects register allocation even when compressed
-// instruction isn't targeted, we see no major negative codegen impact.
+// instructions.
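+// The two CostPerUse entries below are indexed by
+// RISCVRegisterInfo::getRegisterCostTableIndex: entry 0 when the compressed
+// extension is absent and entry 1 when it is present.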
let RegAltNameIndices = [ABIRegAltName] in {
def X0 : RISCVReg<0, "x0", ["zero"]>, DwarfRegNum<[0]>;
- let CostPerUse = [1] in {
+ let CostPerUse = [0, 1] in {
def X1 : RISCVReg<1, "x1", ["ra"]>, DwarfRegNum<[1]>;
def X2 : RISCVReg<2, "x2", ["sp"]>, DwarfRegNum<[2]>;
def X3 : RISCVReg<3, "x3", ["gp"]>, DwarfRegNum<[3]>;
@@ -95,7 +94,7 @@ let RegAltNameIndices = [ABIRegAltName] in {
def X13 : RISCVReg<13,"x13", ["a3"]>, DwarfRegNum<[13]>;
def X14 : RISCVReg<14,"x14", ["a4"]>, DwarfRegNum<[14]>;
def X15 : RISCVReg<15,"x15", ["a5"]>, DwarfRegNum<[15]>;
- let CostPerUse = [1] in {
+ let CostPerUse = [0, 1] in {
def X16 : RISCVReg<16,"x16", ["a6"]>, DwarfRegNum<[16]>;
def X17 : RISCVReg<17,"x17", ["a7"]>, DwarfRegNum<[17]>;
def X18 : RISCVReg<18,"x18", ["s2"]>, DwarfRegNum<[18]>;
@@ -138,27 +137,11 @@ def GPRX0 : RegisterClass<"RISCV", [XLenVT], 32, (add X0)> {
let RegInfos = XLenRI;
}
-// The order of registers represents the preferred allocation sequence.
-// Registers are listed in the order caller-save, callee-save, specials.
-def GPRNoX0 : RegisterClass<"RISCV", [XLenVT], 32, (add
- (sequence "X%u", 10, 17),
- (sequence "X%u", 5, 7),
- (sequence "X%u", 28, 31),
- (sequence "X%u", 8, 9),
- (sequence "X%u", 18, 27),
- (sequence "X%u", 1, 4)
- )> {
+def GPRNoX0 : RegisterClass<"RISCV", [XLenVT], 32, (sub GPR, X0)> {
let RegInfos = XLenRI;
}
-def GPRNoX0X2 : RegisterClass<"RISCV", [XLenVT], 32, (add
- (sequence "X%u", 10, 17),
- (sequence "X%u", 5, 7),
- (sequence "X%u", 28, 31),
- (sequence "X%u", 8, 9),
- (sequence "X%u", 18, 27),
- X1, X3, X4
- )> {
+def GPRNoX0X2 : RegisterClass<"RISCV", [XLenVT], 32, (sub GPR, X0, X2)> {
let RegInfos = XLenRI;
}
@@ -166,13 +149,7 @@ def GPRNoX0X2 : RegisterClass<"RISCV", [XLenVT], 32, (add
// stack on some microarchitectures. Also remove the reserved registers X0, X2,
// X3, and X4 as it reduces the number of register classes that get synthesized
// by tablegen.
-def GPRJALR : RegisterClass<"RISCV", [XLenVT], 32, (add
- (sequence "X%u", 10, 17),
- (sequence "X%u", 6, 7),
- (sequence "X%u", 28, 31),
- (sequence "X%u", 8, 9),
- (sequence "X%u", 18, 27)
- )> {
+def GPRJALR : RegisterClass<"RISCV", [XLenVT], 32, (sub GPR, (sequence "X%u", 0, 5))> {
let RegInfos = XLenRI;
}
diff --git a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
new file mode 100644
index 000000000000..12ec52925798
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
@@ -0,0 +1,278 @@
+//===-------------- RISCVSExtWRemoval.cpp - MI sext.w Removal -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass removes unneeded sext.w instructions at the MI level.
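+// For example, in "addw a0, a0, a1; sext.w a0, a0" the sext.w (addiw a0, a0, 0)
+// is redundant because addw already sign-extends its 32-bit result.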
+//
+//===---------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVSubtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-sextw-removal"
+
+STATISTIC(NumRemovedSExtW, "Number of removed sign-extensions");
+
+static cl::opt<bool> DisableSExtWRemoval("riscv-disable-sextw-removal",
+ cl::desc("Disable removal of sext.w"),
+ cl::init(false), cl::Hidden);
+namespace {
+
+class RISCVSExtWRemoval : public MachineFunctionPass {
+public:
+ static char ID;
+
+ RISCVSExtWRemoval() : MachineFunctionPass(ID) {
+ initializeRISCVSExtWRemovalPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override { return "RISCV sext.w Removal"; }
+};
+
+} // end anonymous namespace
+
+char RISCVSExtWRemoval::ID = 0;
+INITIALIZE_PASS(RISCVSExtWRemoval, DEBUG_TYPE, "RISCV sext.w Removal", false,
+ false)
+
+FunctionPass *llvm::createRISCVSExtWRemovalPass() {
+ return new RISCVSExtWRemoval();
+}
+
+// This function returns true if the machine instruction always outputs a value
+// where bits 63:32 match bit 31.
+// TODO: Allocate a bit in TSFlags for the W instructions?
+// TODO: Add other W instructions.
+static bool isSignExtendingOpW(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case RISCV::LUI:
+ case RISCV::LW:
+ case RISCV::ADDW:
+ case RISCV::ADDIW:
+ case RISCV::SUBW:
+ case RISCV::MULW:
+ case RISCV::SLLW:
+ case RISCV::SLLIW:
+ case RISCV::SRAW:
+ case RISCV::SRAIW:
+ case RISCV::SRLW:
+ case RISCV::SRLIW:
+ case RISCV::DIVW:
+ case RISCV::DIVUW:
+ case RISCV::REMW:
+ case RISCV::REMUW:
+ case RISCV::ROLW:
+ case RISCV::RORW:
+ case RISCV::RORIW:
+ case RISCV::CLZW:
+ case RISCV::CTZW:
+ case RISCV::CPOPW:
+ case RISCV::FCVT_W_H:
+ case RISCV::FCVT_WU_H:
+ case RISCV::FCVT_W_S:
+ case RISCV::FCVT_WU_S:
+ case RISCV::FCVT_W_D:
+ case RISCV::FCVT_WU_D:
+  // The following aren't W instructions, but either sign-extend from a
+  // smaller size or put zeros in bits 63:31.
+ case RISCV::LBU:
+ case RISCV::LHU:
+ case RISCV::LB:
+ case RISCV::LH:
+ case RISCV::SLT:
+ case RISCV::SLTI:
+ case RISCV::SLTU:
+ case RISCV::SLTIU:
+ case RISCV::SEXTB:
+ case RISCV::SEXTH:
+ case RISCV::ZEXTH_RV64:
+ return true;
+  // Shifting right sufficiently far makes the value 32-bit sign-extended.
+ case RISCV::SRAI:
+ return MI.getOperand(2).getImm() >= 32;
+ case RISCV::SRLI:
+ return MI.getOperand(2).getImm() > 32;
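+  // Note that a shift of exactly 32 is excluded for SRLI: it clears bits 63:32
+  // but moves the original bit 63 into bit 31, so the result need not be
+  // sign-extended.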
+ // The LI pattern ADDI rd, X0, imm is sign extended.
+ case RISCV::ADDI:
+ return MI.getOperand(1).isReg() && MI.getOperand(1).getReg() == RISCV::X0;
+  // An ANDI with an 11-bit immediate will zero bits 63:11.
+ case RISCV::ANDI:
+ return isUInt<11>(MI.getOperand(2).getImm());
+  // An ORI with an immediate that doesn't fit in 11 bits (i.e. a negative
+  // 12-bit immediate) will set bits 63:11.
+ case RISCV::ORI:
+ return !isUInt<11>(MI.getOperand(2).getImm());
+ // Copying from X0 produces zero.
+ case RISCV::COPY:
+ return MI.getOperand(1).getReg() == RISCV::X0;
+ }
+
+ return false;
+}
+
+static bool isSignExtendedW(const MachineInstr &OrigMI,
+ MachineRegisterInfo &MRI) {
+
+ SmallPtrSet<const MachineInstr *, 4> Visited;
+ SmallVector<const MachineInstr *, 4> Worklist;
+
+ Worklist.push_back(&OrigMI);
+
+ while (!Worklist.empty()) {
+ const MachineInstr *MI = Worklist.pop_back_val();
+
+ // If we already visited this instruction, we don't need to check it again.
+ if (!Visited.insert(MI).second)
+ continue;
+
+ // If this is a sign extending operation we don't need to look any further.
+ if (isSignExtendingOpW(*MI))
+ continue;
+
+    // Is this an instruction that propagates sign extension?
+ switch (MI->getOpcode()) {
+ default:
+ // Unknown opcode, give up.
+ return false;
+ case RISCV::COPY: {
+ Register SrcReg = MI->getOperand(1).getReg();
+
+ // TODO: Handle arguments and returns from calls?
+
+ // If this is a copy from another register, check its source instruction.
+ if (!SrcReg.isVirtual())
+ return false;
+ const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+ if (!SrcMI)
+ return false;
+
+ // Add SrcMI to the worklist.
+ Worklist.push_back(SrcMI);
+ break;
+ }
+ case RISCV::REM:
+ case RISCV::ANDI:
+ case RISCV::ORI:
+ case RISCV::XORI: {
+ // |Remainder| is always <= |Dividend|. If D is 32-bit, then so is R.
+ // DIV doesn't work because of the edge case 0xf..f 8000 0000 / (long)-1
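+ // (0xffffffff80000000 / -1 yields 0x0000000080000000, which is not
+ // sign-extended.)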
+ // Logical operations use a sign-extended 12-bit immediate, so we only need
+ // to check whether the other (register) operand is sign-extended.
+ Register SrcReg = MI->getOperand(1).getReg();
+ if (!SrcReg.isVirtual())
+ return false;
+ const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+ if (!SrcMI)
+ return false;
+
+ // Add SrcMI to the worklist.
+ Worklist.push_back(SrcMI);
+ break;
+ }
+ case RISCV::REMU:
+ case RISCV::AND:
+ case RISCV::OR:
+ case RISCV::XOR:
+ case RISCV::ANDN:
+ case RISCV::ORN:
+ case RISCV::XNOR:
+ case RISCV::MAX:
+ case RISCV::MAXU:
+ case RISCV::MIN:
+ case RISCV::MINU:
+ case RISCV::PHI: {
+ // If all incoming values are sign-extended, the output of these logical,
+ // min/max, unsigned remainder, and PHI operations is also sign-extended.
+
+ // The input registers for PHI are operand 1, 3, ...
+ // The input registers for others are operand 1 and 2.
+ unsigned E = 3, D = 1;
+ if (MI->getOpcode() == RISCV::PHI) {
+ E = MI->getNumOperands();
+ D = 2;
+ }
+
+ for (unsigned I = 1; I != E; I += D) {
+ if (!MI->getOperand(I).isReg())
+ return false;
+
+ Register SrcReg = MI->getOperand(I).getReg();
+ if (!SrcReg.isVirtual())
+ return false;
+ const MachineInstr *SrcMI = MRI.getVRegDef(SrcReg);
+ if (!SrcMI)
+ return false;
+
+ // Add SrcMI to the worklist.
+ Worklist.push_back(SrcMI);
+ }
+
+ break;
+ }
+ }
+ }
+
+ // If we get here, then every node we visited produces a sign-extended value
+ // or propagates sign-extended values, so the result must be sign-extended.
+ return true;
+}
+
+bool RISCVSExtWRemoval::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()) || DisableSExtWRemoval)
+ return false;
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
+
+ if (!ST.is64Bit())
+ return false;
+
+ bool MadeChange = false;
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto I = MBB.begin(), IE = MBB.end(); I != IE;) {
+ MachineInstr *MI = &*I++;
+
+ // We're looking for the sext.w pattern ADDIW rd, rs1, 0.
+ if (MI->getOpcode() != RISCV::ADDIW || !MI->getOperand(2).isImm() ||
+ MI->getOperand(2).getImm() != 0 || !MI->getOperand(1).isReg())
+ continue;
+
+ // Input should be a virtual register.
+ Register SrcReg = MI->getOperand(1).getReg();
+ if (!SrcReg.isVirtual())
+ continue;
+
+ const MachineInstr &SrcMI = *MRI.getVRegDef(SrcReg);
+ if (!isSignExtendedW(SrcMI, MRI))
+ continue;
+
+ Register DstReg = MI->getOperand(0).getReg();
+ if (!MRI.constrainRegClass(SrcReg, MRI.getRegClass(DstReg)))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Removing redundant sign-extension\n");
+ MRI.replaceRegWith(DstReg, SrcReg);
+ MRI.clearKillFlags(SrcReg);
+ MI->eraseFromParent();
+ ++NumRemovedSExtW;
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/llvm/lib/Target/RISCV/RISCVSchedRocket.td
index d5a0932c8778..78cf34c8c582 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td
@@ -17,7 +17,10 @@ def RocketModel : SchedMachineModel {
let LoadLatency = 3;
let MispredictPenalty = 3;
let CompleteModel = false;
- let UnsupportedFeatures = [HasStdExtV, HasStdExtZvlsseg];
+ let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
+ HasStdExtZknd, HasStdExtZkne, HasStdExtZknh,
+ HasStdExtZksed, HasStdExtZksh, HasStdExtZkr,
+ HasVInstructions, HasVInstructionsI64];
}
//===----------------------------------------------------------------------===//
@@ -237,5 +240,8 @@ def : ReadAdvance<ReadFClass64, 0>;
defm : UnsupportedSchedV;
defm : UnsupportedSchedZba;
defm : UnsupportedSchedZbb;
+defm : UnsupportedSchedZbc;
+defm : UnsupportedSchedZbs;
+defm : UnsupportedSchedZbf;
defm : UnsupportedSchedZfh;
}
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 7f9d0aabc4ed..9f5e5ff1223c 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -15,7 +15,10 @@ def SiFive7Model : SchedMachineModel {
let LoadLatency = 3;
let MispredictPenalty = 3;
let CompleteModel = 0;
- let UnsupportedFeatures = [HasStdExtV, HasStdExtZvlsseg];
+ let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
+ HasStdExtZknd, HasStdExtZkne, HasStdExtZknh,
+ HasStdExtZksed, HasStdExtZksh, HasStdExtZkr,
+ HasVInstructions];
}
// The SiFive7 microarchitecture has two pipelines: A and B.
@@ -224,5 +227,8 @@ def : ReadAdvance<ReadFClass64, 0>;
defm : UnsupportedSchedV;
defm : UnsupportedSchedZba;
defm : UnsupportedSchedZbb;
+defm : UnsupportedSchedZbc;
+defm : UnsupportedSchedZbs;
+defm : UnsupportedSchedZbf;
defm : UnsupportedSchedZfh;
}
diff --git a/llvm/lib/Target/RISCV/RISCVScheduleB.td b/llvm/lib/Target/RISCV/RISCVScheduleB.td
index b668b0acd719..193760e1e15b 100644
--- a/llvm/lib/Target/RISCV/RISCVScheduleB.td
+++ b/llvm/lib/Target/RISCV/RISCVScheduleB.td
@@ -26,6 +26,17 @@ def WriteCPOP32 : SchedWrite;
def WriteREV8 : SchedWrite;
def WriteORCB : SchedWrite;
+// Zbc extension
+def WriteCLMUL : SchedWrite; // CLMUL/CLMULR/CLMULH
+
+// Zbs extension
+def WriteSingleBit : SchedWrite; // BCLR/BSET/BINV/BEXT
+def WriteSingleBitImm: SchedWrite; // BCLRI/BSETI/BINVI/BEXTI
+
+// Zbf extension
+def WriteBFP : SchedWrite; // BFP
+def WriteBFP32 : SchedWrite; // BFPW
+
/// Define scheduler resources associated with use operands.
// Zba extension
@@ -46,6 +57,17 @@ def ReadCPOP32 : SchedRead;
def ReadREV8 : SchedRead;
def ReadORCB : SchedRead;
+// Zbc extension
+def ReadCLMUL : SchedRead; // CLMUL/CLMULR/CLMULH
+
+// Zbs extension
+def ReadSingleBit : SchedRead; // BCLR/BSET/BINV/BEXT
+def ReadSingleBitImm: SchedRead; // BCLRI/BSETI/BINVI/BEXTI
+
+// Zbf extension
+def ReadBFP : SchedRead; // BFP
+def ReadBFP32 : SchedRead; // BFPW
+
/// Define default scheduler resources for B.
multiclass UnsupportedSchedZba {
@@ -87,3 +109,31 @@ def : ReadAdvance<ReadREV8, 0>;
def : ReadAdvance<ReadORCB, 0>;
}
}
+
+multiclass UnsupportedSchedZbc {
+let Unsupported = true in {
+def : WriteRes<WriteCLMUL, []>;
+
+def : ReadAdvance<ReadCLMUL, 0>;
+}
+}
+
+multiclass UnsupportedSchedZbs {
+let Unsupported = true in {
+def : WriteRes<WriteSingleBit, []>;
+def : WriteRes<WriteSingleBitImm, []>;
+
+def : ReadAdvance<ReadSingleBit, 0>;
+def : ReadAdvance<ReadSingleBitImm, 0>;
+}
+}
+
+multiclass UnsupportedSchedZbf {
+let Unsupported = true in {
+def : WriteRes<WriteBFP, []>;
+def : WriteRes<WriteBFP32, []>;
+
+def : ReadAdvance<ReadBFP, 0>;
+def : ReadAdvance<ReadBFP32, 0>;
+}
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index 1063134b8a6c..976e4ccb1422 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -18,6 +18,7 @@
#include "RISCVRegisterBankInfo.h"
#include "RISCVTargetMachine.h"
#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
using namespace llvm;
@@ -50,6 +51,16 @@ static cl::opt<unsigned> RVVVectorELENMax(
cl::desc("The maximum ELEN value to use for fixed length vectors."),
cl::init(64), cl::Hidden);
+static cl::opt<bool> RISCVDisableUsingConstantPoolForLargeInts(
+ "riscv-disable-using-constant-pool-for-large-ints",
+ cl::desc("Disable using constant pool for large integers."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned> RISCVMaxBuildIntsCost(
+ "riscv-max-build-ints-cost",
+ cl::desc("The maximum cost used for building integers."), cl::init(0),
+ cl::Hidden);
+
void RISCVSubtarget::anchor() {}
RISCVSubtarget &
@@ -110,37 +121,69 @@ const RegisterBankInfo *RISCVSubtarget::getRegBankInfo() const {
return RegBankInfo.get();
}
+bool RISCVSubtarget::useConstantPoolForLargeInts() const {
+ return !RISCVDisableUsingConstantPoolForLargeInts;
+}
+
+unsigned RISCVSubtarget::getMaxBuildIntsCost() const {
+ // Loading an integer from the constant pool needs two instructions (which is
+ // why the minimum cost is 2): an address-calculation instruction and a load.
+ // Address calculation and the instructions used for building integers (addi,
+ // slli, etc.) usually take one cycle each, so the default cost is set to
+ // (LoadLatency + 1) when no threshold is provided.
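+ // For example, with LoadLatency = 3 (as in the Rocket and SiFive7 models)
+ // the default threshold is 4.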
+ return RISCVMaxBuildIntsCost == 0
+ ? getSchedModel().LoadLatency + 1
+ : std::max<unsigned>(2, RISCVMaxBuildIntsCost);
+}
+
unsigned RISCVSubtarget::getMaxRVVVectorSizeInBits() const {
assert(hasVInstructions() &&
"Tried to get vector length without Zve or V extension support!");
if (RVVVectorBitsMax == 0)
return 0;
- assert(RVVVectorBitsMax >= 128 && RVVVectorBitsMax <= 65536 &&
- isPowerOf2_32(RVVVectorBitsMax) &&
- "V extension requires vector length to be in the range of 128 to "
- "65536 and a power of 2!");
+
+ // ZvlLen specifies the minimum required vlen. The upper bound provided by
+ // riscv-v-vector-bits-max should be no less than it.
+ if (RVVVectorBitsMax < ZvlLen)
+ report_fatal_error("riscv-v-vector-bits-max specified is lower "
+ "than the Zvl*b limitation");
+
+ // FIXME: Change to >= 32 when VLEN = 32 is supported
+ assert(
+ RVVVectorBitsMax >= 64 && RVVVectorBitsMax <= 65536 &&
+ isPowerOf2_32(RVVVectorBitsMax) &&
+ "V or Zve* extension requires vector length to be in the range of 64 to "
+ "65536 and a power of 2!");
assert(RVVVectorBitsMax >= RVVVectorBitsMin &&
"Minimum V extension vector length should not be larger than its "
"maximum!");
unsigned Max = std::max(RVVVectorBitsMin, RVVVectorBitsMax);
- return PowerOf2Floor((Max < 128 || Max > 65536) ? 0 : Max);
+ return PowerOf2Floor((Max < 64 || Max > 65536) ? 0 : Max);
}
unsigned RISCVSubtarget::getMinRVVVectorSizeInBits() const {
+ // ZvlLen specifies the minimum required vlen. The lower bound provided by
+ // riscv-v-vector-bits-min should be no less than it.
+ if (RVVVectorBitsMin != 0 && RVVVectorBitsMin < ZvlLen)
+ report_fatal_error("riscv-v-vector-bits-min specified is lower "
+ "than the Zvl*b limitation");
+
assert(hasVInstructions() &&
"Tried to get vector length without Zve or V extension support!");
- assert((RVVVectorBitsMin == 0 ||
- (RVVVectorBitsMin >= 128 && RVVVectorBitsMax <= 65536 &&
- isPowerOf2_32(RVVVectorBitsMin))) &&
- "V extension requires vector length to be in the range of 128 to "
- "65536 and a power of 2!");
+ // FIXME: Change to >= 32 when VLEN = 32 is supported
+ assert(
+ (RVVVectorBitsMin == 0 ||
+ (RVVVectorBitsMin >= 64 && RVVVectorBitsMin <= 65536 &&
+ isPowerOf2_32(RVVVectorBitsMin))) &&
+ "V or Zve* extension requires vector length to be in the range of 64 to "
+ "65536 and a power of 2!");
assert((RVVVectorBitsMax >= RVVVectorBitsMin || RVVVectorBitsMax == 0) &&
"Minimum V extension vector length should not be larger than its "
"maximum!");
unsigned Min = RVVVectorBitsMin;
if (RVVVectorBitsMax != 0)
Min = std::min(RVVVectorBitsMin, RVVVectorBitsMax);
- return PowerOf2Floor((Min < 128 || Min > 65536) ? 0 : Min);
+ return PowerOf2Floor((Min < 64 || Min > 65536) ? 0 : Min);
}
unsigned RISCVSubtarget::getMaxLMULForFixedLengthVectors() const {
@@ -158,8 +201,9 @@ unsigned RISCVSubtarget::getMaxELENForFixedLengthVectors() const {
assert(RVVVectorELENMax <= 64 && RVVVectorELENMax >= 8 &&
isPowerOf2_32(RVVVectorELENMax) &&
"V extension requires a ELEN to be a power of 2 between 8 and 64!");
+ unsigned ELEN = hasVInstructionsI64() ? 64 : 32;
return PowerOf2Floor(
- std::max<unsigned>(std::min<unsigned>(RVVVectorELENMax, 64), 8));
+ std::max<unsigned>(std::min<unsigned>(RVVVectorELENMax, ELEN), 8));
}
bool RISCVSubtarget::useRVVForFixedLengthVectors() const {
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index d0330e6984a5..044dda0a1ccc 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -33,7 +33,33 @@ namespace llvm {
class StringRef;
class RISCVSubtarget : public RISCVGenSubtargetInfo {
+public:
+ enum ExtZvl : unsigned {
+ NotSet = 0,
+ Zvl32b = 32,
+ Zvl64b = 64,
+ Zvl128b = 128,
+ Zvl256b = 256,
+ Zvl512b = 512,
+ Zvl1024b = 1024,
+ Zvl2048b = 2048,
+ Zvl4096b = 4096,
+ Zvl8192b = 8192,
+ Zvl16384b = 16384,
+ Zvl32768b = 32768,
+ Zvl65536b = 65536
+ };
+
+ enum RISCVProcFamilyEnum : uint8_t {
+ Others,
+ SiFive7,
+ };
+
+private:
virtual void anchor();
+
+ RISCVProcFamilyEnum RISCVProcFamily = Others;
+
bool HasStdExtM = false;
bool HasStdExtA = false;
bool HasStdExtF = false;
@@ -50,15 +76,33 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
bool HasStdExtZbs = false;
bool HasStdExtZbt = false;
bool HasStdExtV = false;
- bool HasStdExtZvlsseg = false;
+ bool HasStdExtZve32x = false;
+ bool HasStdExtZve32f = false;
+ bool HasStdExtZve64x = false;
+ bool HasStdExtZve64f = false;
+ bool HasStdExtZve64d = false;
bool HasStdExtZfhmin = false;
bool HasStdExtZfh = false;
+ bool HasStdExtZbkb = false;
+ bool HasStdExtZbkc = false;
+ bool HasStdExtZbkx = false;
+ bool HasStdExtZknd = false;
+ bool HasStdExtZkne = false;
+ bool HasStdExtZknh = false;
+ bool HasStdExtZksed = false;
+ bool HasStdExtZksh = false;
+ bool HasStdExtZkr = false;
+ bool HasStdExtZkn = false;
+ bool HasStdExtZks = false;
+ bool HasStdExtZkt = false;
+ bool HasStdExtZk = false;
bool HasRV64 = false;
bool IsRV32E = false;
bool EnableLinkerRelax = false;
bool EnableRVCHintInstrs = true;
bool EnableSaveRestore = false;
unsigned XLen = 32;
+ ExtZvl ZvlLen = ExtZvl::NotSet;
MVT XLenVT = MVT::i32;
uint8_t MaxInterleaveFactor = 2;
RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown;
@@ -100,11 +144,19 @@ public:
return &TSInfo;
}
bool enableMachineScheduler() const override { return true; }
+
+ /// Returns RISCV processor family.
+ /// Avoid this function! CPU specifics should be kept local to this class
+ /// and preferably modeled with SubtargetFeatures or properties in
+ /// initializeProperties().
+ RISCVProcFamilyEnum getProcFamily() const { return RISCVProcFamily; }
+
bool hasStdExtM() const { return HasStdExtM; }
bool hasStdExtA() const { return HasStdExtA; }
bool hasStdExtF() const { return HasStdExtF; }
bool hasStdExtD() const { return HasStdExtD; }
bool hasStdExtC() const { return HasStdExtC; }
+ bool hasStdExtV() const { return HasStdExtV; }
bool hasStdExtZba() const { return HasStdExtZba; }
bool hasStdExtZbb() const { return HasStdExtZbb; }
bool hasStdExtZbc() const { return HasStdExtZbc; }
@@ -115,10 +167,18 @@ public:
bool hasStdExtZbr() const { return HasStdExtZbr; }
bool hasStdExtZbs() const { return HasStdExtZbs; }
bool hasStdExtZbt() const { return HasStdExtZbt; }
- bool hasStdExtV() const { return HasStdExtV; }
- bool hasStdExtZvlsseg() const { return HasStdExtZvlsseg; }
+ bool hasStdExtZvl() const { return ZvlLen != ExtZvl::NotSet; }
bool hasStdExtZfhmin() const { return HasStdExtZfhmin; }
bool hasStdExtZfh() const { return HasStdExtZfh; }
+ bool hasStdExtZbkb() const { return HasStdExtZbkb; }
+ bool hasStdExtZbkc() const { return HasStdExtZbkc; }
+ bool hasStdExtZbkx() const { return HasStdExtZbkx; }
+ bool hasStdExtZknd() const { return HasStdExtZknd; }
+ bool hasStdExtZkne() const { return HasStdExtZkne; }
+ bool hasStdExtZknh() const { return HasStdExtZknh; }
+ bool hasStdExtZksed() const { return HasStdExtZksed; }
+ bool hasStdExtZksh() const { return HasStdExtZksh; }
+ bool hasStdExtZkr() const { return HasStdExtZkr; }
bool is64Bit() const { return HasRV64; }
bool isRV32E() const { return IsRV32E; }
bool enableLinkerRelax() const { return EnableLinkerRelax; }
@@ -126,6 +186,15 @@ public:
bool enableSaveRestore() const { return EnableSaveRestore; }
MVT getXLenVT() const { return XLenVT; }
unsigned getXLen() const { return XLen; }
+ unsigned getFLen() const {
+ if (HasStdExtD)
+ return 64;
+
+ if (HasStdExtF)
+ return 32;
+
+ return 0;
+ }
RISCVABI::ABI getTargetABI() const { return TargetABI; }
bool isRegisterReservedByUser(Register i) const {
assert(i < RISCV::NUM_TARGET_REGS && "Register out of range");
@@ -133,11 +202,19 @@ public:
}
// Vector codegen related methods.
- bool hasVInstructions() const { return HasStdExtV; }
- bool hasVInstructionsI64() const { return HasStdExtV; }
- bool hasVInstructionsF16() const { return HasStdExtV && hasStdExtZfh(); }
- bool hasVInstructionsF32() const { return HasStdExtV && hasStdExtF(); }
- bool hasVInstructionsF64() const { return HasStdExtV && hasStdExtD(); }
+ bool hasVInstructions() const { return HasStdExtV || HasStdExtZve32x; }
+ bool hasVInstructionsI64() const { return HasStdExtV || HasStdExtZve64x; }
+ bool hasVInstructionsF16() const {
+ return (HasStdExtV || HasStdExtZve32f) && HasStdExtZfh;
+ }
+ // FIXME: Consider Zfinx in the future
+ bool hasVInstructionsF32() const {
+ return HasStdExtV || (HasStdExtZve32f && HasStdExtF);
+ }
+ // FIXME: Consider Zdinx in the future
+ bool hasVInstructionsF64() const {
+ return HasStdExtV || (HasStdExtZve64d && HasStdExtD);
+ }
// F16 and F64 both require F32.
bool hasVInstructionsAnyF() const { return hasVInstructionsF32(); }
unsigned getMaxInterleaveFactor() const {
@@ -157,6 +234,12 @@ public:
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
+ bool useConstantPoolForLargeInts() const;
+
+ // Maximum cost used for building integers; if it is exceeded, the integer
+ // will be put into the constant pool instead.
+ unsigned getMaxBuildIntsCost() const;
+
// Return the known range for the bit length of RVV data registers. A value
// of 0 means nothing is known about that particular limit beyond what's
// implied by the architecture.
diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index 5a4c579dd708..b9aa25b321b0 100644
--- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -1,4 +1,4 @@
-//===- RISCVSystemOperands.td ----------------------------*- tablegen -*-===//
+//===- RISCVSystemOperands.td ------------------------------*- tablegen -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -70,16 +70,16 @@ def lookupSysRegByDeprecatedName : SearchIndex {
// 2.3, 2.4 and 2.5 in the RISC-V Instruction Set Manual
// Volume II: Privileged Architecture.
-//===--------------------------
+//===----------------------------------------------------------------------===//
// User Trap Setup
-//===--------------------------
+//===----------------------------------------------------------------------===//
def : SysReg<"ustatus", 0x000>;
def : SysReg<"uie", 0x004>;
def : SysReg<"utvec", 0x005>;
-//===--------------------------
+//===----------------------------------------------------------------------===//
// User Trap Handling
-//===--------------------------
+//===----------------------------------------------------------------------===//
def : SysReg<"uscratch", 0x040>;
def : SysReg<"uepc", 0x041>;
def : SysReg<"ucause", 0x042>;
@@ -87,100 +87,57 @@ let DeprecatedName = "ubadaddr" in
def : SysReg<"utval", 0x043>;
def : SysReg<"uip", 0x044>;
-//===--------------------------
+//===----------------------------------------------------------------------===//
// User Floating-Point CSRs
-//===--------------------------
+//===----------------------------------------------------------------------===//
def SysRegFFLAGS : SysReg<"fflags", 0x001>;
def SysRegFRM : SysReg<"frm", 0x002>;
def SysRegFCSR : SysReg<"fcsr", 0x003>;
-//===--------------------------
+//===----------------------------------------------------------------------===//
// User Counter/Timers
-//===--------------------------
+//===----------------------------------------------------------------------===//
def CYCLE : SysReg<"cycle", 0xC00>;
def TIME : SysReg<"time", 0xC01>;
def INSTRET : SysReg<"instret", 0xC02>;
-def : SysReg<"hpmcounter3", 0xC03>;
-def : SysReg<"hpmcounter4", 0xC04>;
-def : SysReg<"hpmcounter5", 0xC05>;
-def : SysReg<"hpmcounter6", 0xC06>;
-def : SysReg<"hpmcounter7", 0xC07>;
-def : SysReg<"hpmcounter8", 0xC08>;
-def : SysReg<"hpmcounter9", 0xC09>;
-def : SysReg<"hpmcounter10", 0xC0A>;
-def : SysReg<"hpmcounter11", 0xC0B>;
-def : SysReg<"hpmcounter12", 0xC0C>;
-def : SysReg<"hpmcounter13", 0xC0D>;
-def : SysReg<"hpmcounter14", 0xC0E>;
-def : SysReg<"hpmcounter15", 0xC0F>;
-def : SysReg<"hpmcounter16", 0xC10>;
-def : SysReg<"hpmcounter17", 0xC11>;
-def : SysReg<"hpmcounter18", 0xC12>;
-def : SysReg<"hpmcounter19", 0xC13>;
-def : SysReg<"hpmcounter20", 0xC14>;
-def : SysReg<"hpmcounter21", 0xC15>;
-def : SysReg<"hpmcounter22", 0xC16>;
-def : SysReg<"hpmcounter23", 0xC17>;
-def : SysReg<"hpmcounter24", 0xC18>;
-def : SysReg<"hpmcounter25", 0xC19>;
-def : SysReg<"hpmcounter26", 0xC1A>;
-def : SysReg<"hpmcounter27", 0xC1B>;
-def : SysReg<"hpmcounter28", 0xC1C>;
-def : SysReg<"hpmcounter29", 0xC1D>;
-def : SysReg<"hpmcounter30", 0xC1E>;
-def : SysReg<"hpmcounter31", 0xC1F>;
+// hpmcounter3-hpmcounter31 at 0xC03-0xC1F.
+foreach i = 3...31 in
+ def : SysReg<"hpmcounter"#i, !add(0xC03, !sub(i, 3))>;
let isRV32Only = 1 in {
def CYCLEH : SysReg<"cycleh", 0xC80>;
def TIMEH : SysReg<"timeh", 0xC81>;
def INSTRETH : SysReg<"instreth", 0xC82>;
-def: SysReg<"hpmcounter3h", 0xC83>;
-def: SysReg<"hpmcounter4h", 0xC84>;
-def: SysReg<"hpmcounter5h", 0xC85>;
-def: SysReg<"hpmcounter6h", 0xC86>;
-def: SysReg<"hpmcounter7h", 0xC87>;
-def: SysReg<"hpmcounter8h", 0xC88>;
-def: SysReg<"hpmcounter9h", 0xC89>;
-def: SysReg<"hpmcounter10h", 0xC8A>;
-def: SysReg<"hpmcounter11h", 0xC8B>;
-def: SysReg<"hpmcounter12h", 0xC8C>;
-def: SysReg<"hpmcounter13h", 0xC8D>;
-def: SysReg<"hpmcounter14h", 0xC8E>;
-def: SysReg<"hpmcounter15h", 0xC8F>;
-def: SysReg<"hpmcounter16h", 0xC90>;
-def: SysReg<"hpmcounter17h", 0xC91>;
-def: SysReg<"hpmcounter18h", 0xC92>;
-def: SysReg<"hpmcounter19h", 0xC93>;
-def: SysReg<"hpmcounter20h", 0xC94>;
-def: SysReg<"hpmcounter21h", 0xC95>;
-def: SysReg<"hpmcounter22h", 0xC96>;
-def: SysReg<"hpmcounter23h", 0xC97>;
-def: SysReg<"hpmcounter24h", 0xC98>;
-def: SysReg<"hpmcounter25h", 0xC99>;
-def: SysReg<"hpmcounter26h", 0xC9A>;
-def: SysReg<"hpmcounter27h", 0xC9B>;
-def: SysReg<"hpmcounter28h", 0xC9C>;
-def: SysReg<"hpmcounter29h", 0xC9D>;
-def: SysReg<"hpmcounter30h", 0xC9E>;
-def: SysReg<"hpmcounter31h", 0xC9F>;
+// hpmcounter3h-hpmcounter31h at 0xC83-0xC9F.
+foreach i = 3...31 in
+ def : SysReg<"hpmcounter"#i#"h", !add(0xC83, !sub(i, 3))>;
}
-//===--------------------------
+//===----------------------------------------------------------------------===//
// Supervisor Trap Setup
-//===--------------------------
+//===----------------------------------------------------------------------===//
def : SysReg<"sstatus", 0x100>;
def : SysReg<"sedeleg", 0x102>;
def : SysReg<"sideleg", 0x103>;
def : SysReg<"sie", 0x104>;
def : SysReg<"stvec", 0x105>;
def : SysReg<"scounteren", 0x106>;
+def : SysReg<"stimecmp", 0x14D>;
+let isRV32Only = 1 in
+def : SysReg<"stimecmph", 0x15D>;
+
+//===----------------------------------------------------------------------===//
+// Supervisor Configuration
+//===----------------------------------------------------------------------===//
+
+def : SysReg<"senvcfg", 0x10A>;
-//===--------------------------
+//===----------------------------------------------------------------------===//
// Supervisor Trap Handling
-//===--------------------------
+//===----------------------------------------------------------------------===//
def : SysReg<"sscratch", 0x140>;
def : SysReg<"sepc", 0x141>;
def : SysReg<"scause", 0x142>;
@@ -188,24 +145,103 @@ let DeprecatedName = "sbadaddr" in
def : SysReg<"stval", 0x143>;
def : SysReg<"sip", 0x144>;
-//===-------------------------------------
+//===----------------------------------------------------------------------===//
// Supervisor Protection and Translation
-//===-------------------------------------
+//===----------------------------------------------------------------------===//
let DeprecatedName = "sptbr" in
def : SysReg<"satp", 0x180>;
-//===-----------------------------
+//===----------------------------------------------------------------------===//
+// Debug/Trace Registers
+//===----------------------------------------------------------------------===//
+
+def : SysReg<"scontext", 0x5A8>;
+
+//===----------------------------------------------------------------------===//
+// Supervisor Count Overflow (defined in Sscofpmf)
+//===----------------------------------------------------------------------===//
+
+def : SysReg<"scountovf", 0xDA0>;
+
+//===----------------------------------------------------------------------===//
+// Hypervisor Trap Setup
+//===----------------------------------------------------------------------===//
+
+def : SysReg<"hstatus", 0x600>;
+def : SysReg<"hedeleg", 0x602>;
+def : SysReg<"hideleg", 0x603>;
+def : SysReg<"hie", 0x604>;
+def : SysReg<"hcounteren", 0x606>;
+def : SysReg<"hgeie", 0x607>;
+
+//===----------------------------------------------------------------------===//
+// Hypervisor Trap Handling
+//===----------------------------------------------------------------------===//
+
+def : SysReg<"htval", 0x643>;
+def : SysReg<"hip", 0x644>;
+def : SysReg<"hvip", 0x645>;
+def : SysReg<"htinst", 0x64A>;
+def : SysReg<"hgeip", 0xE12>;
+
+//===----------------------------------------------------------------------===//
+// Hypervisor Configuration
+//===----------------------------------------------------------------------===//
+
+def : SysReg<"henvcfg", 0x60A>;
+let isRV32Only = 1 in
+def : SysReg<"henvcfgh", 0x61A>;
+
+//===----------------------------------------------------------------------===//
+// Hypervisor Protection and Translation
+//===----------------------------------------------------------------------===//
+
+def : SysReg<"hgatp", 0x680>;
+
+//===----------------------------------------------------------------------===//
+// Debug/Trace Registers
+//===----------------------------------------------------------------------===//
+
+def : SysReg<"hcontext", 0x6A8>;
+
+//===----------------------------------------------------------------------===//
+// Hypervisor Counter/Timer Virtualization Registers
+//===----------------------------------------------------------------------===//
+
+def : SysReg<"htimedelta", 0x605>;
+let isRV32Only = 1 in
+def : SysReg<"htimedeltah", 0x615>;
+
+//===----------------------------------------------------------------------===//
+// Virtual Supervisor Registers
+//===----------------------------------------------------------------------===//
+
+def : SysReg<"vsstatus", 0x200>;
+def : SysReg<"vsie", 0x204>;
+def : SysReg<"vstvec", 0x205>;
+def : SysReg<"vsscratch", 0x240>;
+def : SysReg<"vsepc", 0x241>;
+def : SysReg<"vscause", 0x242>;
+def : SysReg<"vstval", 0x243>;
+def : SysReg<"vsip", 0x244>;
+def : SysReg<"vstimecmp", 0x24D>;
+let isRV32Only = 1 in
+def : SysReg<"vstimecmph", 0x25D>;
+def : SysReg<"vsatp", 0x280>;
+
+//===----------------------------------------------------------------------===//
// Machine Information Registers
-//===-----------------------------
+//===----------------------------------------------------------------------===//
def : SysReg<"mvendorid", 0xF11>;
def : SysReg<"marchid", 0xF12>;
def : SysReg<"mimpid", 0xF13>;
def : SysReg<"mhartid", 0xF14>;
+def : SysReg<"mconfigptr", 0xF15>;
-//===-----------------------------
+//===----------------------------------------------------------------------===//
// Machine Trap Setup
-//===-----------------------------
+//===----------------------------------------------------------------------===//
def : SysReg<"mstatus", 0x300>;
def : SysReg<"misa", 0x301>;
def : SysReg<"medeleg", 0x302>;
@@ -213,163 +249,93 @@ def : SysReg<"mideleg", 0x303>;
def : SysReg<"mie", 0x304>;
def : SysReg<"mtvec", 0x305>;
def : SysReg<"mcounteren", 0x306>;
+let isRV32Only = 1 in
+def : SysReg<"mstatush", 0x310>;
-//===-----------------------------
+//===----------------------------------------------------------------------===//
// Machine Trap Handling
-//===-----------------------------
+//===----------------------------------------------------------------------===//
def : SysReg<"mscratch", 0x340>;
def : SysReg<"mepc", 0x341>;
def : SysReg<"mcause", 0x342>;
let DeprecatedName = "mbadaddr" in
def : SysReg<"mtval", 0x343>;
def : SysReg<"mip", 0x344>;
+def : SysReg<"mtinst", 0x34A>;
+def : SysReg<"mtval2", 0x34B>;
-//===----------------------------------
+//===----------------------------------------------------------------------===//
+// Machine Configuration
+//===----------------------------------------------------------------------===//
+
+def : SysReg<"menvcfg", 0x30A>;
+let isRV32Only = 1 in
+def : SysReg<"menvcfgh", 0x31A>;
+def : SysReg<"mseccfg", 0x747>;
+let isRV32Only = 1 in
+def : SysReg<"mseccfgh", 0x757>;
+
+//===----------------------------------------------------------------------===//
// Machine Protection and Translation
-//===----------------------------------
-def : SysReg<"pmpcfg0", 0x3A0>;
-def : SysReg<"pmpcfg2", 0x3A2>;
-let isRV32Only = 1 in {
-def : SysReg<"pmpcfg1", 0x3A1>;
-def : SysReg<"pmpcfg3", 0x3A3>;
+//===----------------------------------------------------------------------===//
+
+// pmpcfg0-pmpcfg15 at 0x3A0-0x3AF. Odd-numbered registers are RV32-only.
+foreach i = 0...15 in {
+ let isRV32Only = !and(i, 1) in
+ def : SysReg<"pmpcfg"#i, !add(0x3A0, i)>;
}
-def : SysReg<"pmpaddr0", 0x3B0>;
-def : SysReg<"pmpaddr1", 0x3B1>;
-def : SysReg<"pmpaddr2", 0x3B2>;
-def : SysReg<"pmpaddr3", 0x3B3>;
-def : SysReg<"pmpaddr4", 0x3B4>;
-def : SysReg<"pmpaddr5", 0x3B5>;
-def : SysReg<"pmpaddr6", 0x3B6>;
-def : SysReg<"pmpaddr7", 0x3B7>;
-def : SysReg<"pmpaddr8", 0x3B8>;
-def : SysReg<"pmpaddr9", 0x3B9>;
-def : SysReg<"pmpaddr10", 0x3BA>;
-def : SysReg<"pmpaddr11", 0x3BB>;
-def : SysReg<"pmpaddr12", 0x3BC>;
-def : SysReg<"pmpaddr13", 0x3BD>;
-def : SysReg<"pmpaddr14", 0x3BE>;
-def : SysReg<"pmpaddr15", 0x3BF>;
-
-
-//===--------------------------
+// pmpaddr0-pmpaddr63 at 0x3B0-0x3EF.
+foreach i = 0...63 in
+ def : SysReg<"pmpaddr"#i, !add(0x3B0, i)>;
+
+//===----------------------------------------------------------------------===//
// Machine Counter and Timers
-//===--------------------------
+//===----------------------------------------------------------------------===//
def : SysReg<"mcycle", 0xB00>;
def : SysReg<"minstret", 0xB02>;
-def : SysReg<"mhpmcounter3", 0xB03>;
-def : SysReg<"mhpmcounter4", 0xB04>;
-def : SysReg<"mhpmcounter5", 0xB05>;
-def : SysReg<"mhpmcounter6", 0xB06>;
-def : SysReg<"mhpmcounter7", 0xB07>;
-def : SysReg<"mhpmcounter8", 0xB08>;
-def : SysReg<"mhpmcounter9", 0xB09>;
-def : SysReg<"mhpmcounter10", 0xB0A>;
-def : SysReg<"mhpmcounter11", 0xB0B>;
-def : SysReg<"mhpmcounter12", 0xB0C>;
-def : SysReg<"mhpmcounter13", 0xB0D>;
-def : SysReg<"mhpmcounter14", 0xB0E>;
-def : SysReg<"mhpmcounter15", 0xB0F>;
-def : SysReg<"mhpmcounter16", 0xB10>;
-def : SysReg<"mhpmcounter17", 0xB11>;
-def : SysReg<"mhpmcounter18", 0xB12>;
-def : SysReg<"mhpmcounter19", 0xB13>;
-def : SysReg<"mhpmcounter20", 0xB14>;
-def : SysReg<"mhpmcounter21", 0xB15>;
-def : SysReg<"mhpmcounter22", 0xB16>;
-def : SysReg<"mhpmcounter23", 0xB17>;
-def : SysReg<"mhpmcounter24", 0xB18>;
-def : SysReg<"mhpmcounter25", 0xB19>;
-def : SysReg<"mhpmcounter26", 0xB1A>;
-def : SysReg<"mhpmcounter27", 0xB1B>;
-def : SysReg<"mhpmcounter28", 0xB1C>;
-def : SysReg<"mhpmcounter29", 0xB1D>;
-def : SysReg<"mhpmcounter30", 0xB1E>;
-def : SysReg<"mhpmcounter31", 0xB1F>;
+// mhpmcounter3-mhpmcounter31 at 0xB03-0xB1F.
+foreach i = 3...31 in
+ def : SysReg<"mhpmcounter"#i, !add(0xB03, !sub(i, 3))>;
let isRV32Only = 1 in {
def: SysReg<"mcycleh", 0xB80>;
def: SysReg<"minstreth", 0xB82>;
-def: SysReg<"mhpmcounter3h", 0xB83>;
-def: SysReg<"mhpmcounter4h", 0xB84>;
-def: SysReg<"mhpmcounter5h", 0xB85>;
-def: SysReg<"mhpmcounter6h", 0xB86>;
-def: SysReg<"mhpmcounter7h", 0xB87>;
-def: SysReg<"mhpmcounter8h", 0xB88>;
-def: SysReg<"mhpmcounter9h", 0xB89>;
-def: SysReg<"mhpmcounter10h", 0xB8A>;
-def: SysReg<"mhpmcounter11h", 0xB8B>;
-def: SysReg<"mhpmcounter12h", 0xB8C>;
-def: SysReg<"mhpmcounter13h", 0xB8D>;
-def: SysReg<"mhpmcounter14h", 0xB8E>;
-def: SysReg<"mhpmcounter15h", 0xB8F>;
-def: SysReg<"mhpmcounter16h", 0xB90>;
-def: SysReg<"mhpmcounter17h", 0xB91>;
-def: SysReg<"mhpmcounter18h", 0xB92>;
-def: SysReg<"mhpmcounter19h", 0xB93>;
-def: SysReg<"mhpmcounter20h", 0xB94>;
-def: SysReg<"mhpmcounter21h", 0xB95>;
-def: SysReg<"mhpmcounter22h", 0xB96>;
-def: SysReg<"mhpmcounter23h", 0xB97>;
-def: SysReg<"mhpmcounter24h", 0xB98>;
-def: SysReg<"mhpmcounter25h", 0xB99>;
-def: SysReg<"mhpmcounter26h", 0xB9A>;
-def: SysReg<"mhpmcounter27h", 0xB9B>;
-def: SysReg<"mhpmcounter28h", 0xB9C>;
-def: SysReg<"mhpmcounter29h", 0xB9D>;
-def: SysReg<"mhpmcounter30h", 0xB9E>;
-def: SysReg<"mhpmcounter31h", 0xB9F>;
+// mhpmcounter3h-mhpmcounter31h at 0xB83-0xB9F.
+foreach i = 3...31 in
+ def : SysReg<"mhpmcounter"#i#"h", !add(0xB83, !sub(i, 3))>;
}
-//===--------------------------
+//===----------------------------------------------------------------------===//
// Machine Counter Setup
-//===--------------------------
+//===----------------------------------------------------------------------===//
let AltName = "mucounteren" in // Privileged spec v1.9.1 Name
def : SysReg<"mcountinhibit", 0x320>;
-def : SysReg<"mhpmevent3", 0x323>;
-def : SysReg<"mhpmevent4", 0x324>;
-def : SysReg<"mhpmevent5", 0x325>;
-def : SysReg<"mhpmevent6", 0x326>;
-def : SysReg<"mhpmevent7", 0x327>;
-def : SysReg<"mhpmevent8", 0x328>;
-def : SysReg<"mhpmevent9", 0x329>;
-def : SysReg<"mhpmevent10", 0x32A>;
-def : SysReg<"mhpmevent11", 0x32B>;
-def : SysReg<"mhpmevent12", 0x32C>;
-def : SysReg<"mhpmevent13", 0x32D>;
-def : SysReg<"mhpmevent14", 0x32E>;
-def : SysReg<"mhpmevent15", 0x32F>;
-def : SysReg<"mhpmevent16", 0x330>;
-def : SysReg<"mhpmevent17", 0x331>;
-def : SysReg<"mhpmevent18", 0x332>;
-def : SysReg<"mhpmevent19", 0x333>;
-def : SysReg<"mhpmevent20", 0x334>;
-def : SysReg<"mhpmevent21", 0x335>;
-def : SysReg<"mhpmevent22", 0x336>;
-def : SysReg<"mhpmevent23", 0x337>;
-def : SysReg<"mhpmevent24", 0x338>;
-def : SysReg<"mhpmevent25", 0x339>;
-def : SysReg<"mhpmevent26", 0x33A>;
-def : SysReg<"mhpmevent27", 0x33B>;
-def : SysReg<"mhpmevent28", 0x33C>;
-def : SysReg<"mhpmevent29", 0x33D>;
-def : SysReg<"mhpmevent30", 0x33E>;
-def : SysReg<"mhpmevent31", 0x33F>;
+// mhpmevent3-mhpmevent31 at 0x323-0x33F.
+foreach i = 3...31 in
+ def : SysReg<"mhpmevent"#i, !add(0x323, !sub(i, 3))>;
-//===-----------------------------------------------
+// mhpmevent3h-mhpmevent31h at 0x723-0x73F.
+foreach i = 3...31 in {
+ let isRV32Only = 1 in
+ def : SysReg<"mhpmevent"#i#"h", !add(0x723, !sub(i, 3))>;
+}
+
+//===----------------------------------------------------------------------===//
// Debug/ Trace Registers (shared with Debug Mode)
-//===-----------------------------------------------
+//===----------------------------------------------------------------------===//
def : SysReg<"tselect", 0x7A0>;
def : SysReg<"tdata1", 0x7A1>;
def : SysReg<"tdata2", 0x7A2>;
def : SysReg<"tdata3", 0x7A3>;
+def : SysReg<"mcontext", 0x7A8>;
-//===-----------------------------------------------
+//===----------------------------------------------------------------------===//
// Debug Mode Registers
-//===-----------------------------------------------
+//===----------------------------------------------------------------------===//
def : SysReg<"dcsr", 0x7B0>;
def : SysReg<"dpc", 0x7B1>;
@@ -379,9 +345,9 @@ let AltName = "dscratch" in
def : SysReg<"dscratch0", 0x7B2>;
def : SysReg<"dscratch1", 0x7B3>;
-//===-----------------------------------------------
+//===----------------------------------------------------------------------===//
// User Vector CSRs
-//===-----------------------------------------------
+//===----------------------------------------------------------------------===//
def : SysReg<"vstart", 0x008>;
def : SysReg<"vxsat", 0x009>;
def : SysReg<"vxrm", 0x00A>;
@@ -389,3 +355,26 @@ def : SysReg<"vcsr", 0x00F>;
def : SysReg<"vl", 0xC20>;
def : SysReg<"vtype", 0xC21>;
def SysRegVLENB: SysReg<"vlenb", 0xC22>;
+
+//===----------------------------------------------------------------------===//
+// State Enable Extension (Smstateen)
+//===----------------------------------------------------------------------===//
+
+// sstateen0-sstateen3 at 0x10C-0x10F, mstateen0-mstateen3 at 0x30C-0x30F,
+// mstateen0h-mstateen3h at 0x31C-0x31F, hstateen0-hstateen3 at 0x60C-0x60F,
+// and hstateen0h-hstateen3h at 0x61C-0x61F.
+foreach i = 0...3 in {
+ def : SysReg<"sstateen"#i, !add(0x10C, i)>;
+ def : SysReg<"mstateen"#i, !add(0x30C, i)>;
+ let isRV32Only = 1 in
+ def : SysReg<"mstateen"#i#"h", !add(0x31C, i)>;
+ def : SysReg<"hstateen"#i, !add(0x60C, i)>;
+ let isRV32Only = 1 in
+ def : SysReg<"hstateen"#i#"h", !add(0x61C, i)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Entropy Source CSR
+//===----------------------------------------------------------------------===//
+
+def SEED : SysReg<"seed", 0x015>;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b421eba8d442..db5e2f1eeb6f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -39,6 +39,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
initializeGlobalISel(*PR);
initializeRISCVGatherScatterLoweringPass(*PR);
initializeRISCVMergeBaseOffsetOptPass(*PR);
+ initializeRISCVSExtWRemovalPass(*PR);
initializeRISCVExpandPseudoPass(*PR);
initializeRISCVInsertVSETVLIPass(*PR);
}
@@ -140,6 +141,7 @@ public:
void addPreEmitPass() override;
void addPreEmitPass2() override;
void addPreSched2() override;
+ void addMachineSSAOptimization() override;
void addPreRegAlloc() override;
};
} // namespace
@@ -194,6 +196,13 @@ void RISCVPassConfig::addPreEmitPass2() {
addPass(createRISCVExpandAtomicPseudoPass());
}
+void RISCVPassConfig::addMachineSSAOptimization() {
+ TargetPassConfig::addMachineSSAOptimization();
+
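+ // The sext.w pattern (ADDIW rd, rs, 0) only exists on RV64, so the removal
+ // pass is only added for riscv64 targets.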
+ if (TM->getTargetTriple().getArch() == Triple::riscv64)
+ addPass(createRISCVSExtWRemovalPass());
+}
+
void RISCVPassConfig::addPreRegAlloc() {
if (TM->getOptLevel() != CodeGenOpt::None)
addPass(createRISCVMergeBaseOffsetOptPass());
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index c435430a1288..99e6774a02e4 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -15,6 +15,13 @@ using namespace llvm;
#define DEBUG_TYPE "riscvtti"
+static cl::opt<unsigned> RVVRegisterWidthLMUL(
+ "riscv-v-register-bit-width-lmul",
+ cl::desc(
+ "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
+ "by autovectorized code. Fractional LMULs are not supported."),
+ cl::init(1), cl::Hidden);
+
InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy() &&
@@ -137,6 +144,24 @@ Optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
return BaseT::getMaxVScale();
}
+TypeSize
+RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
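+ // Clamp the requested LMUL to a power of 2 in [1, 8]; fractional LMULs are
+ // not supported by this option.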
+ unsigned LMUL = PowerOf2Floor(
+ std::max<unsigned>(std::min<unsigned>(RVVRegisterWidthLMUL, 8), 1));
+ switch (K) {
+ case TargetTransformInfo::RGK_Scalar:
+ return TypeSize::getFixed(ST->getXLen());
+ case TargetTransformInfo::RGK_FixedWidthVector:
+ return TypeSize::getFixed(
+ ST->hasVInstructions() ? LMUL * ST->getMinRVVVectorSizeInBits() : 0);
+ case TargetTransformInfo::RGK_ScalableVector:
+ return TypeSize::getScalable(
+ ST->hasVInstructions() ? LMUL * RISCV::RVVBitsPerBlock : 0);
+ }
+
+ llvm_unreachable("Unsupported register kind");
+}
+
InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
@@ -172,10 +197,7 @@ void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// Support explicit targets enabled for SiFive with the unrolling preferences
// below
bool UseDefaultPreferences = true;
- if (ST->getTuneCPU().contains("sifive-e76") ||
- ST->getTuneCPU().contains("sifive-s76") ||
- ST->getTuneCPU().contains("sifive-u74") ||
- ST->getTuneCPU().contains("sifive-7"))
+ if (ST->getProcFamily() == RISCVSubtarget::SiFive7)
UseDefaultPreferences = false;
if (UseDefaultPreferences)
@@ -253,3 +275,16 @@ void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP) {
BaseT::getPeelingPreferences(L, SE, PP);
}
+
+InstructionCost RISCVTTIImpl::getRegUsageForType(Type *Ty) {
+ TypeSize Size = Ty->getPrimitiveSizeInBits();
+ if (Ty->isVectorTy()) {
+ if (Size.isScalable() && ST->hasVInstructions())
+ return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
+
+ if (ST->useRVVForFixedLengthVectors())
+ return divideCeil(Size, ST->getMinRVVVectorSizeInBits());
+ }
+
+ return BaseT::getRegUsageForType(Ty);
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 7353496f4684..e79c4f75712b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -58,20 +58,9 @@ public:
bool supportsScalableVectors() const { return ST->hasVInstructions(); }
Optional<unsigned> getMaxVScale() const;
- TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
- switch (K) {
- case TargetTransformInfo::RGK_Scalar:
- return TypeSize::getFixed(ST->getXLen());
- case TargetTransformInfo::RGK_FixedWidthVector:
- return TypeSize::getFixed(
- ST->hasVInstructions() ? ST->getMinRVVVectorSizeInBits() : 0);
- case TargetTransformInfo::RGK_ScalableVector:
- return TypeSize::getScalable(
- ST->hasVInstructions() ? RISCV::RVVBitsPerBlock : 0);
- }
+ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
- llvm_unreachable("Unsupported register kind");
- }
+ InstructionCost getRegUsageForType(Type *Ty);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
@@ -81,7 +70,7 @@ public:
TTI::PeelingPreferences &PP);
unsigned getMinVectorRegisterBitWidth() const {
- return ST->hasVInstructions() ? ST->getMinRVVVectorSizeInBits() : 0;
+ return ST->useRVVForFixedLengthVectors() ? 16 : 0;
}
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
@@ -189,6 +178,20 @@ public:
// Let regular unroll to unroll the loop.
return VF == 1 ? 1 : ST->getMaxInterleaveFactor();
}
+
+ // TODO: We should define RISC-V's own register classes.
+ // e.g. register class for FPR.
+ unsigned getNumberOfRegisters(unsigned ClassID) const {
+ bool Vector = (ClassID == 1);
+ if (Vector) {
+ if (ST->hasVInstructions())
+ return 32;
+ return 0;
+ }
+ // 31 = 32 GPR - x0 (zero register)
+ // FIXME: Should we exclude fixed registers like SP, TP or GP?
+ return 31;
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 48e6903bd1b1..af3304f0907d 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -257,7 +257,7 @@ private:
};
public:
- SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+ SparcOperand(KindTy K) : Kind(K) {}
bool isToken() const override { return Kind == k_Token; }
bool isReg() const override { return Kind == k_Register; }
diff --git a/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index afb69899e724..c5d0f1de7dfd 100644
--- a/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -66,7 +66,7 @@ private:
} // end anonymous namespace
SDNode* SparcDAGToDAGISel::getGlobalBaseReg() {
- unsigned GlobalBaseReg = Subtarget->getInstrInfo()->getGlobalBaseReg(MF);
+ Register GlobalBaseReg = Subtarget->getInstrInfo()->getGlobalBaseReg(MF);
return CurDAG->getRegister(GlobalBaseReg,
TLI->getPointerTy(CurDAG->getDataLayout()))
.getNode();
@@ -168,8 +168,7 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){
// placement.
SDLoc dl(N);
- SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1)
- : SDValue(nullptr,0);
+ SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps - 1) : SDValue();
SmallVector<bool, 8> OpChanged;
// Glue node will be appended late.
@@ -221,8 +220,8 @@ bool SparcDAGToDAGISel::tryInlineAsm(SDNode *N){
assert((i+2 < NumOps) && "Invalid number of operands in inline asm");
SDValue V0 = N->getOperand(i+1);
SDValue V1 = N->getOperand(i+2);
- unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg();
- unsigned Reg1 = cast<RegisterSDNode>(V1)->getReg();
+ Register Reg0 = cast<RegisterSDNode>(V0)->getReg();
+ Register Reg1 = cast<RegisterSDNode>(V1)->getReg();
SDValue PairedReg;
MachineRegisterInfo &MRI = MF->getRegInfo();
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index ed1faf6b1fe8..6d6879bc94b3 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -826,7 +826,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
// sret only allowed on first argument
assert(Outs[realArgIdx].OrigArgIndex == 0);
PointerType *Ty = cast<PointerType>(CLI.getArgs()[0].Ty);
- Type *ElementTy = Ty->getElementType();
+ Type *ElementTy = Ty->getPointerElementType();
SRetArgSize = DAG.getDataLayout().getTypeAllocSize(ElementTy);
continue;
}
@@ -2684,7 +2684,7 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
SDValue RetAddr;
if (depth == 0) {
auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
- unsigned RetReg = MF.addLiveIn(SP::I7, TLI.getRegClassFor(PtrVT));
+ Register RetReg = MF.addLiveIn(SP::I7, TLI.getRegClassFor(PtrVT));
RetAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, RetReg, VT);
return RetAddr;
}
@@ -3245,7 +3245,7 @@ LowerAsmOperandForConstraint(SDValue Op,
std::string &Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
- SDValue Result(nullptr, 0);
+ SDValue Result;
// Only support length 1 constraints for now.
if (Constraint.length() > 1)
diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.h b/llvm/lib/Target/Sparc/SparcTargetObjectFile.h
index 9bbe602b32b3..f30ddc7b4955 100644
--- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.h
+++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.h
@@ -18,9 +18,7 @@ class TargetMachine;
class SparcELFTargetObjectFile : public TargetLoweringObjectFileELF {
public:
- SparcELFTargetObjectFile() :
- TargetLoweringObjectFileELF()
- {}
+ SparcELFTargetObjectFile() {}
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index 899fec6c3328..e76fa03af3bf 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -23,11 +23,7 @@ class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
-class StringRef;
class Target;
-class Triple;
-class raw_pwrite_stream;
-class raw_ostream;
namespace SystemZMC {
// How many bytes are in the ABI-defined, caller-allocated part of
diff --git a/llvm/lib/Target/SystemZ/SystemZ.h b/llvm/lib/Target/SystemZ/SystemZ.h
index bedbd061ea5c..5be19f0e3b46 100644
--- a/llvm/lib/Target/SystemZ/SystemZ.h
+++ b/llvm/lib/Target/SystemZ/SystemZ.h
@@ -20,6 +20,7 @@
namespace llvm {
class SystemZTargetMachine;
class FunctionPass;
+class PassRegistry;
namespace SystemZ {
// Condition-code mask values.
@@ -196,6 +197,15 @@ FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZCopyPhysRegsPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZPostRewritePass(SystemZTargetMachine &TM);
FunctionPass *createSystemZTDCPass();
+
+void initializeSystemZElimComparePass(PassRegistry &);
+void initializeSystemZShortenInstPass(PassRegistry &);
+void initializeSystemZLongBranchPass(PassRegistry &);
+void initializeSystemZLDCleanupPass(PassRegistry &);
+void initializeSystemZCopyPhysRegsPass(PassRegistry &);
+void initializeSystemZPostRewritePass(PassRegistry &);
+void initializeSystemZTDCPassPass(PassRegistry &);
+
} // end namespace llvm
#endif
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index defab665f924..e01adcce04ab 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -786,6 +786,50 @@ void SystemZAsmPrinter::emitEndOfAsmFile(Module &M) {
emitStackMaps(SM);
}
+void SystemZAsmPrinter::emitFunctionEntryLabel() {
+ const SystemZSubtarget &Subtarget =
+ static_cast<const SystemZSubtarget &>(MF->getSubtarget());
+
+ if (Subtarget.getTargetTriple().isOSzOS()) {
+ MCContext &OutContext = OutStreamer->getContext();
+ MCSymbol *EPMarkerSym = OutContext.createTempSymbol("CM_", true);
+
+ // EntryPoint Marker
+ const MachineFrameInfo &MFFrame = MF->getFrameInfo();
+ bool IsUsingAlloca = MFFrame.hasVarSizedObjects();
+
+ // Set Flags
+ uint8_t Flags = 0;
+ if (IsUsingAlloca)
+ Flags |= 0x04;
+
+ uint32_t DSASize = MFFrame.getStackSize();
+
+ // Pack DSASize, rounded down to a multiple of 32, into the top 27 bits and
+ // Flags into the bottom 5 bits.
+ uint32_t DSAAndFlags = DSASize & 0xFFFFFFE0; // (x/32) << 5
+ DSAAndFlags |= Flags;
+
+ // Emit entry point marker section.
+ OutStreamer->AddComment("XPLINK Routine Layout Entry");
+ OutStreamer->emitLabel(EPMarkerSym);
+ OutStreamer->AddComment("Eyecatcher 0x00C300C500C500");
+ OutStreamer->emitIntValueInHex(0x00C300C500C500, 7); // Eyecatcher.
+ OutStreamer->AddComment("Mark Type C'1'");
+ OutStreamer->emitInt8(0xF1); // Mark Type.
+ if (OutStreamer->isVerboseAsm()) {
+ OutStreamer->AddComment("DSA Size 0x" + Twine::utohexstr(DSASize));
+ OutStreamer->AddComment("Entry Flags");
+ if (Flags & 0x04)
+ OutStreamer->AddComment(" Bit 2: 1 = Uses alloca");
+ else
+ OutStreamer->AddComment(" Bit 2: 0 = Does not use alloca");
+ }
+ OutStreamer->emitInt32(DSAAndFlags);
+ }
+
+ AsmPrinter::emitFunctionEntryLabel();
+}
+
// Force static initialization.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZAsmPrinter() {
RegisterAsmPrinter<SystemZAsmPrinter> X(getTheSystemZTarget());
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
index 6cfd7bd4c486..80d68d1b93ff 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -19,7 +19,6 @@
namespace llvm {
class MCStreamer;
-class MachineBasicBlock;
class MachineInstr;
class Module;
class raw_ostream;
@@ -52,6 +51,7 @@ public:
SM.reset();
return AsmPrinter::doInitialization(M);
}
+ void emitFunctionEntryLabel() override;
private:
void LowerFENTRY_CALL(const MachineInstr &MI, SystemZMCInstLower &MCIL);
diff --git a/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp b/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp
index 7d21d29d270e..763aa8c0e41f 100644
--- a/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp
@@ -25,12 +25,6 @@
using namespace llvm;
-#define SYSTEMZ_COPYPHYSREGS_NAME "SystemZ Copy Physregs"
-
-namespace llvm {
- void initializeSystemZCopyPhysRegsPass(PassRegistry&);
-}
-
namespace {
class SystemZCopyPhysRegs : public MachineFunctionPass {
@@ -41,8 +35,6 @@ public:
initializeSystemZCopyPhysRegsPass(*PassRegistry::getPassRegistry());
}
- StringRef getPassName() const override { return SYSTEMZ_COPYPHYSREGS_NAME; }
-
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
@@ -59,7 +51,7 @@ char SystemZCopyPhysRegs::ID = 0;
} // end anonymous namespace
INITIALIZE_PASS(SystemZCopyPhysRegs, "systemz-copy-physregs",
- SYSTEMZ_COPYPHYSREGS_NAME, false, false)
+ "SystemZ Copy Physregs", false, false)
FunctionPass *llvm::createSystemZCopyPhysRegsPass(SystemZTargetMachine &TM) {
return new SystemZCopyPhysRegs();
diff --git a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
index 631cbff303e8..4893acc81335 100644
--- a/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -65,11 +65,8 @@ class SystemZElimCompare : public MachineFunctionPass {
public:
static char ID;
- SystemZElimCompare(const SystemZTargetMachine &tm)
- : MachineFunctionPass(ID) {}
-
- StringRef getPassName() const override {
- return "SystemZ Comparison Elimination";
+ SystemZElimCompare() : MachineFunctionPass(ID) {
+ initializeSystemZElimComparePass(*PassRegistry::getPassRegistry());
}
bool processBlock(MachineBasicBlock &MBB);
@@ -106,6 +103,9 @@ char SystemZElimCompare::ID = 0;
} // end anonymous namespace
+INITIALIZE_PASS(SystemZElimCompare, DEBUG_TYPE,
+ "SystemZ Comparison Elimination", false, false)
+
// Returns true if MI is an instruction whose output equals the value in Reg.
static bool preservesValueOf(MachineInstr &MI, unsigned Reg) {
switch (MI.getOpcode()) {
@@ -746,5 +746,5 @@ bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) {
}
FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) {
- return new SystemZElimCompare(TM);
+ return new SystemZElimCompare();
}
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 99ab4c5455d6..ccc7d0737f53 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -103,7 +103,7 @@ bool SystemZELFFrameLowering::assignCalleeSavedSpillSlots(
unsigned HighGPR = SystemZ::R15D;
int StartSPOffset = SystemZMC::ELFCallFrameSize;
for (auto &CS : CSI) {
- unsigned Reg = CS.getReg();
+ Register Reg = CS.getReg();
int Offset = getRegSpillOffset(MF, Reg);
if (Offset) {
if (SystemZ::GR64BitRegClass.contains(Reg) && StartSPOffset > Offset) {
@@ -124,7 +124,7 @@ bool SystemZELFFrameLowering::assignCalleeSavedSpillSlots(
// Also save the GPR varargs, if any. R6D is call-saved, so would
// already be included, but we also need to handle the call-clobbered
// argument registers.
- unsigned FirstGPR = ZFI->getVarArgsFirstGPR();
+ Register FirstGPR = ZFI->getVarArgsFirstGPR();
if (FirstGPR < SystemZ::ELFNumArgGPRs) {
unsigned Reg = SystemZ::ELFArgGPRs[FirstGPR];
int Offset = getRegSpillOffset(MF, Reg);
@@ -143,7 +143,7 @@ bool SystemZELFFrameLowering::assignCalleeSavedSpillSlots(
for (auto &CS : CSI) {
if (CS.getFrameIdx() != INT32_MAX)
continue;
- unsigned Reg = CS.getReg();
+ Register Reg = CS.getReg();
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
unsigned Size = TRI->getSpillSize(*RC);
CurrOffset -= Size;
@@ -271,7 +271,7 @@ bool SystemZELFFrameLowering::spillCalleeSavedRegisters(
// Make sure all call-saved GPRs are included as operands and are
// marked as live on entry.
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (SystemZ::GR64BitRegClass.contains(Reg))
addSavedGPR(MBB, MIB, Reg, true);
}
@@ -284,7 +284,7 @@ bool SystemZELFFrameLowering::spillCalleeSavedRegisters(
// Save FPRs/VRs in the normal TargetInstrInfo way.
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (SystemZ::FP64BitRegClass.contains(Reg)) {
MBB.addLiveIn(Reg);
TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(),
@@ -314,7 +314,7 @@ bool SystemZELFFrameLowering::restoreCalleeSavedRegisters(
// Restore FPRs/VRs in the normal TargetInstrInfo way.
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (SystemZ::FP64BitRegClass.contains(Reg))
TII->loadRegFromStackSlot(MBB, MBBI, Reg, I.getFrameIdx(),
&SystemZ::FP64BitRegClass, TRI);
@@ -346,7 +346,7 @@ bool SystemZELFFrameLowering::restoreCalleeSavedRegisters(
// Do a second scan adding regs as being defined by instruction
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (Reg != RestoreGPRs.LowGPR && Reg != RestoreGPRs.HighGPR &&
SystemZ::GR64BitRegClass.contains(Reg))
MIB.addReg(Reg, RegState::ImplicitDefine);
@@ -500,7 +500,7 @@ void SystemZELFFrameLowering::emitPrologue(MachineFunction &MF,
// Add CFI for the GPR saves.
for (auto &Save : CSI) {
- unsigned Reg = Save.getReg();
+ Register Reg = Save.getReg();
if (SystemZ::GR64BitRegClass.contains(Reg)) {
int FI = Save.getFrameIdx();
int64_t Offset = MFFrame.getObjectOffset(FI);
@@ -580,7 +580,7 @@ void SystemZELFFrameLowering::emitPrologue(MachineFunction &MF,
// Skip over the FPR/VR saves.
SmallVector<unsigned, 8> CFIIndexes;
for (auto &Save : CSI) {
- unsigned Reg = Save.getReg();
+ Register Reg = Save.getReg();
if (SystemZ::FP64BitRegClass.contains(Reg)) {
if (MBBI != MBB.end() &&
(MBBI->getOpcode() == SystemZ::STD ||
@@ -764,8 +764,7 @@ void SystemZELFFrameLowering::inlineStackProbe(
bool SystemZELFFrameLowering::hasFP(const MachineFunction &MF) const {
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
- MF.getFrameInfo().hasVarSizedObjects() ||
- MF.getInfo<SystemZMachineFunctionInfo>()->getManipulatesSP());
+ MF.getFrameInfo().hasVarSizedObjects());
}
StackOffset SystemZELFFrameLowering::getFrameIndexReference(
@@ -850,7 +849,7 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots(
auto ProcessCSI = [&](std::vector<CalleeSavedInfo> &CSIList) {
for (auto &CS : CSIList) {
- unsigned Reg = CS.getReg();
+ Register Reg = CS.getReg();
int Offset = RegSpillOffsets[Reg];
if (Offset >= 0) {
if (GRRegClass.contains(Reg)) {
@@ -895,7 +894,7 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots(
for (auto &CS : CSI) {
if (CS.getFrameIdx() != INT32_MAX)
continue;
- unsigned Reg = CS.getReg();
+ Register Reg = CS.getReg();
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
Align Alignment = TRI->getSpillAlign(*RC);
unsigned Size = TRI->getSpillSize(*RC);
@@ -966,7 +965,7 @@ bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters(
// marked as live on entry.
auto &GRRegClass = SystemZ::GR64BitRegClass;
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (GRRegClass.contains(Reg))
addSavedGPR(MBB, MIB, Reg, true);
}
@@ -974,7 +973,7 @@ bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters(
// Spill FPRs to the stack in the normal TargetInstrInfo way
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (SystemZ::FP64BitRegClass.contains(Reg)) {
MBB.addLiveIn(Reg);
TII->storeRegToStackSlot(MBB, MBBI, Reg, true, I.getFrameIdx(),
@@ -1007,7 +1006,7 @@ bool SystemZXPLINKFrameLowering::restoreCalleeSavedRegisters(
// Restore FPRs in the normal TargetInstrInfo way.
for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
- unsigned Reg = CSI[I].getReg();
+ Register Reg = CSI[I].getReg();
if (SystemZ::FP64BitRegClass.contains(Reg))
TII->loadRegFromStackSlot(MBB, MBBI, Reg, CSI[I].getFrameIdx(),
&SystemZ::FP64BitRegClass, TRI);
@@ -1041,7 +1040,7 @@ bool SystemZXPLINKFrameLowering::restoreCalleeSavedRegisters(
// Do a second scan adding regs as being defined by instruction
for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
- unsigned Reg = CSI[I].getReg();
+ Register Reg = CSI[I].getReg();
if (Reg > RestoreGPRs.LowGPR && Reg < RestoreGPRs.HighGPR)
MIB.addReg(Reg, RegState::ImplicitDefine);
}
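The recurring unsigned -> Register change in the frame-lowering hunks above relies on llvm::Register being a thin wrapper around the raw register id, so comparisons and register-class queries at the existing call sites keep compiling unchanged. A minimal sketch of that property (the helper functions below are illustrative only, not part of the patch):

  #include "llvm/CodeGen/Register.h"

  // Register converts implicitly to and from the raw unsigned id, so call
  // sites such as GR64BitRegClass.contains(Reg) are unaffected; the type
  // merely documents that the value is a register.
  static unsigned rawId(llvm::Register Reg) { return Reg; }   // Register -> unsigned
  static llvm::Register fromRaw(unsigned Id) { return Id; }   // unsigned -> Register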
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
index 106b9e8ebe06..3a1af888d8f9 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -17,7 +17,6 @@
#include "llvm/Support/TypeSize.h"
namespace llvm {
-class SystemZTargetMachine;
class SystemZSubtarget;
class SystemZFrameLowering : public TargetFrameLowering {
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 39a82e2c07e0..cf55318d328d 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -62,8 +62,7 @@ struct SystemZAddressingMode {
bool IncludesDynAlloc;
SystemZAddressingMode(AddrForm form, DispRange dr)
- : Form(form), DR(dr), Base(), Disp(0), Index(),
- IncludesDynAlloc(false) {}
+ : Form(form), DR(dr), Disp(0), IncludesDynAlloc(false) {}
// True if the address can have an index register.
bool hasIndexField() { return Form != FormBD; }
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 24de52850771..f10651d5c5d7 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -318,8 +318,6 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, PtrVT, Custom);
- // Use custom expanders so that we can force the function to use
- // a frame pointer.
setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
@@ -1571,7 +1569,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
int FI =
MFI.CreateFixedObject(8, -SystemZMC::ELFCallFrameSize + Offset, true);
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
- unsigned VReg = MF.addLiveIn(SystemZ::ELFArgFPRs[I],
+ Register VReg = MF.addLiveIn(SystemZ::ELFArgFPRs[I],
&SystemZ::FP64BitRegClass);
SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f64);
MemOps[I] = DAG.getStore(ArgValue.getValue(1), DL, ArgValue, FIN,
@@ -3417,7 +3415,7 @@ SDValue SystemZTargetLowering::lowerRETURNADDR(SDValue Op,
}
// Return R14D, which has the return address. Mark it an implicit live-in.
- unsigned LinkReg = MF.addLiveIn(SystemZ::R14D, &SystemZ::GR64BitRegClass);
+ Register LinkReg = MF.addLiveIn(SystemZ::R14D, &SystemZ::GR64BitRegClass);
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LinkReg, PtrVT);
}
@@ -4194,7 +4192,6 @@ SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
const SystemZSubtarget *Subtarget = &MF.getSubtarget<SystemZSubtarget>();
auto *Regs = Subtarget->getSpecialRegisters();
- MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
report_fatal_error("Variable-sized stack allocations are not supported "
"in GHC calling convention");
@@ -4207,7 +4204,6 @@ SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
const SystemZSubtarget *Subtarget = &MF.getSubtarget<SystemZSubtarget>();
auto *Regs = Subtarget->getSpecialRegisters();
- MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
bool StoreBackchain = MF.getFunction().hasFnAttribute("backchain");
if (MF.getFunction().getCallingConv() == CallingConv::GHC)
@@ -8318,13 +8314,11 @@ MachineBasicBlock *SystemZTargetLowering::emitTransactionBegin(
// Add FPR/VR clobbers.
if (!NoFloat && (Control & 4) != 0) {
if (Subtarget.hasVector()) {
- for (int I = 0; I < 32; I++) {
- unsigned Reg = SystemZMC::VR128Regs[I];
+ for (unsigned Reg : SystemZMC::VR128Regs) {
MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
}
} else {
- for (int I = 0; I < 16; I++) {
- unsigned Reg = SystemZMC::FP64Regs[I];
+ for (unsigned Reg : SystemZMC::FP64Regs) {
MI.addOperand(MachineOperand::CreateReg(Reg, true, true));
}
}
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 940c0a857ea4..a8ddb8c62d18 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -381,7 +381,6 @@ enum {
} // end namespace SystemZICMP
class SystemZSubtarget;
-class SystemZTargetMachine;
class SystemZTargetLowering : public TargetLowering {
public:
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index e80496e37781..6db9bf3056b7 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -1309,7 +1309,7 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
// allocated regs are in an FP reg-class per previous check above.
for (const MachineOperand &MO : MIB->operands())
if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (MRI.getRegClass(Reg) == &SystemZ::VR32BitRegClass)
MRI.setRegClass(Reg, &SystemZ::FP32BitRegClass);
else if (MRI.getRegClass(Reg) == &SystemZ::VR64BitRegClass)
diff --git a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
index 06d893d043e9..d6c795985448 100644
--- a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -29,11 +29,8 @@ namespace {
class SystemZLDCleanup : public MachineFunctionPass {
public:
static char ID;
- SystemZLDCleanup(const SystemZTargetMachine &tm)
- : MachineFunctionPass(ID), TII(nullptr), MF(nullptr) {}
-
- StringRef getPassName() const override {
- return "SystemZ Local Dynamic TLS Access Clean-up";
+ SystemZLDCleanup() : MachineFunctionPass(ID), TII(nullptr), MF(nullptr) {
+ initializeSystemZLDCleanupPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -52,8 +49,11 @@ char SystemZLDCleanup::ID = 0;
} // end anonymous namespace
+INITIALIZE_PASS(SystemZLDCleanup, "systemz-ld-cleanup",
+ "SystemZ Local Dynamic TLS Access Clean-up", false, false)
+
FunctionPass *llvm::createSystemZLDCleanupPass(SystemZTargetMachine &TM) {
- return new SystemZLDCleanup(TM);
+ return new SystemZLDCleanup();
}
void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const {
diff --git a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
index 9c985c16f082..d53693154d40 100644
--- a/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -135,10 +135,9 @@ class SystemZLongBranch : public MachineFunctionPass {
public:
static char ID;
- SystemZLongBranch(const SystemZTargetMachine &tm)
- : MachineFunctionPass(ID) {}
-
- StringRef getPassName() const override { return "SystemZ Long Branch"; }
+ SystemZLongBranch() : MachineFunctionPass(ID) {
+ initializeSystemZLongBranchPass(*PassRegistry::getPassRegistry());
+ }
bool runOnMachineFunction(MachineFunction &F) override;
@@ -174,6 +173,9 @@ const uint64_t MaxForwardRange = 0xfffe;
} // end anonymous namespace
+INITIALIZE_PASS(SystemZLongBranch, DEBUG_TYPE, "SystemZ Long Branch", false,
+ false)
+
// Position describes the state immediately before Block. Update Block
// accordingly and move Position to the end of the block's non-terminator
// instructions.
@@ -481,5 +483,5 @@ bool SystemZLongBranch::runOnMachineFunction(MachineFunction &F) {
}
FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) {
- return new SystemZLongBranch(TM);
+ return new SystemZLongBranch();
}
diff --git a/llvm/lib/Target/SystemZ/SystemZMCInstLower.h b/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
index 14ad06488312..eb09033d1850 100644
--- a/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
+++ b/llvm/lib/Target/SystemZ/SystemZMCInstLower.h
@@ -18,7 +18,6 @@ class MCInst;
class MCOperand;
class MachineInstr;
class MachineOperand;
-class Mangler;
class SystemZAsmPrinter;
class LLVM_LIBRARY_VISIBILITY SystemZMCInstLower {
diff --git a/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
index f755d5cd3d5b..ec4b812eb0e1 100644
--- a/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -34,14 +34,12 @@ class SystemZMachineFunctionInfo : public MachineFunctionInfo {
unsigned VarArgsFrameIndex;
unsigned RegSaveFrameIndex;
int FramePointerSaveIndex;
- bool ManipulatesSP;
unsigned NumLocalDynamics;
public:
explicit SystemZMachineFunctionInfo(MachineFunction &MF)
: VarArgsFirstGPR(0), VarArgsFirstFPR(0), VarArgsFrameIndex(0),
- RegSaveFrameIndex(0), FramePointerSaveIndex(0), ManipulatesSP(false),
- NumLocalDynamics(0) {}
+ RegSaveFrameIndex(0), FramePointerSaveIndex(0), NumLocalDynamics(0) {}
// Get and set the first and last call-saved GPR that should be saved by
// this function and the SP offset for the STMG. These are 0 if no GPRs
@@ -85,11 +83,6 @@ public:
int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
- // Get and set whether the function directly manipulates the stack pointer,
- // e.g. through STACKSAVE or STACKRESTORE.
- bool getManipulatesSP() const { return ManipulatesSP; }
- void setManipulatesSP(bool MSP) { ManipulatesSP = MSP; }
-
// Count number of local-dynamic TLS symbols used.
unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
diff --git a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp
index aaa7f8fc88f5..5a2cfc53da49 100644
--- a/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZPostRewrite.cpp
@@ -21,16 +21,10 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;
-#define SYSTEMZ_POSTREWRITE_NAME "SystemZ Post Rewrite pass"
-
#define DEBUG_TYPE "systemz-postrewrite"
STATISTIC(MemFoldCopies, "Number of copies inserted before folded mem ops.");
STATISTIC(LOCRMuxJumps, "Number of LOCRMux jump-sequences (lower is better)");
-namespace llvm {
- void initializeSystemZPostRewritePass(PassRegistry&);
-}
-
namespace {
class SystemZPostRewrite : public MachineFunctionPass {
@@ -44,8 +38,6 @@ public:
bool runOnMachineFunction(MachineFunction &Fn) override;
- StringRef getPassName() const override { return SYSTEMZ_POSTREWRITE_NAME; }
-
private:
void selectLOCRMux(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -70,7 +62,7 @@ char SystemZPostRewrite::ID = 0;
} // end anonymous namespace
INITIALIZE_PASS(SystemZPostRewrite, "systemz-post-rewrite",
- SYSTEMZ_POSTREWRITE_NAME, false, false)
+ "SystemZ Post Rewrite pass", false, false)
/// Returns an instance of the Post Rewrite pass.
FunctionPass *llvm::createSystemZPostRewritePass(SystemZTargetMachine &TM) {
@@ -178,15 +170,15 @@ bool SystemZPostRewrite::expandCondMove(MachineBasicBlock &MBB,
MF.insert(std::next(MachineFunction::iterator(MBB)), RestMBB);
RestMBB->splice(RestMBB->begin(), &MBB, MI, MBB.end());
RestMBB->transferSuccessors(&MBB);
- for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
- RestMBB->addLiveIn(*I);
+ for (MCPhysReg R : LiveRegs)
+ RestMBB->addLiveIn(R);
// Create a new block MoveMBB to hold the move instruction.
MachineBasicBlock *MoveMBB = MF.CreateMachineBasicBlock(BB);
MF.insert(std::next(MachineFunction::iterator(MBB)), MoveMBB);
MoveMBB->addLiveIn(SrcReg);
- for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I)
- MoveMBB->addLiveIn(*I);
+ for (MCPhysReg R : LiveRegs)
+ MoveMBB->addLiveIn(R);
// At the end of MBB, create a conditional branch to RestMBB if the
// condition is false, otherwise fall through to MoveMBB.
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index a4a5b1fbdf90..da6725777e43 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -17,8 +17,6 @@
namespace llvm {
-class SystemZTargetMachine;
-
class SystemZSelectionDAGInfo : public SelectionDAGTargetInfo {
public:
explicit SystemZSelectionDAGInfo() = default;
diff --git a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
index 254e5e92449b..92930dad80ef 100644
--- a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -26,11 +26,7 @@ namespace {
class SystemZShortenInst : public MachineFunctionPass {
public:
static char ID;
- SystemZShortenInst(const SystemZTargetMachine &tm);
-
- StringRef getPassName() const override {
- return "SystemZ Instruction Shortening";
- }
+ SystemZShortenInst();
bool processBlock(MachineBasicBlock &MBB);
bool runOnMachineFunction(MachineFunction &F) override;
@@ -56,12 +52,17 @@ private:
char SystemZShortenInst::ID = 0;
} // end anonymous namespace
+INITIALIZE_PASS(SystemZShortenInst, DEBUG_TYPE,
+ "SystemZ Instruction Shortening", false, false)
+
FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) {
- return new SystemZShortenInst(TM);
+ return new SystemZShortenInst();
}
-SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm)
- : MachineFunctionPass(ID), TII(nullptr) {}
+SystemZShortenInst::SystemZShortenInst()
+ : MachineFunctionPass(ID), TII(nullptr) {
+ initializeSystemZShortenInstPass(*PassRegistry::getPassRegistry());
+}
// Tie operands if MI has become a two-address instruction.
static void tieOpsIfNeeded(MachineInstr &MI) {
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index 0f03d96655bf..75c0d454d904 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -89,7 +89,7 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
HasSoftFloat(false), TargetTriple(TT),
SpecialRegisters(initializeSpecialRegisters()),
InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this),
- TSInfo(), FrameLowering(SystemZFrameLowering::create(*this)) {}
+ FrameLowering(SystemZFrameLowering::create(*this)) {}
bool SystemZSubtarget::enableSubRegLiveness() const {
return UseSubRegLiveness;
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
index 67c5b8eb09b6..98f7094fcb48 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -85,7 +85,7 @@ private:
SystemZSubtarget &initializeSubtargetDependencies(StringRef CPU,
StringRef FS);
- SystemZCallingConventionRegisters *initializeSpecialRegisters(void);
+ SystemZCallingConventionRegisters *initializeSpecialRegisters();
public:
SystemZSubtarget(const Triple &TT, const std::string &CPU,
diff --git a/llvm/lib/Target/SystemZ/SystemZTDC.cpp b/llvm/lib/Target/SystemZ/SystemZTDC.cpp
index 7cb7dca2ea28..f62afb8ddfcf 100644
--- a/llvm/lib/Target/SystemZ/SystemZTDC.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTDC.cpp
@@ -61,10 +61,6 @@
using namespace llvm;
-namespace llvm {
- void initializeSystemZTDCPassPass(PassRegistry&);
-}
-
namespace {
class SystemZTDCPass : public FunctionPass {
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index deb3358102ed..f1469fe8f56b 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -32,6 +32,14 @@ using namespace llvm;
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
// Register the target.
RegisterTargetMachine<SystemZTargetMachine> X(getTheSystemZTarget());
+ auto &PR = *PassRegistry::getPassRegistry();
+ initializeSystemZElimComparePass(PR);
+ initializeSystemZShortenInstPass(PR);
+ initializeSystemZLongBranchPass(PR);
+ initializeSystemZLDCleanupPass(PR);
+ initializeSystemZShortenInstPass(PR);
+ initializeSystemZPostRewritePass(PR);
+ initializeSystemZTDCPassPass(PR);
}
// Determine whether we use the vector ABI.
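The SystemZ hunks above all apply the same pass-registration pattern: drop the TargetMachine constructor argument, register the pass with INITIALIZE_PASS (which also supplies the name previously returned by getPassName()), self-initialize in the default constructor, and initialize once more in LLVMInitializeSystemZTarget. A condensed sketch of that pattern with a made-up pass (SystemZFoo and systemz-foo are placeholders, not passes in this patch):

  #include "llvm/CodeGen/MachineFunctionPass.h"
  #include "llvm/PassRegistry.h"
  using namespace llvm;

  namespace llvm {
  void initializeSystemZFooPass(PassRegistry &); // normally declared in a target header
  } // namespace llvm

  namespace {
  class SystemZFoo : public MachineFunctionPass {
  public:
    static char ID;
    SystemZFoo() : MachineFunctionPass(ID) {
      // Self-registration keeps the pass usable even when constructed directly.
      initializeSystemZFooPass(*PassRegistry::getPassRegistry());
    }
    bool runOnMachineFunction(MachineFunction &MF) override { return false; }
  };
  char SystemZFoo::ID = 0;
  } // end anonymous namespace

  // Supplies the command-line name and description, replacing getPassName().
  INITIALIZE_PASS(SystemZFoo, "systemz-foo", "SystemZ Foo pass", false, false)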
diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
index fd9dc32b04f5..4a318e493c52 100644
--- a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
+++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -210,7 +210,7 @@ private:
};
public:
- VEOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+ VEOperand(KindTy K) : Kind(K) {}
bool isToken() const override { return Kind == k_Token; }
bool isReg() const override { return Kind == k_Register; }
diff --git a/llvm/lib/Target/VE/LVLGen.cpp b/llvm/lib/Target/VE/LVLGen.cpp
index c4588926af9e..4db6a59284c2 100644
--- a/llvm/lib/Target/VE/LVLGen.cpp
+++ b/llvm/lib/Target/VE/LVLGen.cpp
@@ -125,8 +125,8 @@ bool LVLGen::runOnMachineFunction(MachineFunction &F) {
TII = Subtarget.getInstrInfo();
TRI = Subtarget.getRegisterInfo();
- for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
- Changed |= runOnMachineBasicBlock(*FI);
+ for (MachineBasicBlock &MBB : F)
+ Changed |= runOnMachineBasicBlock(MBB);
if (Changed) {
LLVM_DEBUG(dbgs() << "\n");
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h
index 7fb8a556aa74..f0bb6e3acdee 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h
@@ -27,10 +27,6 @@ class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
class Target;
-class Triple;
-class StringRef;
-class raw_pwrite_stream;
-class raw_ostream;
MCCodeEmitter *createVEMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI, MCContext &Ctx);
diff --git a/llvm/lib/Target/VE/VE.h b/llvm/lib/Target/VE/VE.h
index 8c1fa840f19c..2a729a1a311c 100644
--- a/llvm/lib/Target/VE/VE.h
+++ b/llvm/lib/Target/VE/VE.h
@@ -22,7 +22,6 @@
namespace llvm {
class FunctionPass;
class VETargetMachine;
-class formatted_raw_ostream;
class AsmPrinter;
class MCInst;
class MachineInstr;
diff --git a/llvm/lib/Target/VE/VECustomDAG.cpp b/llvm/lib/Target/VE/VECustomDAG.cpp
new file mode 100644
index 000000000000..af3e4af13814
--- /dev/null
+++ b/llvm/lib/Target/VE/VECustomDAG.cpp
@@ -0,0 +1,81 @@
+//===-- VECustomDAG.cpp - VE Custom DAG Nodes ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the VECustomDAG helpers that VE uses to lower LLVM
+// code into a selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "VECustomDAG.h"
+
+#ifndef DEBUG_TYPE
+#define DEBUG_TYPE "vecustomdag"
+#endif
+
+namespace llvm {
+
+static const int StandardVectorWidth = 256;
+
+bool isPackedVectorType(EVT SomeVT) {
+ if (!SomeVT.isVector())
+ return false;
+ return SomeVT.getVectorNumElements() > StandardVectorWidth;
+}
+
+/// \returns the VVP_* SDNode opcode corresponding to \p Opcode.
+Optional<unsigned> getVVPOpcode(unsigned Opcode) {
+ switch (Opcode) {
+#define HANDLE_VP_TO_VVP(VPOPC, VVPNAME) \
+ case ISD::VPOPC: \
+ return VEISD::VVPNAME;
+#define ADD_VVP_OP(VVPNAME, SDNAME) \
+ case VEISD::VVPNAME: \
+ case ISD::SDNAME: \
+ return VEISD::VVPNAME;
+#include "VVPNodes.def"
+ }
+ return None;
+}
+
+bool isVVPBinaryOp(unsigned VVPOpcode) {
+ switch (VVPOpcode) {
+#define ADD_BINARY_VVP_OP(VVPNAME, ...) \
+ case VEISD::VVPNAME: \
+ return true;
+#include "VVPNodes.def"
+ }
+ return false;
+}
+
+SDValue VECustomDAG::getConstant(uint64_t Val, EVT VT, bool IsTarget,
+ bool IsOpaque) const {
+ return DAG.getConstant(Val, DL, VT, IsTarget, IsOpaque);
+}
+
+SDValue VECustomDAG::getBroadcast(EVT ResultVT, SDValue Scalar,
+ SDValue AVL) const {
+ assert(ResultVT.isVector());
+ auto ScaVT = Scalar.getValueType();
+ assert(ScaVT != MVT::i1 && "TODO: Mask broadcasts");
+
+ if (isPackedVectorType(ResultVT)) {
+ // v512x packed mode broadcast
+ // Replicate the scalar reg (f32 or i32) onto the opposing half of the full
+ // scalar register. If it's an I64 type, assume that this has already
+ // happened.
+ if (ScaVT == MVT::f32) {
+ Scalar = getNode(VEISD::REPL_F32, MVT::i64, Scalar);
+ } else if (ScaVT == MVT::i32) {
+ Scalar = getNode(VEISD::REPL_I32, MVT::i64, Scalar);
+ }
+ }
+
+ return getNode(VEISD::VEC_BROADCAST, ResultVT, {Scalar, AVL});
+}
+
+} // namespace llvm
diff --git a/llvm/lib/Target/VE/VECustomDAG.h b/llvm/lib/Target/VE/VECustomDAG.h
new file mode 100644
index 000000000000..ddd6ce783366
--- /dev/null
+++ b/llvm/lib/Target/VE/VECustomDAG.h
@@ -0,0 +1,79 @@
+//===------------ VECustomDAG.h - VE Custom DAG Nodes -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the helper functions that VE uses to lower LLVM code into
+// a selection DAG, for example by hiding the SDLoc argument and making
+// SDNodeFlags easier to pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_VE_VECUSTOMDAG_H
+#define LLVM_LIB_TARGET_VE_VECUSTOMDAG_H
+
+#include "VE.h"
+#include "VEISelLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLowering.h"
+
+namespace llvm {
+
+Optional<unsigned> getVVPOpcode(unsigned Opcode);
+
+bool isVVPBinaryOp(unsigned Opcode);
+
+bool isPackedVectorType(EVT SomeVT);
+
+class VECustomDAG {
+ SelectionDAG &DAG;
+ SDLoc DL;
+
+public:
+ SelectionDAG *getDAG() const { return &DAG; }
+
+ VECustomDAG(SelectionDAG &DAG, SDLoc DL) : DAG(DAG), DL(DL) {}
+
+ VECustomDAG(SelectionDAG &DAG, SDValue WhereOp) : DAG(DAG), DL(WhereOp) {}
+
+ VECustomDAG(SelectionDAG &DAG, const SDNode *WhereN) : DAG(DAG), DL(WhereN) {}
+
+ /// getNode {
+ SDValue getNode(unsigned OC, SDVTList VTL, ArrayRef<SDValue> OpV,
+ Optional<SDNodeFlags> Flags = None) const {
+ auto N = DAG.getNode(OC, DL, VTL, OpV);
+ if (Flags)
+ N->setFlags(*Flags);
+ return N;
+ }
+
+ SDValue getNode(unsigned OC, ArrayRef<EVT> ResVT, ArrayRef<SDValue> OpV,
+ Optional<SDNodeFlags> Flags = None) const {
+ auto N = DAG.getNode(OC, DL, ResVT, OpV);
+ if (Flags)
+ N->setFlags(*Flags);
+ return N;
+ }
+
+ SDValue getNode(unsigned OC, EVT ResVT, ArrayRef<SDValue> OpV,
+ Optional<SDNodeFlags> Flags = None) const {
+ auto N = DAG.getNode(OC, DL, ResVT, OpV);
+ if (Flags)
+ N->setFlags(*Flags);
+ return N;
+ }
+
+ SDValue getUNDEF(EVT VT) const { return DAG.getUNDEF(VT); }
+ /// } getNode
+
+ SDValue getConstant(uint64_t Val, EVT VT, bool IsTarget = false,
+ bool IsOpaque = false) const;
+
+ SDValue getBroadcast(EVT ResultVT, SDValue Scalar, SDValue AVL) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_VE_VECUSTOMDAG_H
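As used in the VEISelLowering.cpp changes below, the wrapper captures the SDLoc once so lowering code can drop the repeated DL argument from every node-creation call. A condensed usage sketch built only from the interface above (lowerSomething is an illustrative name, not a function in this patch):

  SDValue lowerSomething(SDValue Op, SelectionDAG &DAG) {
    VECustomDAG CDAG(DAG, Op);                      // SDLoc taken from Op
    EVT ResultVT = Op.getValueType();
    SDValue AVL = CDAG.getConstant(256, MVT::i32);  // was DAG.getConstant(256, DL, MVT::i32)
    return CDAG.getBroadcast(ResultVT, Op.getOperand(0), AVL);
  }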
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index 5ef223d6030b..9137c476777e 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -13,6 +13,7 @@
#include "VEISelLowering.h"
#include "MCTargetDesc/VEMCExpr.h"
+#include "VECustomDAG.h"
#include "VEInstrBuilder.h"
#include "VEMachineFunctionInfo.h"
#include "VERegisterInfo.h"
@@ -419,7 +420,7 @@ SDValue VETargetLowering::LowerFormalArguments(
// All integer register arguments are promoted by the caller to i64.
// Create a virtual register for the promoted live-in value.
- unsigned VReg =
+ Register VReg =
MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT()));
SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
@@ -754,7 +755,7 @@ SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
assert(!VA.needsCustom() && "Unexpected custom lowering");
- unsigned Reg = VA.getLocReg();
+ Register Reg = VA.getLocReg();
// When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
// reside in the same register in the high and low bits. Reuse the
@@ -898,6 +899,8 @@ const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
TARGET_NODE_CASE(RET_FLAG)
TARGET_NODE_CASE(TS1AM)
TARGET_NODE_CASE(VEC_BROADCAST)
+ TARGET_NODE_CASE(REPL_I32)
+ TARGET_NODE_CASE(REPL_F32)
// Register the VVP_* SDNodes.
#define ADD_VVP_OP(VVP_NAME, ...) TARGET_NODE_CASE(VVP_NAME)
@@ -1545,7 +1548,7 @@ static SDValue lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG,
unsigned Depth = Op.getConstantOperandVal(0);
const VERegisterInfo *RegInfo = Subtarget->getRegisterInfo();
- unsigned FrameReg = RegInfo->getFrameRegister(MF);
+ Register FrameReg = RegInfo->getFrameRegister(MF);
SDValue FrameAddr =
DAG.getCopyFromReg(DAG.getEntryNode(), DL, FrameReg, PtrVT);
while (Depth--)
@@ -1640,28 +1643,26 @@ static SDValue getSplatValue(SDNode *N) {
SDValue VETargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
- SDLoc DL(Op);
- unsigned NumEls = Op.getValueType().getVectorNumElements();
- MVT ElemVT = Op.getSimpleValueType().getVectorElementType();
+ VECustomDAG CDAG(DAG, Op);
+ MVT ResultVT = Op.getSimpleValueType();
// If there is just one element, expand to INSERT_VECTOR_ELT.
unsigned UniqueIdx;
if (getUniqueInsertion(Op.getNode(), UniqueIdx)) {
- SDValue AccuV = DAG.getUNDEF(Op.getValueType());
+ SDValue AccuV = CDAG.getUNDEF(Op.getValueType());
auto ElemV = Op->getOperand(UniqueIdx);
- SDValue IdxV = DAG.getConstant(UniqueIdx, DL, MVT::i64);
- return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), AccuV,
- ElemV, IdxV);
+ SDValue IdxV = CDAG.getConstant(UniqueIdx, MVT::i64);
+ return CDAG.getNode(ISD::INSERT_VECTOR_ELT, ResultVT, {AccuV, ElemV, IdxV});
}
// Else emit a broadcast.
if (SDValue ScalarV = getSplatValue(Op.getNode())) {
- // lower to VEC_BROADCAST
- MVT LegalResVT = MVT::getVectorVT(ElemVT, 256);
-
- auto AVL = DAG.getConstant(NumEls, DL, MVT::i32);
- return DAG.getNode(VEISD::VEC_BROADCAST, DL, LegalResVT, Op.getOperand(0),
- AVL);
+ unsigned NumEls = ResultVT.getVectorNumElements();
+ // TODO: Legalize packed-mode AVL.
+ // For now, cap the AVL at 256.
+ auto CappedLength = std::min<unsigned>(256, NumEls);
+ auto AVL = CDAG.getConstant(CappedLength, MVT::i32);
+ return CDAG.getBroadcast(ResultVT, Op.getOperand(0), AVL);
}
// Expand
@@ -1720,7 +1721,7 @@ SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
-#define ADD_BINARY_VVP_OP(VVP_NAME, VP_NAME, ISD_NAME) case ISD::ISD_NAME:
+#define ADD_VVP_OP(VVP_NAME, ISD_NAME) case ISD::ISD_NAME:
#include "VVPNodes.def"
return lowerToVVP(Op, DAG);
}
@@ -2666,21 +2667,6 @@ bool VETargetLowering::hasAndNot(SDValue Y) const {
return true;
}
-/// \returns the VVP_* SDNode opcode corresponsing to \p OC.
-static Optional<unsigned> getVVPOpcode(unsigned Opcode) {
- switch (Opcode) {
-#define HANDLE_VP_TO_VVP(VPOPC, VVPNAME) \
- case ISD::VPOPC: \
- return VEISD::VVPNAME;
-#define ADD_VVP_OP(VVPNAME, SDNAME) \
- case VEISD::VVPNAME: \
- case ISD::SDNAME: \
- return VEISD::VVPNAME;
-#include "VVPNodes.def"
- }
- return None;
-}
-
SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
// Can we represent this as a VVP node.
const unsigned Opcode = Op->getOpcode();
@@ -2691,7 +2677,7 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
const bool FromVP = ISD::isVPOpcode(Opcode);
// The representative and legalized vector type of this operation.
- SDLoc DL(Op);
+ VECustomDAG CDAG(DAG, Op);
MVT MaskVT = MVT::v256i1; // TODO: packed mode.
EVT OpVecVT = Op.getValueType();
EVT LegalVecVT = getTypeToTransformTo(*DAG.getContext(), OpVecVT);
@@ -2708,27 +2694,21 @@ SDValue VETargetLowering::lowerToVVP(SDValue Op, SelectionDAG &DAG) const {
} else {
// Materialize the VL parameter.
- AVL = DAG.getConstant(OpVecVT.getVectorNumElements(), DL, MVT::i32);
- SDValue ConstTrue = DAG.getConstant(1, DL, MVT::i32);
- Mask = DAG.getNode(VEISD::VEC_BROADCAST, DL, MaskVT,
- ConstTrue); // emit a VEISD::VEC_BROADCAST here.
+ AVL = CDAG.getConstant(OpVecVT.getVectorNumElements(), MVT::i32);
+ SDValue ConstTrue = CDAG.getConstant(1, MVT::i32);
+ Mask = CDAG.getBroadcast(MaskVT, ConstTrue, AVL);
}
- // Categories we are interested in.
- bool IsBinaryOp = false;
-
- switch (VVPOpcode) {
-#define ADD_BINARY_VVP_OP(VVPNAME, ...) \
- case VEISD::VVPNAME: \
- IsBinaryOp = true; \
- break;
-#include "VVPNodes.def"
- }
-
- if (IsBinaryOp) {
+ if (isVVPBinaryOp(VVPOpcode)) {
assert(LegalVecVT.isSimple());
- return DAG.getNode(VVPOpcode, DL, LegalVecVT, Op->getOperand(0),
- Op->getOperand(1), Mask, AVL);
+ return CDAG.getNode(VVPOpcode, LegalVecVT,
+ {Op->getOperand(0), Op->getOperand(1), Mask, AVL});
+ }
+ if (VVPOpcode == VEISD::VVP_SELECT) {
+ auto Mask = Op->getOperand(0);
+ auto OnTrue = Op->getOperand(1);
+ auto OnFalse = Op->getOperand(2);
+ return CDAG.getNode(VVPOpcode, LegalVecVT, {OnTrue, OnFalse, Mask, AVL});
}
llvm_unreachable("lowerToVVP called for unexpected SDNode.");
}
@@ -2750,7 +2730,7 @@ SDValue VETargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Idx = Op.getOperand(1);
SDLoc DL(Op);
SDValue Result = Op;
- if (0 /* Idx->isConstant() */) {
+ if (false /* Idx->isConstant() */) {
// TODO: optimized implementation using constant values
} else {
SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
@@ -2808,7 +2788,7 @@ SDValue VETargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
SDValue Result = Op;
- if (0 /* Idx->isConstant()*/) {
+ if (false /* Idx->isConstant()*/) {
// TODO: optimized implementation using constant values
} else {
SDValue Const1 = DAG.getConstant(1, DL, MVT::i64);
diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
index b4ce8906fd51..09bd19e83717 100644
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -40,6 +40,8 @@ enum NodeType : unsigned {
TS1AM, // A TS1AM instruction used for 1/2 bytes swap.
VEC_BROADCAST, // A vector broadcast instruction.
// 0: scalar value, 1: VL
+  // Replicate a subregister to the other half of the register.
+  REPL_I32,
+  REPL_F32,
// VVP_* nodes.
#define ADD_VVP_OP(VVP_NAME, ...) VVP_NAME,
@@ -219,4 +221,4 @@ public:
};
} // namespace llvm
-#endif // VE_ISELLOWERING_H
+#endif // LLVM_LIB_TARGET_VE_VEISELLOWERING_H
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
index 46846edfeafb..7c1bd5201867 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -248,7 +248,7 @@ unsigned VEInstrInfo::insertBranch(MachineBasicBlock &MBB,
const TargetRegisterInfo *TRI = &getRegisterInfo();
MachineFunction *MF = MBB.getParent();
const MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned Reg = Cond[2].getReg();
+ Register Reg = Cond[2].getReg();
if (IsIntegerCC(Cond[0].getImm())) {
if (TRI->getRegSizeInBits(Reg, MRI) == 32) {
opc[0] = VE::BRCFWir;
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index c3abbe2cafab..717427c3f48d 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -1576,6 +1576,12 @@ def f2l : OutPatFrag<(ops node:$exp),
def l2f : OutPatFrag<(ops node:$exp),
(EXTRACT_SUBREG $exp, sub_f32)>;
+// Zero out subregisters.
+def zero_i32 : OutPatFrag<(ops node:$expr),
+ (ANDrm $expr, 32)>;
+def zero_f32 : OutPatFrag<(ops node:$expr),
+ (ANDrm $expr, !add(32, 64))>;
+
// Small immediates.
def : Pat<(i32 simm7:$val), (EXTRACT_SUBREG (ORim (LO7 $val), 0), sub_i32)>;
def : Pat<(i64 simm7:$val), (ORim (LO7 $val), 0)>;
@@ -2287,6 +2293,16 @@ class IsVLVT<int OpIdx> : SDTCisVT<OpIdx,i32>;
def vec_broadcast : SDNode<"VEISD::VEC_BROADCAST", SDTypeProfile<1, 2,
[SDTCisVec<0>, IsVLVT<2>]>>;
+// replicate lower 32bit to upper 32bit (f32 scalar replication).
+def repl_f32 : SDNode<"VEISD::REPL_F32",
+ SDTypeProfile<1, 1,
+ [SDTCisInt<0>, SDTCisFP<1>]>>;
+// replicate upper 32bit to lower 32 bit (i32 scalar replication).
+def repl_i32 : SDNode<"VEISD::REPL_I32",
+ SDTypeProfile<1, 1,
+ [SDTCisInt<0>, SDTCisInt<1>]>>;
+
+
// Whether this is an all-true mask (assuming undef-bits above VL are all-true).
def true_mask : PatLeaf<
(vec_broadcast (i32 nonzero), (i32 srcvalue))>;
diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td
index dc3c913c918a..6c5b80315efb 100644
--- a/llvm/lib/Target/VE/VEInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td
@@ -15,6 +15,17 @@
// Instruction format superclass
//===----------------------------------------------------------------------===//
+// Sub-register replication for packed broadcast.
+def: Pat<(i64 (repl_f32 f32:$val)),
+ (ORrr
+ (SRLri (f2l $val), 32),
+ (zero_i32 (f2l $val)))>;
+def: Pat<(i64 (repl_i32 i32:$val)),
+ (ORrr
+ (zero_f32 (i2l $val)),
+ (SLLri (i2l $val), 32))>;
+
+
multiclass vbrd_elem32<ValueType v32, ValueType s32, SDPatternOperator ImmOp,
SDNodeXForm ImmCast, OutPatFrag SuperRegCast> {
// VBRDil
@@ -89,3 +100,8 @@ defm : patterns_elem32<v256f32, f32, simm7fp, LO7FP, l2f, f2l>;
defm : patterns_elem64<v256i64, i64, simm7, LO7>;
defm : patterns_elem64<v256f64, f64, simm7fp, LO7FP>;
+
+defm : vbrd_elem64<v512i32, i64, simm7, LO7>;
+defm : vbrd_elem64<v512f32, i64, simm7, LO7>;
+defm : vbrd_elem64<v512i32, f64, simm7fp, LO7FP>;
+defm : vbrd_elem64<v512f32, f64, simm7fp, LO7FP>;
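The repl_f32/repl_i32 patterns above OR a shifted copy of the 64-bit scalar with a masked copy of itself; the intended net effect is that the 32-bit scalar payload ends up in both 32-bit halves of the register, which is what a packed-mode (v512) element, holding two 32-bit lanes per 64-bit register, needs. A plain C++ sketch of that effect (illustrative only, not the emitted VE code; which variant corresponds to which node depends on the VE subregister layout):

  #include <cstdint>

  // Payload in the low 32 bits: copy it into the high half as well.
  uint64_t replicateLowHalf(uint64_t V) {
    return (V << 32) | (V & 0xffffffffu);
  }
  // Payload in the high 32 bits: copy it into the low half as well.
  uint64_t replicateHighHalf(uint64_t V) {
    return (V & ~uint64_t(0xffffffffu)) | (V >> 32);
  }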
diff --git a/llvm/lib/Target/VE/VEMCInstLower.cpp b/llvm/lib/Target/VE/VEMCInstLower.cpp
index bc5577ce4f97..57195f238cf6 100644
--- a/llvm/lib/Target/VE/VEMCInstLower.cpp
+++ b/llvm/lib/Target/VE/VEMCInstLower.cpp
@@ -78,8 +78,7 @@ void llvm::LowerVEMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AsmPrinter &AP) {
OutMI.setOpcode(MI->getOpcode());
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
+ for (const MachineOperand &MO : MI->operands()) {
MCOperand MCOp = LowerOperand(MI, MO, AP);
if (MCOp.isValid())
diff --git a/llvm/lib/Target/VE/VEMachineFunctionInfo.h b/llvm/lib/Target/VE/VEMachineFunctionInfo.h
index 16b25fed3f11..3160f6a552d7 100644
--- a/llvm/lib/Target/VE/VEMachineFunctionInfo.h
+++ b/llvm/lib/Target/VE/VEMachineFunctionInfo.h
@@ -29,10 +29,9 @@ private:
bool IsLeafProc;
public:
- VEMachineFunctionInfo()
- : GlobalBaseReg(), VarArgsFrameOffset(0), IsLeafProc(false) {}
+ VEMachineFunctionInfo() : VarArgsFrameOffset(0), IsLeafProc(false) {}
explicit VEMachineFunctionInfo(MachineFunction &MF)
- : GlobalBaseReg(), VarArgsFrameOffset(0), IsLeafProc(false) {}
+ : VarArgsFrameOffset(0), IsLeafProc(false) {}
Register getGlobalBaseReg() const { return GlobalBaseReg; }
void setGlobalBaseReg(Register Reg) { GlobalBaseReg = Reg; }
diff --git a/llvm/lib/Target/VE/VESubtarget.h b/llvm/lib/Target/VE/VESubtarget.h
index 213aca2ea3f9..0c3dc0a08072 100644
--- a/llvm/lib/Target/VE/VESubtarget.h
+++ b/llvm/lib/Target/VE/VESubtarget.h
@@ -76,7 +76,7 @@ public:
/// Get the size of RSA, return address, and frame pointer as described
/// in VEFrameLowering.cpp.
- unsigned getRsaSize(void) const { return 176; };
+ unsigned getRsaSize() const { return 176; };
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
};
diff --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td
index 99566e91ec11..ef9c238066c0 100644
--- a/llvm/lib/Target/VE/VVPInstrInfo.td
+++ b/llvm/lib/Target/VE/VVPInstrInfo.td
@@ -39,6 +39,15 @@ def SDTFPBinOpVVP : SDTypeProfile<1, 4, [ // vvp_fadd, etc.
IsVLVT<4>
]>;
+// Select(OnTrue, OnFalse, SelMask, vl)
+def SDTSelectVVP : SDTypeProfile<1, 4, [ // vp_select, vp_merge
+ SDTCisVec<0>,
+ SDTCisSameNumEltsAs<0, 3>,
+ SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>,
+ IsVLVT<4>
+]>;
+
// Binary operator commutative pattern.
class vvp_commutative<SDNode RootOp> :
PatFrags<
@@ -79,3 +88,5 @@ def c_vvp_fmul : vvp_commutative<vvp_fmul>;
def vvp_fdiv : SDNode<"VEISD::VVP_FDIV", SDTFPBinOpVVP>;
// } Binary Operators
+
+def vvp_select : SDNode<"VEISD::VVP_SELECT", SDTSelectVVP>;
diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
index 8d5d9d103547..74720fd1f419 100644
--- a/llvm/lib/Target/VE/VVPInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -20,8 +20,22 @@ include "VVPInstrInfo.td"
multiclass Binary_rv<SDPatternOperator OpNode,
ValueType ScalarVT, ValueType DataVT,
ValueType MaskVT, string OpBaseName> {
- // Masked with select, broadcast.
- // TODO
+ // Masked with passthru, broadcast.
+ def : Pat<(vvp_select
+ (OpNode
+ (any_broadcast ScalarVT:$sx),
+ DataVT:$vy,
+ (MaskVT srcvalue),
+ (i32 srcvalue)),
+ DataVT:$vfalse,
+ MaskVT:$mask,
+ i32:$pivot),
+ (!cast<Instruction>(OpBaseName#"rvml_v")
+ ScalarVT:$sx,
+ $vy,
+ $mask,
+ $pivot,
+ $vfalse)>;
// Unmasked, broadcast.
def : Pat<(OpNode
@@ -42,8 +56,22 @@ multiclass Binary_rv<SDPatternOperator OpNode,
multiclass Binary_vr<SDPatternOperator OpNode,
ValueType ScalarVT, ValueType DataVT,
ValueType MaskVT, string OpBaseName> {
- // Masked with select, broadcast.
- // TODO
+ // Masked with passthru, broadcast.
+ def : Pat<(vvp_select
+ (OpNode
+ DataVT:$vx,
+ (any_broadcast ScalarVT:$sy),
+ (MaskVT srcvalue),
+ (i32 srcvalue)),
+ DataVT:$vfalse,
+ MaskVT:$mask,
+ i32:$pivot),
+ (!cast<Instruction>(OpBaseName#"vrml_v")
+ $vx,
+ ScalarVT:$sy,
+ $mask,
+ $pivot,
+ $vfalse)>;
// Unmasked, broadcast.
def : Pat<(OpNode
@@ -64,6 +92,23 @@ multiclass Binary_vr<SDPatternOperator OpNode,
multiclass Binary_vv<SDPatternOperator OpNode,
ValueType DataVT,
ValueType MaskVT, string OpBaseName> {
+ // Masked with passthru, broadcast.
+ def : Pat<(vvp_select
+ (OpNode
+ DataVT:$vx,
+ DataVT:$vy,
+ (MaskVT srcvalue),
+ (i32 srcvalue)),
+ DataVT:$vfalse,
+ MaskVT:$mask,
+ i32:$pivot),
+ (!cast<Instruction>(OpBaseName#"vvml_v")
+ $vx,
+ $vy,
+ $mask,
+ $pivot,
+ $vfalse)>;
+
// Masked with select.
// TODO
@@ -191,3 +236,35 @@ defm : Binary_rv_vv_ShortLong<vvp_fsub,
defm : Binary_rv_vr_vv_ShortLong<vvp_fdiv,
f64, v256f64, "VFDIVD",
f32, v256f32, "VFDIVS">;
+
+multiclass Merge_mvv<
+ SDPatternOperator OpNode,
+ ValueType DataVT, ValueType MaskVT,
+ string OpBaseName> {
+ // Masked.
+ def : Pat<(OpNode
+ DataVT:$vtrue, DataVT:$vfalse,
+ MaskVT:$vm,
+ i32:$avl),
+ (!cast<Instruction>(OpBaseName#"vvml_v")
+ $vfalse, $vtrue, $vm, $avl, $vfalse)>;
+}
+
+multiclass Merge_mvv_ShortLong<
+ SDPatternOperator OpNode,
+ ValueType LongDataVT, ValueType ShortDataVT,
+ string OpBaseName> {
+ defm : Merge_mvv<OpNode,
+ LongDataVT, v256i1,
+ OpBaseName>;
+ defm : Merge_mvv<OpNode,
+ ShortDataVT, v256i1,
+ OpBaseName>;
+}
+
+defm : Merge_mvv_ShortLong<vvp_select,
+ v256f64,
+ v256f32, "VMRG">;
+defm : Merge_mvv_ShortLong<vvp_select,
+ v256i64,
+ v256i32, "VMRG">;
diff --git a/llvm/lib/Target/VE/VVPNodes.def b/llvm/lib/Target/VE/VVPNodes.def
index 8a9231f7d3e6..8000f84c5dbe 100644
--- a/llvm/lib/Target/VE/VVPNodes.def
+++ b/llvm/lib/Target/VE/VVPNodes.def
@@ -59,6 +59,11 @@ ADD_BINARY_VVP_OP_COMPACT(FSUB)
ADD_BINARY_VVP_OP_COMPACT(FMUL)
ADD_BINARY_VVP_OP_COMPACT(FDIV)
+// Shuffles.
+ADD_VVP_OP(VVP_SELECT,VSELECT)
+HANDLE_VP_TO_VVP(VP_SELECT, VVP_SELECT)
+HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT)
+
#undef ADD_BINARY_VVP_OP
#undef ADD_BINARY_VVP_OP_COMPACT
#undef ADD_VVP_OP
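After preprocessing, the new VVP_SELECT entries become switch cases inside getVVPOpcode() in VECustomDAG.cpp, mapping VSELECT, VP_SELECT and VP_MERGE onto the single VVP_SELECT node. A hand-expanded sketch of just those cases (the function name below is hypothetical; the real switch also contains the binary ADD_VVP_OP entries):

  Optional<unsigned> getVVPOpcodeForSelect(unsigned Opcode) {
    switch (Opcode) {
    case ISD::VP_SELECT:      // HANDLE_VP_TO_VVP(VP_SELECT, VVP_SELECT)
    case ISD::VP_MERGE:       // HANDLE_VP_TO_VVP(VP_MERGE, VVP_SELECT)
      return VEISD::VVP_SELECT;
    case VEISD::VVP_SELECT:   // ADD_VVP_OP(VVP_SELECT, VSELECT)
    case ISD::VSELECT:
      return VEISD::VVP_SELECT;
    }
    return None;
  }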
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index b2f10ca93a4f..75d5d0675990 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -26,7 +26,6 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCInstrInfo;
class MCObjectTargetWriter;
-class MVT;
class Triple;
MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
diff --git a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
index d024185defb4..57e40f6cd8d7 100644
--- a/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
+++ b/llvm/lib/Target/WebAssembly/Utils/WebAssemblyUtilities.h
@@ -25,7 +25,6 @@ class MachineInstr;
class MachineOperand;
class MCContext;
class MCSymbolWasm;
-class StringRef;
class WebAssemblyFunctionInfo;
class WebAssemblySubtarget;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 910a4e5e0d1a..eeec0fc671cc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -406,7 +406,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// TODO: Sort the locals for better compression.
MFI.setNumLocals(CurLocal - MFI.getParams().size());
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
- unsigned Reg = Register::index2VirtReg(I);
+ Register Reg = Register::index2VirtReg(I);
auto RL = Reg2Local.find(Reg);
if (RL == Reg2Local.end() || RL->second < MFI.getParams().size())
continue;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 642aa6b4028a..406edef8ff3f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -286,7 +286,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
}
if (S == 1 && Addr.isRegBase() && Addr.getReg() == 0) {
// An unscaled add of a register. Set it as the new base.
- unsigned Reg = getRegForValue(Op);
+ Register Reg = getRegForValue(Op);
if (Reg == 0)
return false;
Addr.setReg(Reg);
@@ -372,7 +372,7 @@ bool WebAssemblyFastISel::computeAddress(const Value *Obj, Address &Addr) {
if (Addr.isSet()) {
return false;
}
- unsigned Reg = getRegForValue(Obj);
+ Register Reg = getRegForValue(Obj);
if (Reg == 0)
return false;
Addr.setReg(Reg);
@@ -430,7 +430,7 @@ unsigned WebAssemblyFastISel::getRegForI1Value(const Value *V,
}
Not = false;
- unsigned Reg = getRegForValue(V);
+ Register Reg = getRegForValue(V);
if (Reg == 0)
return 0;
return maskI1Value(Reg, V);
@@ -458,12 +458,12 @@ unsigned WebAssemblyFastISel::zeroExtendToI32(unsigned Reg, const Value *V,
return 0;
}
- unsigned Imm = createResultReg(&WebAssembly::I32RegClass);
+ Register Imm = createResultReg(&WebAssembly::I32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(WebAssembly::CONST_I32), Imm)
.addImm(~(~uint64_t(0) << MVT(From).getSizeInBits()));
- unsigned Result = createResultReg(&WebAssembly::I32RegClass);
+ Register Result = createResultReg(&WebAssembly::I32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(WebAssembly::AND_I32), Result)
.addReg(Reg)
@@ -488,18 +488,18 @@ unsigned WebAssemblyFastISel::signExtendToI32(unsigned Reg, const Value *V,
return 0;
}
- unsigned Imm = createResultReg(&WebAssembly::I32RegClass);
+ Register Imm = createResultReg(&WebAssembly::I32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(WebAssembly::CONST_I32), Imm)
.addImm(32 - MVT(From).getSizeInBits());
- unsigned Left = createResultReg(&WebAssembly::I32RegClass);
+ Register Left = createResultReg(&WebAssembly::I32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(WebAssembly::SHL_I32), Left)
.addReg(Reg)
.addReg(Imm);
- unsigned Right = createResultReg(&WebAssembly::I32RegClass);
+ Register Right = createResultReg(&WebAssembly::I32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(WebAssembly::SHR_S_I32), Right)
.addReg(Left)
@@ -517,7 +517,7 @@ unsigned WebAssemblyFastISel::zeroExtend(unsigned Reg, const Value *V,
Reg = zeroExtendToI32(Reg, V, From);
- unsigned Result = createResultReg(&WebAssembly::I64RegClass);
+ Register Result = createResultReg(&WebAssembly::I64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(WebAssembly::I64_EXTEND_U_I32), Result)
.addReg(Reg);
@@ -539,7 +539,7 @@ unsigned WebAssemblyFastISel::signExtend(unsigned Reg, const Value *V,
Reg = signExtendToI32(Reg, V, From);
- unsigned Result = createResultReg(&WebAssembly::I64RegClass);
+ Register Result = createResultReg(&WebAssembly::I64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(WebAssembly::I64_EXTEND_S_I32), Result)
.addReg(Reg);
@@ -555,7 +555,7 @@ unsigned WebAssemblyFastISel::signExtend(unsigned Reg, const Value *V,
unsigned WebAssemblyFastISel::getRegForUnsignedValue(const Value *V) {
MVT::SimpleValueType From = getSimpleType(V->getType());
MVT::SimpleValueType To = getLegalType(From);
- unsigned VReg = getRegForValue(V);
+ Register VReg = getRegForValue(V);
if (VReg == 0)
return 0;
return zeroExtend(VReg, V, From, To);
@@ -564,7 +564,7 @@ unsigned WebAssemblyFastISel::getRegForUnsignedValue(const Value *V) {
unsigned WebAssemblyFastISel::getRegForSignedValue(const Value *V) {
MVT::SimpleValueType From = getSimpleType(V->getType());
MVT::SimpleValueType To = getLegalType(From);
- unsigned VReg = getRegForValue(V);
+ Register VReg = getRegForValue(V);
if (VReg == 0)
return 0;
return signExtend(VReg, V, From, To);
@@ -578,7 +578,7 @@ unsigned WebAssemblyFastISel::getRegForPromotedValue(const Value *V,
unsigned WebAssemblyFastISel::notValue(unsigned Reg) {
assert(MRI.getRegClass(Reg) == &WebAssembly::I32RegClass);
- unsigned NotReg = createResultReg(&WebAssembly::I32RegClass);
+ Register NotReg = createResultReg(&WebAssembly::I32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(WebAssembly::EQZ_I32), NotReg)
.addReg(Reg);
@@ -586,7 +586,7 @@ unsigned WebAssemblyFastISel::notValue(unsigned Reg) {
}
unsigned WebAssemblyFastISel::copyValue(unsigned Reg) {
- unsigned ResultReg = createResultReg(MRI.getRegClass(Reg));
+ Register ResultReg = createResultReg(MRI.getRegClass(Reg));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(WebAssembly::COPY),
ResultReg)
.addReg(Reg);
@@ -598,7 +598,7 @@ unsigned WebAssemblyFastISel::fastMaterializeAlloca(const AllocaInst *AI) {
FuncInfo.StaticAllocaMap.find(AI);
if (SI != FuncInfo.StaticAllocaMap.end()) {
- unsigned ResultReg =
+ Register ResultReg =
createResultReg(Subtarget->hasAddr64() ? &WebAssembly::I64RegClass
: &WebAssembly::I32RegClass);
unsigned Opc =
@@ -617,7 +617,7 @@ unsigned WebAssemblyFastISel::fastMaterializeConstant(const Constant *C) {
return 0;
if (GV->isThreadLocal())
return 0;
- unsigned ResultReg =
+ Register ResultReg =
createResultReg(Subtarget->hasAddr64() ? &WebAssembly::I64RegClass
: &WebAssembly::I32RegClass);
unsigned Opc = Subtarget->hasAddr64() ? WebAssembly::CONST_I64
@@ -715,7 +715,7 @@ bool WebAssemblyFastISel::fastLowerArguments() {
default:
return false;
}
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addImm(I);
updateValueMap(&Arg, ResultReg);
@@ -887,7 +887,7 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
if (Subtarget->hasAddr64()) {
auto Wrap = BuildMI(*FuncInfo.MBB, std::prev(FuncInfo.InsertPt), DbgLoc,
TII.get(WebAssembly::I32_WRAP_I64));
- unsigned Reg32 = createResultReg(&WebAssembly::I32RegClass);
+ Register Reg32 = createResultReg(&WebAssembly::I32RegClass);
Wrap.addReg(Reg32, RegState::Define);
Wrap.addReg(CalleeReg);
CalleeReg = Reg32;
@@ -914,11 +914,11 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
if (CondReg == 0)
return false;
- unsigned TrueReg = getRegForValue(Select->getTrueValue());
+ Register TrueReg = getRegForValue(Select->getTrueValue());
if (TrueReg == 0)
return false;
- unsigned FalseReg = getRegForValue(Select->getFalseValue());
+ Register FalseReg = getRegForValue(Select->getFalseValue());
if (FalseReg == 0)
return false;
@@ -959,7 +959,7 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
return false;
}
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(TrueReg)
.addReg(FalseReg)
@@ -972,12 +972,12 @@ bool WebAssemblyFastISel::selectSelect(const Instruction *I) {
bool WebAssemblyFastISel::selectTrunc(const Instruction *I) {
const auto *Trunc = cast<TruncInst>(I);
- unsigned Reg = getRegForValue(Trunc->getOperand(0));
+ Register Reg = getRegForValue(Trunc->getOperand(0));
if (Reg == 0)
return false;
if (Trunc->getOperand(0)->getType()->isIntegerTy(64)) {
- unsigned Result = createResultReg(&WebAssembly::I32RegClass);
+ Register Result = createResultReg(&WebAssembly::I32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(WebAssembly::I32_WRAP_I64), Result)
.addReg(Reg);
@@ -994,7 +994,7 @@ bool WebAssemblyFastISel::selectZExt(const Instruction *I) {
const Value *Op = ZExt->getOperand(0);
MVT::SimpleValueType From = getSimpleType(Op->getType());
MVT::SimpleValueType To = getLegalType(getSimpleType(ZExt->getType()));
- unsigned In = getRegForValue(Op);
+ Register In = getRegForValue(Op);
if (In == 0)
return false;
unsigned Reg = zeroExtend(In, Op, From, To);
@@ -1011,7 +1011,7 @@ bool WebAssemblyFastISel::selectSExt(const Instruction *I) {
const Value *Op = SExt->getOperand(0);
MVT::SimpleValueType From = getSimpleType(Op->getType());
MVT::SimpleValueType To = getLegalType(getSimpleType(SExt->getType()));
- unsigned In = getRegForValue(Op);
+ Register In = getRegForValue(Op);
if (In == 0)
return false;
unsigned Reg = signExtend(In, Op, From, To);
@@ -1075,7 +1075,7 @@ bool WebAssemblyFastISel::selectICmp(const Instruction *I) {
if (RHS == 0)
return false;
- unsigned ResultReg = createResultReg(&WebAssembly::I32RegClass);
+ Register ResultReg = createResultReg(&WebAssembly::I32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(LHS)
.addReg(RHS);
@@ -1086,11 +1086,11 @@ bool WebAssemblyFastISel::selectICmp(const Instruction *I) {
bool WebAssemblyFastISel::selectFCmp(const Instruction *I) {
const auto *FCmp = cast<FCmpInst>(I);
- unsigned LHS = getRegForValue(FCmp->getOperand(0));
+ Register LHS = getRegForValue(FCmp->getOperand(0));
if (LHS == 0)
return false;
- unsigned RHS = getRegForValue(FCmp->getOperand(1));
+ Register RHS = getRegForValue(FCmp->getOperand(1));
if (RHS == 0)
return false;
@@ -1136,7 +1136,7 @@ bool WebAssemblyFastISel::selectFCmp(const Instruction *I) {
return false;
}
- unsigned ResultReg = createResultReg(&WebAssembly::I32RegClass);
+ Register ResultReg = createResultReg(&WebAssembly::I32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(LHS)
.addReg(RHS);
@@ -1157,7 +1157,7 @@ bool WebAssemblyFastISel::selectBitCast(const Instruction *I) {
if (!VT.isSimple() || !RetVT.isSimple())
return false;
- unsigned In = getRegForValue(I->getOperand(0));
+ Register In = getRegForValue(I->getOperand(0));
if (In == 0)
return false;
@@ -1229,7 +1229,7 @@ bool WebAssemblyFastISel::selectLoad(const Instruction *I) {
materializeLoadStoreOperands(Addr);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
auto MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
ResultReg);
@@ -1284,7 +1284,7 @@ bool WebAssemblyFastISel::selectStore(const Instruction *I) {
materializeLoadStoreOperands(Addr);
- unsigned ValueReg = getRegForValue(Store->getValueOperand());
+ Register ValueReg = getRegForValue(Store->getValueOperand());
if (ValueReg == 0)
return false;
if (VTIsi1)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 38ed4c73fb93..a221f37cfd94 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1491,8 +1491,7 @@ bool WebAssemblyTargetLowering::MatchTableForLowering(SelectionDAG &DAG,
if (GA) {
// We are in Case 2 above.
Idx = Base->getOperand(1);
- if (!Idx || GA->getNumValues() != 1 || Idx->getNumValues() != 1)
- return false;
+ assert(GA->getNumValues() == 1);
} else {
// This might be Case 1 above (or an error)
SDValue V = Base->getOperand(0);
@@ -1629,7 +1628,7 @@ SDValue WebAssemblyTargetLowering::LowerCopyToReg(SDValue Op,
// local.copy between Op and its FI operand.
SDValue Chain = Op.getOperand(0);
SDLoc DL(Op);
- unsigned Reg = cast<RegisterSDNode>(Op.getOperand(1))->getReg();
+ Register Reg = cast<RegisterSDNode>(Op.getOperand(1))->getReg();
EVT VT = Src.getValueType();
SDValue Copy(DAG.getMachineNode(VT == MVT::i32 ? WebAssembly::COPY_I32
: WebAssembly::COPY_I64,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 23aaa5160abd..fe656753889f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -279,6 +279,7 @@
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
@@ -454,12 +455,12 @@ static Function *getEmscriptenFunction(FunctionType *Ty, const Twine &Name,
// Tell the linker that this function is expected to be imported from the
// 'env' module.
if (!F->hasFnAttribute("wasm-import-module")) {
- llvm::AttrBuilder B;
+ llvm::AttrBuilder B(M->getContext());
B.addAttribute("wasm-import-module", "env");
F->addFnAttrs(B);
}
if (!F->hasFnAttribute("wasm-import-name")) {
- llvm::AttrBuilder B;
+ llvm::AttrBuilder B(M->getContext());
B.addAttribute("wasm-import-name", F->getName());
F->addFnAttrs(B);
}
@@ -547,7 +548,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) {
for (unsigned I = 0, E = CI->arg_size(); I < E; ++I)
ArgAttributes.push_back(InvokeAL.getParamAttrs(I));
- AttrBuilder FnAttrs(InvokeAL.getFnAttrs());
+ AttrBuilder FnAttrs(CI->getContext(), InvokeAL.getFnAttrs());
if (FnAttrs.contains(Attribute::AllocSize)) {
// The allocsize attribute (if any) refers to parameters by index and needs
// to be adjusted.
@@ -610,6 +611,8 @@ static bool canLongjmp(const Value *Callee) {
return false;
StringRef CalleeName = Callee->getName();
+ // TODO Include more functions or consider checking with mangled prefixes
+
// The reason we include malloc/free here is to exclude the malloc/free
// calls generated in setjmp prep / cleanup routines.
if (CalleeName == "setjmp" || CalleeName == "malloc" || CalleeName == "free")
@@ -626,11 +629,50 @@ static bool canLongjmp(const Value *Callee) {
return false;
// Exception-catching related functions
- if (CalleeName == "__cxa_begin_catch" || CalleeName == "__cxa_end_catch" ||
+ //
+ // We intentionally exclude __cxa_end_catch here even though it surely cannot
+ // longjmp, in order to maintain the unwind relationship from all existing
+ // catchpads (and calls within them) to catch.dispatch.longjmp.
+ //
+ // In Wasm EH + Wasm SjLj, we
+ // 1. Make all catchswitch and cleanuppad that unwind to caller unwind to
+ // catch.dispatch.longjmp instead
+ // 2. Convert all longjmpable calls to invokes that unwind to
+ // catch.dispatch.longjmp
+ // But catchswitch BBs are removed in isel, so if the catchpad of an EH
+ // catchswitch (generated from an exception) does not contain any calls that
+ // are converted into invokes unwinding to catch.dispatch.longjmp, this
+ // unwind relationship (EH catchswitch BB -> catch.dispatch.longjmp BB) is
+ // lost, and the catch.dispatch.longjmp BB can be placed before the EH
+ // catchswitch BB in CFGSort. For example:
+ // int ret = setjmp(buf);
+ // try {
+ // foo(); // longjmps
+ // } catch (...) {
+ // }
+ // In this code, if 'foo' longjmps, it first unwinds to the 'catch (...)'
+ // catchswitch. Because it is a longjmp, it is not caught there, and it
+ // should next unwind to the catch.dispatch.longjmp BB. But if the 'catch
+ // (...)' catchswitch -> catch.dispatch.longjmp unwind relationship is
+ // lost, it will not unwind to catch.dispatch.longjmp, producing an
+ // incorrect result.
+ //
+ // Every catchpad generated by Wasm C++ contains __cxa_end_catch, so we
+ // intentionally treat it as longjmpable to work around this problem. This is
+ // a hacky fix but an easy one.
+ //
+ // The comment block in findWasmUnwindDestinations() in
+ // SelectionDAGBuilder.cpp is addressing a similar problem.
+ if (CalleeName == "__cxa_begin_catch" ||
CalleeName == "__cxa_allocate_exception" || CalleeName == "__cxa_throw" ||
CalleeName == "__clang_call_terminate")
return false;
+ // std::terminate, which is generated when another exception occurs while
+ // handling an exception, cannot longjmp.
+ if (CalleeName == "_ZSt9terminatev")
+ return false;
+
// Otherwise we don't know
return true;
}
@@ -817,6 +859,32 @@ static bool containsLongjmpableCalls(const Function *F) {
return false;
}
+// When a function contains a setjmp call but no other calls that can longjmp,
+// we don't do the setjmp transformation for that setjmp. But we still need to
+// convert the setjmp calls into "i32 0" so they don't cause link-time errors;
+// setjmp always returns 0 when called directly.
+static void nullifySetjmp(Function *F) {
+ Module &M = *F->getParent();
+ IRBuilder<> IRB(M.getContext());
+ Function *SetjmpF = M.getFunction("setjmp");
+ SmallVector<Instruction *, 1> ToErase;
+
+ for (User *U : SetjmpF->users()) {
+ auto *CI = dyn_cast<CallInst>(U);
+ // FIXME An 'invoke' to setjmp can happen when we use Wasm EH + Wasm SjLj,
+ // but we don't support the two being used together yet.
+ if (!CI)
+ report_fatal_error("Wasm EH + Wasm SjLj is not fully supported yet");
+ BasicBlock *BB = CI->getParent();
+ if (BB->getParent() != F) // in other function
+ continue;
+ ToErase.push_back(CI);
+ CI->replaceAllUsesWith(IRB.getInt32(0));
+ }
+ for (auto *I : ToErase)
+ I->eraseFromParent();
+}
+
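As an aside, here is a minimal standalone sketch (hypothetical, independent of this commit) of the invariant nullifySetjmp() relies on: a setjmp call that no longjmp ever targets can only take the "returned 0" path, so folding the call to the constant 0 preserves behavior while avoiding an undefined setjmp symbol at link time.

// Illustrative only; not part of the LLVM patch above.
#include <csetjmp>
#include <cstdio>

int main() {
  std::jmp_buf Buf;
  // Nothing ever longjmps to Buf, so this direct setjmp call can only take the
  // "returned 0" branch -- exactly the case nullifySetjmp() folds to "i32 0".
  if (setjmp(Buf) == 0)
    std::puts("setjmp returned 0 (direct call)");
  else
    std::puts("unreachable: no longjmp targets Buf");
  return 0;
}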
bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
LLVM_DEBUG(dbgs() << "********** Lower Emscripten EH & SjLj **********\n");
@@ -886,6 +954,10 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
EHTypeIDF = getEmscriptenFunction(EHTypeIDTy, "llvm_eh_typeid_for", &M);
}
+ // Functions that contain calls to setjmp but don't have any other
+ // longjmpable calls within them.
+ SmallPtrSet<Function *, 4> SetjmpUsersToNullify;
+
if ((EnableEmSjLj || EnableWasmSjLj) && SetjmpF) {
// Precompute setjmp users
for (User *U : SetjmpF->users()) {
@@ -896,6 +968,8 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
// so can ignore it
if (containsLongjmpableCalls(UserF))
SetjmpUsers.insert(UserF);
+ else
+ SetjmpUsersToNullify.insert(UserF);
} else {
std::string S;
raw_string_ostream SS(S);
@@ -975,6 +1049,14 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
runSjLjOnFunction(*F);
}
+ // Replace unnecessary setjmp calls with 0
+ if ((EnableEmSjLj || EnableWasmSjLj) && !SetjmpUsersToNullify.empty()) {
+ Changed = true;
+ assert(SetjmpF);
+ for (Function *F : SetjmpUsersToNullify)
+ nullifySetjmp(F);
+ }
+
if (!Changed) {
// Delete unused global variables and functions
if (ResumeF)
@@ -1078,20 +1160,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
} else {
// This can't throw, and we don't need this invoke, just replace it with a
// call+branch
- SmallVector<Value *, 16> Args(II->args());
- CallInst *NewCall =
- IRB.CreateCall(II->getFunctionType(), II->getCalledOperand(), Args);
- NewCall->takeName(II);
- NewCall->setCallingConv(II->getCallingConv());
- NewCall->setDebugLoc(II->getDebugLoc());
- NewCall->setAttributes(II->getAttributes());
- II->replaceAllUsesWith(NewCall);
- ToErase.push_back(II);
-
- IRB.CreateBr(II->getNormalDest());
-
- // Remove any PHI node entries from the exception destination
- II->getUnwindDest()->removePredecessor(&BB);
+ changeToCall(II);
}
}
@@ -1243,16 +1312,19 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
// Setjmp transformation
SmallVector<PHINode *, 4> SetjmpRetPHIs;
Function *SetjmpF = M.getFunction("setjmp");
- for (User *U : SetjmpF->users()) {
- auto *CI = dyn_cast<CallInst>(U);
- // FIXME 'invoke' to setjmp can happen when we use Wasm EH + Wasm SjLj, but
- // we don't support two being used together yet.
- if (!CI)
- report_fatal_error("Wasm EH + Wasm SjLj is not fully supported yet");
- BasicBlock *BB = CI->getParent();
+ for (auto *U : make_early_inc_range(SetjmpF->users())) {
+ auto *CB = dyn_cast<CallBase>(U);
+ BasicBlock *BB = CB->getParent();
if (BB->getParent() != &F) // in other function
continue;
+ CallInst *CI = nullptr;
+ // setjmp cannot throw. So if it is an invoke, lower it to a call
+ if (auto *II = dyn_cast<InvokeInst>(CB))
+ CI = llvm::changeToCall(II);
+ else
+ CI = cast<CallInst>(CB);
+
// The tail is everything right after the call, and will be reached once
// when setjmp is called, and later when longjmp returns to the setjmp
BasicBlock *Tail = SplitBlock(BB, CI->getNextNode());
@@ -1568,6 +1640,13 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForEmscriptenSjLj(
I->eraseFromParent();
}
+static BasicBlock *getCleanupRetUnwindDest(const CleanupPadInst *CPI) {
+ for (const User *U : CPI->users())
+ if (const auto *CRI = dyn_cast<CleanupReturnInst>(U))
+ return CRI->getUnwindDest();
+ return nullptr;
+}
+
// Create a catchpad in which we catch a longjmp's env and val arguments, test
// if the longjmp corresponds to one of setjmps in the current function, and if
// so, jump to the setjmp dispatch BB from which we go to one of post-setjmp
@@ -1619,18 +1698,18 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
BasicBlock::Create(C, "setjmp.dispatch", &F, OrigEntry);
cast<BranchInst>(Entry->getTerminator())->setSuccessor(0, SetjmpDispatchBB);
- // Create catch.dispatch.longjmp BB a catchswitch instruction
- BasicBlock *CatchSwitchBB =
+ // Create catch.dispatch.longjmp BB and a catchswitch instruction
+ BasicBlock *CatchDispatchLongjmpBB =
BasicBlock::Create(C, "catch.dispatch.longjmp", &F);
- IRB.SetInsertPoint(CatchSwitchBB);
- CatchSwitchInst *CatchSwitch =
+ IRB.SetInsertPoint(CatchDispatchLongjmpBB);
+ CatchSwitchInst *CatchSwitchLongjmp =
IRB.CreateCatchSwitch(ConstantTokenNone::get(C), nullptr, 1);
// Create catch.longjmp BB and a catchpad instruction
BasicBlock *CatchLongjmpBB = BasicBlock::Create(C, "catch.longjmp", &F);
- CatchSwitch->addHandler(CatchLongjmpBB);
+ CatchSwitchLongjmp->addHandler(CatchLongjmpBB);
IRB.SetInsertPoint(CatchLongjmpBB);
- CatchPadInst *CatchPad = IRB.CreateCatchPad(CatchSwitch, {});
+ CatchPadInst *CatchPad = IRB.CreateCatchPad(CatchSwitchLongjmp, {});
// Wasm throw and catch instructions can throw and catch multiple values, but
// that requires multivalue support in the toolchain, which is currently not
@@ -1696,9 +1775,9 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
// Convert all longjmpable call instructions to invokes that unwind to the
// newly created catch.dispatch.longjmp BB.
- SmallVector<Instruction *, 64> ToErase;
+ SmallVector<CallInst *, 64> LongjmpableCalls;
for (auto *BB = &*F.begin(); BB; BB = BB->getNextNode()) {
- for (Instruction &I : *BB) {
+ for (auto &I : *BB) {
auto *CI = dyn_cast<CallInst>(&I);
if (!CI)
continue;
@@ -1716,29 +1795,66 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj(
// setjmps in this function. We should not convert this call to an invoke.
if (CI == WasmLongjmpCI)
continue;
- ToErase.push_back(CI);
+ LongjmpableCalls.push_back(CI);
+ }
+ }
- // Even if the callee function has attribute 'nounwind', which is true for
- // all C functions, it can longjmp, which means it can throw a Wasm
- // exception now.
- CI->removeFnAttr(Attribute::NoUnwind);
- if (Function *CalleeF = CI->getCalledFunction()) {
- CalleeF->removeFnAttr(Attribute::NoUnwind);
+ for (auto *CI : LongjmpableCalls) {
+ // Even if the callee function has attribute 'nounwind', which is true for
+ // all C functions, it can longjmp, which means it can throw a Wasm
+ // exception now.
+ CI->removeFnAttr(Attribute::NoUnwind);
+ if (Function *CalleeF = CI->getCalledFunction())
+ CalleeF->removeFnAttr(Attribute::NoUnwind);
+
+ // Change it to an invoke and make it unwind to the catch.dispatch.longjmp
+ // BB. If the call is enclosed in another catchpad/cleanuppad scope, unwind
+ // to its parent pad's unwind destination instead to preserve the scope
+ // structure. It will eventually unwind to the catch.dispatch.longjmp.
+ SmallVector<OperandBundleDef, 1> Bundles;
+ BasicBlock *UnwindDest = nullptr;
+ if (auto Bundle = CI->getOperandBundle(LLVMContext::OB_funclet)) {
+ Instruction *FromPad = cast<Instruction>(Bundle->Inputs[0]);
+ while (!UnwindDest && FromPad) {
+ if (auto *CPI = dyn_cast<CatchPadInst>(FromPad)) {
+ UnwindDest = CPI->getCatchSwitch()->getUnwindDest();
+ FromPad = nullptr; // stop searching
+ } else if (auto *CPI = dyn_cast<CleanupPadInst>(FromPad)) {
+ // getCleanupRetUnwindDest() can return nullptr when
+ // 1. This cleanuppad's matching cleanupret unwinds to caller
+ // 2. There is no matching cleanupret because it ends with
+ // unreachable.
+ // In case of 2, we need to traverse the parent pad chain.
+ UnwindDest = getCleanupRetUnwindDest(CPI);
+ FromPad = cast<Instruction>(CPI->getParentPad());
+ }
}
+ }
+ if (!UnwindDest)
+ UnwindDest = CatchDispatchLongjmpBB;
+ changeToInvokeAndSplitBasicBlock(CI, UnwindDest);
+ }
- IRB.SetInsertPoint(CI);
- BasicBlock *Tail = SplitBlock(BB, CI->getNextNode());
- // We will add a new invoke. So remove the branch created when we split
- // the BB
- ToErase.push_back(BB->getTerminator());
- SmallVector<Value *, 8> Args(CI->args());
- InvokeInst *II =
- IRB.CreateInvoke(CI->getFunctionType(), CI->getCalledOperand(), Tail,
- CatchSwitchBB, Args);
- II->takeName(CI);
- II->setDebugLoc(CI->getDebugLoc());
- II->setAttributes(CI->getAttributes());
- CI->replaceAllUsesWith(II);
+ SmallVector<Instruction *, 16> ToErase;
+ for (auto &BB : F) {
+ if (auto *CSI = dyn_cast<CatchSwitchInst>(BB.getFirstNonPHI())) {
+ if (CSI != CatchSwitchLongjmp && CSI->unwindsToCaller()) {
+ IRB.SetInsertPoint(CSI);
+ ToErase.push_back(CSI);
+ auto *NewCSI = IRB.CreateCatchSwitch(CSI->getParentPad(),
+ CatchDispatchLongjmpBB, 1);
+ NewCSI->addHandler(*CSI->handler_begin());
+ NewCSI->takeName(CSI);
+ CSI->replaceAllUsesWith(NewCSI);
+ }
+ }
+
+ if (auto *CRI = dyn_cast<CleanupReturnInst>(BB.getTerminator())) {
+ if (CRI->unwindsToCaller()) {
+ IRB.SetInsertPoint(CRI);
+ ToErase.push_back(CRI);
+ IRB.CreateCleanupRet(CRI->getCleanupPad(), CatchDispatchLongjmpBB);
+ }
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
index 3a0bef8c765c..ca6f3f194645 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
@@ -26,6 +26,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <map>
+
using namespace llvm;
#define DEBUG_TYPE "wasm-lower-global-dtors"
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
index 9d83a75a8247..6a6cac6d956f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -82,7 +82,7 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(
// Split multiple-VN LiveIntervals into multiple LiveIntervals.
SmallVector<LiveInterval *, 4> SplitLIs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
- unsigned Reg = Register::index2VirtReg(I);
+ Register Reg = Register::index2VirtReg(I);
auto &TRI = *MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
if (MRI.reg_nodbg_empty(Reg))
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
index 8b8593ddcbdd..5682cadc1a64 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyPrepareForLiveIntervals.cpp
@@ -95,7 +95,7 @@ bool WebAssemblyPrepareForLiveIntervals::runOnMachineFunction(
// TODO: This is fairly heavy-handed; find a better approach.
//
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
- unsigned Reg = Register::index2VirtReg(I);
+ Register Reg = Register::index2VirtReg(I);
// Skip unused registers.
if (MRI.use_nodbg_empty(Reg))
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index fe127dec8aed..5252db4858b9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -98,7 +98,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "Interesting register intervals:\n");
for (unsigned I = 0; I < NumVRegs; ++I) {
- unsigned VReg = Register::index2VirtReg(I);
+ Register VReg = Register::index2VirtReg(I);
if (MFI.isVRegStackified(VReg))
continue;
// Skip unused registers, which can use $drop.
@@ -135,7 +135,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
for (size_t I = 0, E = SortedIntervals.size(); I < E; ++I) {
LiveInterval *LI = SortedIntervals[I];
- unsigned Old = LI->reg();
+ Register Old = LI->reg();
size_t Color = I;
const TargetRegisterClass *RC = MRI->getRegClass(Old);
@@ -152,7 +152,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
continue_outer:;
}
- unsigned New = SortedIntervals[Color]->reg();
+ Register New = SortedIntervals[Color]->reg();
SlotMapping[I] = New;
Changed |= Old != New;
UsedColors.set(Color);
@@ -168,7 +168,7 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
// Rewrite register operands.
for (size_t I = 0, E = SortedIntervals.size(); I < E; ++I) {
- unsigned Old = SortedIntervals[I]->reg();
+ Register Old = SortedIntervals[I]->reg();
unsigned New = SlotMapping[I];
if (Old != New)
MRI->replaceRegWith(Old, New);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index c73b8a29daeb..76c78cd23130 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -89,7 +89,7 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
// Start the numbering for locals after the arg regs
unsigned CurReg = MFI.getParams().size();
for (unsigned VRegIdx = 0; VRegIdx < NumVRegs; ++VRegIdx) {
- unsigned VReg = Register::index2VirtReg(VRegIdx);
+ Register VReg = Register::index2VirtReg(VRegIdx);
// Skip unused registers.
if (MRI.use_empty(VReg))
continue;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 42419259802e..d3ad47147ac8 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -909,8 +909,8 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
SubsequentUse != Use.getParent()->uses().end()) {
if (!SubsequentDef->isReg() || !SubsequentUse->isReg())
break;
- unsigned DefReg = SubsequentDef->getReg();
- unsigned UseReg = SubsequentUse->getReg();
+ Register DefReg = SubsequentDef->getReg();
+ Register UseReg = SubsequentUse->getReg();
// TODO: This single-use restriction could be relaxed by using tees
if (DefReg != UseReg || !MRI.hasOneUse(DefReg))
break;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index add3c799f4aa..912f61765579 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -42,8 +42,7 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
const std::string &FS,
const TargetMachine &TM)
: WebAssemblyGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS),
- TargetTriple(TT), FrameLowering(),
- InstrInfo(initializeSubtargetDependencies(CPU, FS)), TSInfo(),
+ TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
TLInfo(TM, *this) {}
bool WebAssemblySubtarget::enableAtomicExpand() const {
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 2ba0b97229cc..e9ecff3bf514 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -216,7 +216,7 @@ private:
// The operator on the top of the stack has higher precedence than the
// new operator.
unsigned ParenCount = 0;
- while (1) {
+ while (true) {
// Nothing to process.
if (InfixOperatorStack.empty())
break;
@@ -3030,7 +3030,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
ForcedDispEncoding = DispEncoding_Default;
// Parse pseudo prefixes.
- while (1) {
+ while (true) {
if (Name == "{") {
if (getLexer().isNot(AsmToken::Identifier))
return Error(Parser.getTok().getLoc(), "Unexpected token after '{'");
@@ -3370,7 +3370,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
// Read the operands.
- while(1) {
+ while (true) {
if (ParseOperand(Operands))
return true;
if (HandleAVX512Operand(Operands))
diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
new file mode 100644
index 000000000000..78379290aae9
--- /dev/null
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
@@ -0,0 +1,64 @@
+//===------------------- X86CustomBehaviour.cpp ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements methods from the X86CustomBehaviour class.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86CustomBehaviour.h"
+#include "TargetInfo/X86TargetInfo.h"
+#include "X86InstrInfo.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/WithColor.h"
+
+namespace llvm {
+namespace mca {
+
+void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst,
+ const MCInst &MCI) {
+ switch (MCI.getOpcode()) {
+ case X86::MFENCE:
+ Inst->setLoadBarrier(true);
+ Inst->setStoreBarrier(true);
+ break;
+ case X86::LFENCE:
+ Inst->setLoadBarrier(true);
+ break;
+ case X86::SFENCE:
+ Inst->setStoreBarrier(true);
+ break;
+ }
+}
+
+void X86InstrPostProcess::postProcessInstruction(
+ std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
+ // Currently, we only modify certain instructions' IsALoadBarrier and
+ // IsAStoreBarrier flags.
+ setMemBarriers(Inst, MCI);
+}
+
+} // namespace mca
+} // namespace llvm
+
+using namespace llvm;
+using namespace mca;
+
+static InstrPostProcess *createX86InstrPostProcess(const MCSubtargetInfo &STI,
+ const MCInstrInfo &MCII) {
+ return new X86InstrPostProcess(STI, MCII);
+}
+
+/// Extern function to initialize the targets for the X86 backend
+
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86TargetMCA() {
+ TargetRegistry::RegisterInstrPostProcess(getTheX86_32Target(),
+ createX86InstrPostProcess);
+ TargetRegistry::RegisterInstrPostProcess(getTheX86_64Target(),
+ createX86InstrPostProcess);
+}
diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
new file mode 100644
index 000000000000..24d26751f0a1
--- /dev/null
+++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
@@ -0,0 +1,47 @@
+//===-------------------- X86CustomBehaviour.h -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines the X86CustomBehaviour class which inherits from
+/// CustomBehaviour. This class is used by the tool llvm-mca to enforce
+/// target specific behaviour that is not expressed well enough in the
+/// scheduling model for mca to enforce it automatically.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCA_X86CUSTOMBEHAVIOUR_H
+#define LLVM_LIB_TARGET_X86_MCA_X86CUSTOMBEHAVIOUR_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MCA/CustomBehaviour.h"
+#include "llvm/Support/TargetParser.h"
+
+namespace llvm {
+namespace mca {
+
+class X86InstrPostProcess : public InstrPostProcess {
+ void processWaitCnt(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+
+ /// Called within X86InstrPostProcess to specify certain instructions
+ /// as load and store barriers.
+ void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+
+public:
+ X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
+ : InstrPostProcess(STI, MCII) {}
+
+ ~X86InstrPostProcess() {}
+
+ void postProcessInstruction(std::unique_ptr<Instruction> &Inst,
+ const MCInst &MCI) override;
+};
+
+} // namespace mca
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
index bb12ede3b729..fd82bdcd1a23 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
@@ -40,4 +40,4 @@ protected:
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86ATTINSTPRINTER_H
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86INSTPRINTERCOMMON_H
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 9da0a8129f23..8913e405539e 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -111,6 +111,15 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
{codeview::RegisterId::EFLAGS, X86::EFLAGS},
+ {codeview::RegisterId::ST0, X86::ST0},
+ {codeview::RegisterId::ST1, X86::ST1},
+ {codeview::RegisterId::ST2, X86::ST2},
+ {codeview::RegisterId::ST3, X86::ST3},
+ {codeview::RegisterId::ST4, X86::ST4},
+ {codeview::RegisterId::ST5, X86::ST5},
+ {codeview::RegisterId::ST6, X86::ST6},
+ {codeview::RegisterId::ST7, X86::ST7},
+
{codeview::RegisterId::ST0, X86::FP0},
{codeview::RegisterId::ST1, X86::FP1},
{codeview::RegisterId::ST2, X86::FP2},
@@ -281,8 +290,8 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
{codeview::RegisterId::AMD64_XMM31, X86::XMM31},
};
- for (unsigned I = 0; I < array_lengthof(RegMap); ++I)
- MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg));
+ for (const auto &I : RegMap)
+ MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg));
}
MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index b22f25af26cf..94679e6e3d11 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -23,7 +23,6 @@ class MCCodeEmitter;
class MCStreamer;
class X86Subtarget;
class TargetMachine;
-struct ASanAccessInfo;
class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
const X86Subtarget *Subtarget = nullptr;
diff --git a/llvm/lib/Target/X86/X86CallLowering.h b/llvm/lib/Target/X86/X86CallLowering.h
index ac5b92bf4aae..0ad67cfd3532 100644
--- a/llvm/lib/Target/X86/X86CallLowering.h
+++ b/llvm/lib/Target/X86/X86CallLowering.h
@@ -20,8 +20,6 @@
namespace llvm {
template <typename T> class ArrayRef;
-class DataLayout;
-class MachineRegisterInfo;
class X86TargetLowering;
class X86CallLowering : public CallLowering {
diff --git a/llvm/lib/Target/X86/X86FastTileConfig.cpp b/llvm/lib/Target/X86/X86FastTileConfig.cpp
index 47874e82ff3b..061fff50bcea 100644
--- a/llvm/lib/Target/X86/X86FastTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86FastTileConfig.cpp
@@ -56,8 +56,6 @@ public:
bool isTileLoad(MachineInstr &MI);
bool isTileStore(MachineInstr &MI);
bool isAMXInstr(MachineInstr &MI);
- void getTileStoreShape(MachineInstr &MI,
- SmallVector<MachineOperand *> &ShapedTiles);
MachineInstr *getKeyAMXInstr(MachineInstr *MI);
void getTileShapesCfg(MachineInstr *MI,
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 0a7aea467809..51f2ced321bb 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -149,6 +149,17 @@ static unsigned getLEArOpcode(bool IsLP64) {
return IsLP64 ? X86::LEA64r : X86::LEA32r;
}
+static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) {
+ if (Use64BitReg) {
+ if (isUInt<32>(Imm))
+ return X86::MOV32ri64;
+ if (isInt<32>(Imm))
+ return X86::MOV64ri32;
+ return X86::MOV64ri;
+ }
+ return X86::MOV32ri;
+}
+
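For reference, a hedged standalone sketch of the selection order in the getMOVriOpcode() helper above for a few representative immediates, assuming the usual llvm::isUInt<32>/isInt<32> range semantics; the returned strings merely mirror the X86 opcode names for display.

// Illustrative only; models the range checks of getMOVriOpcode().
#include <cstdint>
#include <cstdio>

static const char *movOpcodeFor(bool Use64BitReg, int64_t Imm) {
  if (Use64BitReg) {
    if (Imm >= 0 && Imm <= int64_t(UINT32_MAX))
      return "MOV32ri64"; // 32-bit move, implicitly zero-extended to 64 bits
    if (Imm >= INT32_MIN && Imm <= INT32_MAX)
      return "MOV64ri32"; // sign-extended 32-bit immediate
    return "MOV64ri";     // full 64-bit immediate
  }
  return "MOV32ri";
}

int main() {
  std::printf("%s\n", movOpcodeFor(true, 0xFFFFFFFFll)); // MOV32ri64
  std::printf("%s\n", movOpcodeFor(true, -8));           // MOV64ri32
  std::printf("%s\n", movOpcodeFor(true, 1ll << 40));    // MOV64ri
  std::printf("%s\n", movOpcodeFor(false, 123));         // MOV32ri
  return 0;
}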
static bool isEAXLiveIn(MachineBasicBlock &MBB) {
for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
unsigned Reg = RegMask.PhysReg;
@@ -237,11 +248,10 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
else
Reg = TRI->findDeadCallerSavedReg(MBB, MBBI);
- unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
unsigned AddSubRROpc =
isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
if (Reg) {
- BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg)
+ BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Reg)
.addImm(Offset)
.setMIFlag(Flag);
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
@@ -267,7 +277,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
Offset = -(Offset - SlotSize);
else
Offset = Offset + SlotSize;
- BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax)
+ BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Offset)), Rax)
.addImm(Offset)
.setMIFlag(Flag);
MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
@@ -434,7 +444,7 @@ void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
/// Emits Dwarf Info specifying offsets of callee saved registers and
/// frame pointer. This is called only when basic block sections are enabled.
-void X86FrameLowering::emitCalleeSavedFrameMoves(
+void X86FrameLowering::emitCalleeSavedFrameMovesFullCFA(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
MachineFunction &MF = *MBB.getParent();
if (!hasFP(MF)) {
@@ -469,7 +479,7 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(
// Calculate offsets.
for (const CalleeSavedInfo &I : CSI) {
int64_t Offset = MFI.getObjectOffset(I.getFrameIdx());
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
if (IsPrologue) {
@@ -637,6 +647,8 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
uint64_t AlignOffset) const {
assert(Offset && "null offset");
+ const bool NeedsDwarfCFI = needsDwarfCFI(MF);
+ const bool HasFP = hasFP(MF);
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
const X86TargetLowering &TLI = *STI.getTargetLowering();
const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
@@ -676,17 +688,36 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
Register FinalStackProbed = Uses64BitFramePtr ? X86::R11
: Is64Bit ? X86::R11D
: X86::EAX;
+
BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
.addReg(StackPtr)
.setMIFlag(MachineInstr::FrameSetup);
// save loop bound
{
- const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+ const unsigned BoundOffset = alignDown(Offset, StackProbeSize);
+ const unsigned SUBOpc = getSUBriOpcode(Uses64BitFramePtr, BoundOffset);
BuildMI(MBB, MBBI, DL, TII.get(SUBOpc), FinalStackProbed)
.addReg(FinalStackProbed)
- .addImm(Offset / StackProbeSize * StackProbeSize)
+ .addImm(BoundOffset)
.setMIFlag(MachineInstr::FrameSetup);
+
+ // while in the loop, use loop-invariant reg for CFI,
+ // instead of the stack pointer, which changes during the loop
+ if (!HasFP && NeedsDwarfCFI) {
+ // x32 uses the same DWARF register numbers as x86-64, so there is no
+ // DWARF register number for r11d; we must use r11 instead.
+ const Register DwarfFinalStackProbed =
+ STI.isTarget64BitILP32()
+ ? Register(getX86SubSuperRegister(FinalStackProbed, 64))
+ : FinalStackProbed;
+
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createDefCfaRegister(
+ nullptr, TRI->getDwarfRegNum(DwarfFinalStackProbed, true)));
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createAdjustCfaOffset(nullptr, BoundOffset));
+ }
}
// allocate a page
@@ -725,15 +756,30 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop(
MBB.addSuccessor(testMBB);
// handle tail
- unsigned TailOffset = Offset % StackProbeSize;
+ const unsigned TailOffset = Offset % StackProbeSize;
+ MachineBasicBlock::iterator TailMBBIter = tailMBB->begin();
if (TailOffset) {
const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, TailOffset);
- BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr)
+ BuildMI(*tailMBB, TailMBBIter, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr)
.addImm(TailOffset)
.setMIFlag(MachineInstr::FrameSetup);
}
+ // after the loop, switch back to stack pointer for CFI
+ if (!HasFP && NeedsDwarfCFI) {
+ // x32 uses the same DWARF register numbers as x86-64, so there is no
+ // DWARF register number for esp; we must use rsp instead.
+ const Register DwarfStackPtr =
+ STI.isTarget64BitILP32()
+ ? Register(getX86SubSuperRegister(StackPtr, 64))
+ : Register(StackPtr);
+
+ BuildCFI(*tailMBB, TailMBBIter, DL,
+ MCCFIInstruction::createDefCfaRegister(
+ nullptr, TRI->getDwarfRegNum(DwarfStackPtr, true)));
+ }
+
// Update Live In information
recomputeLiveIns(*testMBB);
recomputeLiveIns(*tailMBB);
@@ -1705,19 +1751,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Handle the 64-bit Windows ABI case where we need to call __chkstk.
// Function prologue is responsible for adjusting the stack pointer.
int64_t Alloc = isEAXAlive ? NumBytes - 8 : NumBytes;
- if (isUInt<32>(Alloc)) {
- BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
- .addImm(Alloc)
- .setMIFlag(MachineInstr::FrameSetup);
- } else if (isInt<32>(Alloc)) {
- BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri32), X86::RAX)
- .addImm(Alloc)
- .setMIFlag(MachineInstr::FrameSetup);
- } else {
- BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
- .addImm(Alloc)
- .setMIFlag(MachineInstr::FrameSetup);
- }
+ BuildMI(MBB, MBBI, DL, TII.get(getMOVriOpcode(Is64Bit, Alloc)), X86::RAX)
+ .addImm(Alloc)
+ .setMIFlag(MachineInstr::FrameSetup);
} else {
// Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
// We'll also use 4 already allocated bytes for EAX.
@@ -2497,7 +2533,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
// Assign slots for GPRs. It increases frame size.
for (CalleeSavedInfo &I : llvm::reverse(CSI)) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;
@@ -2514,7 +2550,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
// Assign slots for XMMs.
for (CalleeSavedInfo &I : llvm::reverse(CSI)) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
continue;
@@ -2560,7 +2596,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
const MachineFunction &MF = *MBB.getParent();
unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;
@@ -2594,7 +2630,7 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
// Make XMM regs spilled. X86 does not have ability of push/pop XMM.
// It can be done by spilling XMMs to stack frame.
for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
continue;
@@ -2672,7 +2708,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(
// Reload XMMs from stack frame.
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (X86::GR64RegClass.contains(Reg) ||
X86::GR32RegClass.contains(Reg))
continue;
@@ -2689,7 +2725,7 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(
// POP GPRs.
unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
if (!X86::GR64RegClass.contains(Reg) &&
!X86::GR32RegClass.contains(Reg))
continue;
@@ -2944,15 +2980,16 @@ void X86FrameLowering::adjustForSegmentedStacks(
const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
- const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri;
if (IsNested)
BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);
- BuildMI(allocMBB, DL, TII.get(MOVri), Reg10)
- .addImm(StackSize);
- BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
- .addImm(X86FI->getArgumentStackSize());
+ BuildMI(allocMBB, DL, TII.get(getMOVriOpcode(IsLP64, StackSize)), Reg10)
+ .addImm(StackSize);
+ BuildMI(allocMBB, DL,
+ TII.get(getMOVriOpcode(IsLP64, X86FI->getArgumentStackSize())),
+ Reg11)
+ .addImm(X86FI->getArgumentStackSize());
} else {
BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
.addImm(X86FI->getArgumentStackSize());
diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index e18be0d26321..987facbfeae4 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -65,9 +65,8 @@ public:
void inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &PrologMBB) const override;
- void
- emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) const override;
+ void emitCalleeSavedFrameMovesFullCFA(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const override;
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 7ed05fd0331d..5b90c67deae6 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -80,9 +80,9 @@ namespace {
bool NegateIndex = false;
X86ISelAddressMode()
- : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
- Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
- MCSym(nullptr), JT(-1), SymbolFlags(X86II::MO_NO_FLAG) {}
+ : BaseType(RegBase), Base_FrameIndex(0), Scale(1), Disp(0), GV(nullptr),
+ CP(nullptr), BlockAddr(nullptr), ES(nullptr), MCSym(nullptr), JT(-1),
+ SymbolFlags(X86II::MO_NO_FLAG) {}
bool hasSymbolicDisplacement() const {
return GV != nullptr || CP != nullptr || ES != nullptr ||
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6f6361b6757b..aff72452af6c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1096,6 +1096,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTR, VT, Custom);
}
+ setOperationAction(ISD::FSHL, MVT::v16i8, Custom);
+ setOperationAction(ISD::FSHR, MVT::v16i8, Custom);
+ setOperationAction(ISD::FSHL, MVT::v4i32, Custom);
+ setOperationAction(ISD::FSHR, MVT::v4i32, Custom);
+
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
@@ -1284,6 +1289,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTR, VT, Custom);
}
+ setOperationAction(ISD::FSHL, MVT::v32i8, Custom);
+ setOperationAction(ISD::FSHR, MVT::v32i8, Custom);
+ setOperationAction(ISD::FSHL, MVT::v8i32, Custom);
+ setOperationAction(ISD::FSHR, MVT::v8i32, Custom);
+
// These types need custom splitting if their input is a 128-bit vector.
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
@@ -1688,6 +1698,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
}
+ setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
+ setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
+ setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
+ setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
+
if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
@@ -5475,10 +5490,9 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const {
- for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
- if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
+ for (const APFloat &FPImm : LegalFPImmediates)
+ if (Imm.bitwiseIsEqual(FPImm))
return true;
- }
return false;
}
@@ -6132,6 +6146,29 @@ static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
return DAG.getBitcast(VT, Vec);
}
+// Helper to determine whether all ops are subvectors extracted from a single
+// source. If we allow commutation, they don't have to be in order (Lo/Hi).
+static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
+ if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ LHS.getValueType() != RHS.getValueType() ||
+ LHS.getOperand(0) != RHS.getOperand(0))
+ return SDValue();
+
+ SDValue Src = LHS.getOperand(0);
+ if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
+ return SDValue();
+
+ unsigned NumElts = LHS.getValueType().getVectorNumElements();
+ if ((LHS.getConstantOperandAPInt(1) == 0 &&
+ RHS.getConstantOperandAPInt(1) == NumElts) ||
+ (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
+ LHS.getConstantOperandAPInt(1) == NumElts))
+ return Src;
+
+ return SDValue();
+}
+
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
const SDLoc &dl, unsigned vectorWidth) {
EVT VT = Vec.getValueType();
@@ -6850,8 +6887,8 @@ static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
- if (DAG.ComputeMinSignedBits(LHS) <= EltSizeInBits &&
- DAG.ComputeMinSignedBits(RHS) <= EltSizeInBits)
+ if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
+ DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
}
@@ -7907,6 +7944,7 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements to the
// destination value type.
+// TODO: Merge into getTargetShuffleInputs()
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
@@ -8355,6 +8393,9 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
APInt &KnownUndef, APInt &KnownZero,
const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return false; // Limit search depth.
+
EVT VT = Op.getValueType();
if (!VT.isSimple() || !VT.isVector())
return false;
@@ -9233,8 +9274,13 @@ static bool isFoldableUseOfShuffle(SDNode *N) {
return true;
if (Opc == ISD::BITCAST) // Ignore bitcasts
return isFoldableUseOfShuffle(U);
- if (N->hasOneUse())
+ if (N->hasOneUse()) {
+ // TODO: There may be some general way to know if an SDNode can be
+ // folded. We currently only know whether an MI is foldable.
+ if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
+ return false;
return true;
+ }
}
return false;
}
@@ -10055,13 +10101,18 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
if (IsSubAdd)
return SDValue();
- // Do not generate X86ISD::ADDSUB node for 512-bit types even though
- // the ADDSUB idiom has been successfully recognized. There are no known
- // X86 targets with 512-bit ADDSUB instructions!
- // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
- // recognition.
- if (VT.is512BitVector())
- return SDValue();
+ // There are no known X86 targets with 512-bit ADDSUB instructions!
+ // Convert to blend(fsub,fadd).
+ if (VT.is512BitVector()) {
+ SmallVector<int> Mask;
+ for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
+ Mask.push_back(I);
+ Mask.push_back(I + E + 1);
+ }
+ SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
+ SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
+ return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
+ }
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
@@ -12162,12 +12213,13 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
return SDValue();
}
-/// Check whether a compaction lowering can be done by dropping even
-/// elements and compute how many times even elements must be dropped.
+/// Check whether a compaction lowering can be done by dropping even/odd
+/// elements and compute how many times even/odd elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
+/// (even)
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
@@ -12175,16 +12227,20 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
///
+/// (odd)
+/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
+/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
+///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
-/// \returns N above, or the number of times even elements must be dropped if
-/// there is such a number. Otherwise returns zero.
-static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
- bool IsSingleInput) {
+/// \returns N above, or the number of times even/odd elements must be dropped
+/// if there is such a number. Otherwise returns zero.
+static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
+ bool IsSingleInput) {
// The modulus for the shuffle vector entries is based on whether this is
// a single input or not.
int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
@@ -12192,6 +12248,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
"We should only be called with masks with a power-of-2 size!");
uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
+ int Offset = MatchEven ? 0 : 1;
// We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
// and 2^3 simultaneously. This is because we may have ambiguity with
@@ -12210,7 +12267,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
uint64_t N = j + 1;
// The shuffle mask must be equal to (i * 2^N) % M.
- if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
+ if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
IsAnyViable = true;
else
ViableForN[j] = false;
@@ -15724,7 +15781,7 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
// We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
// be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
- int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
+ int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
!Subtarget.hasVLX()) {
// Check if this is part of a 256-bit vector truncation.
@@ -15758,6 +15815,20 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Result;
}
+ // When compacting odd (upper) elements, use PACKSS pre-SSE41.
+ int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
+ if (NumOddDrops == 1) {
+ bool HasSSE41 = Subtarget.hasSSE41();
+ V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
+ DAG.getBitcast(MVT::v4i32, V1),
+ DAG.getTargetConstant(16, DL, MVT::i8));
+ V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
+ DAG.getBitcast(MVT::v4i32, V2),
+ DAG.getTargetConstant(16, DL, MVT::i8));
+ return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
+ MVT::v8i16, V1, V2);
+ }
+
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
@@ -16024,7 +16095,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Check for compaction patterns.
bool IsSingleInput = V2.isUndef();
- int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
+ int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
@@ -16135,6 +16206,19 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Result;
}
+ int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
+ if (NumOddDrops == 1) {
+ V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
+ DAG.getBitcast(MVT::v8i16, V1),
+ DAG.getTargetConstant(8, DL, MVT::i8));
+ if (!IsSingleInput)
+ V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
+ DAG.getBitcast(MVT::v8i16, V2),
+ DAG.getTargetConstant(8, DL, MVT::i8));
+ return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
+ IsSingleInput ? V1 : V2);
+ }
+
// Handle multi-input cases by blending/unpacking single-input shuffles.
if (NumV2Elements > 0)
return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
@@ -16538,20 +16622,19 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
+ bool AllLanes;
if (!Subtarget.hasAVX2()) {
bool LaneCrossing[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
- if (!LaneCrossing[0] || !LaneCrossing[1])
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ AllLanes = LaneCrossing[0] && LaneCrossing[1];
} else {
bool LaneUsed[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
LaneUsed[(Mask[i] % Size) / LaneSize] = true;
- if (!LaneUsed[0] || !LaneUsed[1])
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ AllLanes = LaneUsed[0] && LaneUsed[1];
}
// TODO - we could support shuffling V2 in the Flipped input.
@@ -16569,6 +16652,11 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
"In-lane shuffle mask expected");
+ // If we're not using both lanes and the in-lane mask is not repeating, then
+ // we're better off splitting.
+ if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+
// Flip the lanes, and shuffle the results which should now be in-lane.
MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
SDValue Flipped = DAG.getBitcast(PVT, V1);
@@ -22598,7 +22686,7 @@ SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
/// This mode isn't supported in hardware on X86. But as long as we aren't
/// compiling with trapping math, we can emulate this with
-/// floor(X + copysign(nextafter(0.5, 0.0), X)).
+/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
SDLoc dl(Op);
@@ -23157,10 +23245,10 @@ static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
// For equality comparisons try to use SIGN_EXTEND if the input was
// truncate from something with enough sign bits.
if (Op0.getOpcode() == ISD::TRUNCATE) {
- if (DAG.ComputeMinSignedBits(Op0.getOperand(0)) <= 16)
+ if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
ExtendOp = ISD::SIGN_EXTEND;
} else if (Op1.getOpcode() == ISD::TRUNCATE) {
- if (DAG.ComputeMinSignedBits(Op1.getOperand(0)) <= 16)
+ if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
ExtendOp = ISD::SIGN_EXTEND;
}
}
@@ -24543,32 +24631,27 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
} else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
-
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
- // Apply further optimizations for special cases
- // (select (x != 0), -1, 0) -> neg & sbb
- // (select (x == 0), 0, -1) -> neg & sbb
- if (isNullConstant(Y) &&
- (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
+ // 'X - 1' sets the carry flag if X == 0.
+ // '0 - X' sets the carry flag if X != 0.
+ // Convert the carry flag to a -1/0 mask with sbb:
+ // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
+ // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
+ // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
+ // select (X == 0), -1, Y --> X - 1; or (sbb), Y
+ SDValue Sub;
+ if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
- SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
- Zero = DAG.getConstant(0, DL, Op.getValueType());
- return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
+ Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
+ } else {
+ SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
+ Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
}
-
- Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
- CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
-
- SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
- SDValue Res = // Res = 0 or -1.
- DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
-
- if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
- Res = DAG.getNOT(DL, Res, Res.getValueType());
-
- return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
+ SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ Sub.getValue(1));
+ return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
Cmp.getOperand(0).getOpcode() == ISD::AND &&
isOneConstant(Cmp.getOperand(0).getOperand(1))) {
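The hunk above documents the 'X - 1' / '0 - X' carry identities in terms of X86 nodes; a minimal scalar C++ model of the same idea follows (an explicit borrow stands in for the carry flag, and the function name is hypothetical).

// Illustrative only; models select (X != 0), -1, Y via 0 - X, sbb, or.
#include <cstdint>
#include <cstdio>

static uint32_t selectNeAllOnes(uint32_t X, uint32_t Y) {
  uint32_t Carry = (0u < X) ? 1u : 0u; // borrow out of '0 - X': set iff X != 0
  uint32_t Sbb = 0u - Carry;           // SETCC_CARRY: all-ones if carry, else 0
  return Sbb | Y;                      // -1 when X != 0, otherwise Y
}

int main() {
  for (uint32_t X : {0u, 1u, 42u})
    for (uint32_t Y : {0u, 7u})
      std::printf("X=%u Y=%u -> %d (reference %d)\n", X, Y,
                  int(selectNeAllOnes(X, Y)), X != 0 ? -1 : int(Y));
  return 0;
}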
@@ -25725,9 +25808,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
- SDValue PreservedSrc,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+ SDValue PreservedSrc,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
unsigned OpcodeSelect = ISD::VSELECT;
@@ -29743,20 +29826,106 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
bool IsFSHR = Op.getOpcode() == ISD::FSHR;
if (VT.isVector()) {
- assert(Subtarget.hasVBMI2() && "Expected VBMI2");
+ APInt APIntShiftAmt;
+ bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
- if (IsFSHR)
- std::swap(Op0, Op1);
+ if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
+ if (IsFSHR)
+ std::swap(Op0, Op1);
- APInt APIntShiftAmt;
- if (X86::isConstantSplat(Amt, APIntShiftAmt)) {
- uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
- SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
- return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
- {Op0, Op1, Imm}, DAG, Subtarget);
+ if (IsCstSplat) {
+ uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
+ SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
+ return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
+ {Op0, Op1, Imm}, DAG, Subtarget);
+ }
+ return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
+ {Op0, Op1, Amt}, DAG, Subtarget);
}
- return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
- {Op0, Op1, Amt}, DAG, Subtarget);
+ assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
+ VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
+ "Unexpected funnel shift type!");
+
+ // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
+ // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
+ if (IsCstSplat)
+ return SDValue();
+
+ SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
+ SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
+ bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
+
+ unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
+ MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
+
+ // Split 256-bit integers on XOP/pre-AVX2 targets.
+ // Split 512-bit integers on non 512-bit BWI targets.
+ if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 32) ||
+ !Subtarget.hasAVX2())) ||
+ (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
+ EltSizeInBits < 32)) {
+ // Pre-mask the amount modulo using the wider vector.
+ Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
+ return splitVectorOp(Op, DAG);
+ }
+
+ // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
+ if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
+ if (SDValue ScalarAmt = DAG.getSplatValue(AmtMod)) {
+ SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
+ SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
+ ScalarAmt = DAG.getZExtOrTrunc(ScalarAmt, DL, MVT::i32);
+ Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt, Subtarget,
+ DAG);
+ Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt, Subtarget,
+ DAG);
+ return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
+ }
+ }
+
+ MVT WideSVT = MVT::getIntegerVT(
+ std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
+ MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
+
+ // If per-element shifts are legal, fall back to generic expansion.
+ if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
+ return SDValue();
+
+ // Attempt to fold as:
+ // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
+ // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
+ if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
+ supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
+ Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
+ Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
+ AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
+ Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
+ EltSizeInBits, DAG);
+ SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
+ Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
+ if (!IsFSHR)
+ Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
+ EltSizeInBits, DAG);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
+ }
+
+ // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
+ if ((IsCst && !IsFSHR && EltSizeInBits == 8) ||
+ supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
+ SDValue Z = DAG.getConstant(0, DL, VT);
+ SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
+ SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
+ SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
+ SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
+ SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
+ SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
+ return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
+ }
+
+ // Fall back to generic expansion.
+ return SDValue();
}
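Aside (not part of the patch): the widened fold in the comments above can be sanity-checked with a scalar model of one 8-bit lane; the helper names are mine:

    // Scalar model of the funnel-shift-by-widening fold for 8-bit elements:
    //   fshl(x,y,z) = (((x << 8) | y) << (z & 7)) >> 8
    //   fshr(x,y,z) = (((x << 8) | y) >> (z & 7))
    #include <cassert>
    #include <cstdint>

    static uint8_t fshl8(uint8_t X, uint8_t Y, unsigned Z) {
      unsigned Wide = ((unsigned)X << 8) | Y;
      return (uint8_t)((Wide << (Z & 7)) >> 8);
    }

    static uint8_t fshr8(uint8_t X, uint8_t Y, unsigned Z) {
      unsigned Wide = ((unsigned)X << 8) | Y;
      return (uint8_t)(Wide >> (Z & 7));
    }

    int main() {
      for (unsigned X = 0; X < 256; ++X)
        for (unsigned Z = 0; Z < 16; ++Z) {
          unsigned S = Z & 7;
          uint8_t Y = (uint8_t)(X * 37 + 11);
          // Reference funnel shifts (shift amount already reduced mod 8).
          uint8_t RefL = (uint8_t)(S ? ((X << S) | (Y >> (8 - S))) : X);
          uint8_t RefR = (uint8_t)(S ? ((X << (8 - S)) | (Y >> S)) : Y);
          assert(fshl8((uint8_t)X, Y, Z) == RefL);
          assert(fshr8((uint8_t)X, Y, Z) == RefR);
        }
      return 0;
    }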
assert(
(VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
@@ -29901,8 +30070,9 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// Attempt to fold as unpack(x,x) << zext(splat(y)):
// rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
// rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
- // TODO: Handle vXi16 cases.
- if (EltSizeInBits == 8 || EltSizeInBits == 32) {
+ // TODO: Handle vXi16 cases on all targets.
+ if (EltSizeInBits == 8 || EltSizeInBits == 32 ||
+ (IsROTL && EltSizeInBits == 16 && !Subtarget.hasAVX())) {
if (SDValue BaseRotAmt = DAG.getSplatValue(AmtMod)) {
unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
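Aside (not part of the patch): the rotate fold commented above is the same trick, with unpack(x,x) forming the doubled value x:x so one wide shift performs the rotation. A scalar sketch for one 8-bit lane:

    // rotl(x,s) is the high byte of ((x:x) << (s & 7)); rotr(x,s) is the low
    // byte of ((x:x) >> (s & 7)). Checked against plain rotate expressions.
    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned X = 0; X < 256; ++X)
        for (unsigned S = 0; S < 8; ++S) {
          unsigned XX = (X << 8) | X;                        // unpack(x,x)
          uint8_t RotL = (uint8_t)((XX << S) >> 8);
          uint8_t RotR = (uint8_t)(XX >> S);
          uint8_t RefL = (uint8_t)((X << S) | (X >> ((8 - S) & 7)));
          uint8_t RefR = (uint8_t)((X >> S) | (X << ((8 - S) & 7)));
          assert(RotL == RefL && RotR == RefR);
        }
      return 0;
    }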
@@ -33013,7 +33183,7 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
// AVX512BW has shifts such as vpsllvw.
if (Subtarget.hasBWI() && Bits == 16)
- return false;
+ return false;
// Otherwise, it's significantly cheaper to shift by a scalar amount than by a
// fully general vector.
@@ -33029,6 +33199,11 @@ bool X86TargetLowering::isBinOp(unsigned Opcode) const {
case X86ISD::FMAX:
case X86ISD::FMIN:
case X86ISD::FANDN:
+ case X86ISD::VPSHA:
+ case X86ISD::VPSHL:
+ case X86ISD::VSHLV:
+ case X86ISD::VSRLV:
+ case X86ISD::VSRAV:
return true;
}
@@ -33285,9 +33460,7 @@ bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
MachineBasicBlock *BB) {
// Scan forward through BB for a use/def of EFLAGS.
- for (MachineBasicBlock::iterator miI = std::next(Itr), miE = BB->end();
- miI != miE; ++miI) {
- const MachineInstr& mi = *miI;
+ for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
if (mi.readsRegister(X86::EFLAGS))
return true;
// If we found a def, we can stop searching.
@@ -38724,6 +38897,8 @@ static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
case X86ISD::VBROADCAST:
case X86ISD::MOVDDUP:
case X86ISD::PSHUFD:
+ case X86ISD::PSHUFHW:
+ case X86ISD::PSHUFLW:
case X86ISD::VPERMI:
case X86ISD::VPERMILPI: {
if (N.getOperand(0).getValueType() == ShuffleVT &&
@@ -38877,9 +39052,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
return R;
- if (SDValue R = canonicalizeShuffleWithBinOps(N, DAG, DL))
- return R;
-
// Handle specific target shuffles.
switch (Opcode) {
case X86ISD::MOVDDUP: {
@@ -39844,6 +40016,12 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
DCI))
return SDValue(N, 0);
+
+ // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
+ // Perform this after other shuffle combines to allow inner shuffles to be
+ // combined away first.
+ if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, SDLoc(N)))
+ return BinOp;
}
return SDValue();
@@ -40037,6 +40215,24 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
break;
}
+ case X86ISD::VPSHA:
+ case X86ISD::VPSHL:
+ case X86ISD::VSHLV:
+ case X86ISD::VSRLV:
+ case X86ISD::VSRAV: {
+ APInt LHSUndef, LHSZero;
+ APInt RHSUndef, RHSZero;
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
+ Depth + 1))
+ return true;
+ if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
+ Depth + 1))
+ return true;
+ KnownZero = LHSZero;
+ break;
+ }
case X86ISD::KSHIFTL: {
SDValue Src = Op.getOperand(0);
auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
@@ -41799,6 +41995,37 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// (mul (zext a), (sext, b))
+static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
+ SDValue &Op1) {
+ Op0 = Mul.getOperand(0);
+ Op1 = Mul.getOperand(1);
+
+ // Operand 1 should be the sign-extended value.
+ if (Op0.getOpcode() == ISD::SIGN_EXTEND)
+ std::swap(Op0, Op1);
+
+ auto IsFreeTruncation = [](SDValue &Op) -> bool {
+ if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
+ Op.getOpcode() == ISD::SIGN_EXTEND) &&
+ Op.getOperand(0).getScalarValueSizeInBits() <= 8)
+ return true;
+
+ auto *BV = dyn_cast<BuildVectorSDNode>(Op);
+ return (BV && BV->isConstant());
+ };
+
+ // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
+ // value, we need to check that Op0 is a zero-extended value. Op1 should be a
+ // signed value, so we just check its sign bits.
+ if ((IsFreeTruncation(Op0) &&
+ DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
+ (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
+ return true;
+
+ return false;
+}
+
// Given a ABS node, detect the following pattern:
// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
// This is useful as it is the input into a SAD pattern.
@@ -41820,6 +42047,50 @@ static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
return true;
}
+static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
+ unsigned &LogBias, const SDLoc &DL,
+ const X86Subtarget &Subtarget) {
+ // Extend or truncate to MVT::i8 first.
+ MVT Vi8VT =
+ MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
+ LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
+ RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
+
+ // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element:
+ // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
+ // The src A, B element type is i8, but the dst C element type is i32.
+ // When we calculate the reduction stages we use the src vector type vXi8,
+ // so we need a log-bias of 2 to avoid two extra stages.
+ LogBias = 2;
+
+ unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
+ if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
+ RegSize = std::max(512u, RegSize);
+
+ // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
+ // fill in the missing vector elements with 0.
+ unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
+ SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
+ Ops[0] = LHS;
+ MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
+ SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+ Ops[0] = RHS;
+ SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
+
+ // Actually build the DotProduct, split as 256/512 bits for
+ // AVXVNNI/AVX512VNNI.
+ auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
+ ArrayRef<SDValue> Ops) {
+ MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
+ return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
+ };
+ MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
+ SDValue Zero = DAG.getConstant(0, DL, DpVT);
+
+ return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
+ DpBuilder, false);
+}
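Aside (not part of the patch): as the comments above describe, each i32 lane accumulates a u8 x s8 4-way dot product, which is also why two reduction stages (LogBias = 2) are already absorbed by the instruction. A scalar sketch of one lane; this is an illustration, not the intrinsic API:

    // One i32 lane of the VPDPBUSD-style operation described above:
    // C += A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + A[3]*B[3], with A unsigned i8
    // and B signed i8; each product fits in a signed i16.
    #include <cassert>
    #include <cstdint>

    static int32_t dpbusdLane(int32_t C, const uint8_t A[4], const int8_t B[4]) {
      int32_t Acc = C;
      for (int I = 0; I < 4; ++I) {
        int16_t Prod = (int16_t)((int32_t)A[I] * (int32_t)B[I]); // u8*s8 fits i16
        Acc += Prod;                                             // sign-extended add
      }
      return Acc;
    }

    int main() {
      const uint8_t A[4] = {255, 1, 128, 7};
      const int8_t B[4] = {-128, 127, -1, 3};
      int32_t Ref = 0;
      for (int I = 0; I < 4; ++I)
        Ref += (int32_t)A[I] * (int32_t)B[I];
      assert(dpbusdLane(0, A, B) == Ref);
      return 0;
    }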
+
// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
// to these zexts.
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
@@ -41967,18 +42238,19 @@ static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
- // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
- // PCMPEQQ (SSE41+), use PCMPEQD instead.
- if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
- Match.getOpcode() == ISD::SETCC &&
- ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
+ // For all_of(setcc(x,y,eq))
+ // - avoid vXi64 comparisons without PCMPEQQ (SSE41+), use PCMPEQD.
+ // - avoid vXi16 comparisons, use PMOVMSKB(PCMPEQB()).
+ if (BinOp == ISD::AND && Match.getOpcode() == ISD::SETCC &&
cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
ISD::CondCode::SETEQ) {
SDValue Vec = Match.getOperand(0);
- if (Vec.getValueType().getScalarType() == MVT::i64 &&
- (2 * NumElts) <= MaxElts) {
+ EVT VecSVT = Vec.getValueType().getScalarType();
+ if ((VecSVT == MVT::i16 && !Subtarget.hasBWI()) ||
+ (VecSVT == MVT::i64 && !Subtarget.hasSSE41())) {
NumElts *= 2;
- EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ VecSVT = VecSVT.getHalfSizedIntegerVT(*DAG.getContext());
+ EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumElts);
MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
Match = DAG.getSetCC(
DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
@@ -42069,6 +42341,77 @@ static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
}
+static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
+ return SDValue();
+
+ EVT ExtractVT = Extract->getValueType(0);
+ // Verify the type we're extracting is i32, as the output element type of
+ // vpdpbusd is i32.
+ if (ExtractVT != MVT::i32)
+ return SDValue();
+
+ EVT VT = Extract->getOperand(0).getValueType();
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
+ return SDValue();
+
+ // Match shuffle + add pyramid.
+ ISD::NodeType BinOp;
+ SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
+
+ // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
+ // done by vpdpbusd computes a signed 16-bit product that will be
+ // sign-extended before adding into the accumulator.
+ // TODO:
+ // We also need to verify that the multiply has at least 2x the number of bits
+ // of the input. We shouldn't match
+ // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
+ // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
+ // Root = Root.getOperand(0);
+
+ // If there was a match, we want Root to be a mul.
+ if (!Root || Root.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ // Check whether we have an extend and mul pattern
+ SDValue LHS, RHS;
+ if (!detectExtMul(DAG, Root, LHS, RHS))
+ return SDValue();
+
+ // Create the dot product instruction.
+ SDLoc DL(Extract);
+ unsigned StageBias;
+ SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
+
+ // If the original vector was wider than 4 elements, sum over the results
+ // in the DP vector.
+ unsigned Stages = Log2_32(VT.getVectorNumElements());
+ EVT DpVT = DP.getValueType();
+
+ if (Stages > StageBias) {
+ unsigned DpElems = DpVT.getVectorNumElements();
+
+ for (unsigned i = Stages - StageBias; i > 0; --i) {
+ SmallVector<int, 16> Mask(DpElems, -1);
+ for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
+ Mask[j] = MaskEnd + j;
+
+ SDValue Shuffle =
+ DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
+ DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
+ }
+ }
+
+ // Return the lowest ExtractSizeInBits bits.
+ EVT ResVT =
+ EVT::getVectorVT(*DAG.getContext(), ExtractVT,
+ DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
+ DP = DAG.getBitcast(ResVT, DP);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
+ Extract->getOperand(1));
+}
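Aside (not part of the patch): the Stages/StageBias loop above is the usual log2 halving reduction over the i32 lanes of the dot-product result. A scalar sketch of the same pyramid; names are mine:

    // Log2 halving reduction as performed by the shuffle+add loop above:
    // each stage adds the upper half of the vector onto the lower half.
    #include <cassert>
    #include <numeric>
    #include <vector>

    static int reducePyramid(std::vector<int> V) {
      for (size_t Half = V.size() / 2; Half >= 1; Half /= 2)
        for (size_t J = 0; J < Half; ++J)
          V[J] += V[J + Half]; // lane J gets lane (Half + J), like the shuffle mask
      return V[0];
    }

    int main() {
      std::vector<int> V = {1, 2, 3, 4, 5, 6, 7, 8}; // power-of-two length
      assert(reducePyramid(V) == std::accumulate(V.begin(), V.end(), 0));
      return 0;
    }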
+
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// PSADBW is only supported on SSE2 and up.
@@ -42676,6 +43019,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
return SAD;
+ if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
+ return VPDPBUSD;
+
// Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
return Cmp;
@@ -42903,6 +43249,15 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
// multiplier, convert to 'and' + 'add'.
const APInt &TrueVal = TrueC->getAPIntValue();
const APInt &FalseVal = FalseC->getAPIntValue();
+
+ // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
+ if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
+ Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ if (CC == ISD::SETEQ || CC == ISD::SETNE)
+ return SDValue();
+ }
+
bool OV;
APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
if (OV)
@@ -44052,6 +44407,23 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
// TESTZ(X,-1) == TESTZ(X,X)
if (ISD::isBuildVectorAllOnes(Op1.getNode()))
return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
+
+ // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
+ // TODO: Add COND_NE handling?
+ if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
+ SDValue Src0 = peekThroughBitcasts(Op0);
+ SDValue Src1 = peekThroughBitcasts(Op1);
+ if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
+ Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
+ peekThroughBitcasts(Src0.getOperand(1)), true);
+ Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
+ peekThroughBitcasts(Src1.getOperand(1)), true);
+ if (Src0 && Src1)
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(MVT::v4i64, Src0),
+ DAG.getBitcast(MVT::v4i64, Src1));
+ }
+ }
}
return SDValue();
@@ -44117,21 +44489,58 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
BCNumEltBits > NumEltBits &&
DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
SDLoc DL(EFLAGS);
- unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
+ APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
DAG.getConstant(CmpMask, DL, MVT::i32));
}
}
+ // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
+ // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
+ // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
+ // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
+ if (VecVT.is256BitVector()) {
+ SmallVector<SDValue> Ops;
+ if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) &&
+ Ops.size() == 2) {
+ SDLoc DL(EFLAGS);
+ EVT SubVT = Ops[0].getValueType();
+ APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
+ SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT, Ops);
+ V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
+ DAG.getConstant(CmpMask, DL, MVT::i32));
+ }
+ }
+
// MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
// MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
+ // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)).
+ // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)).
if (IsAllOf && Subtarget.hasSSE41()) {
+ MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
SDValue BC = peekThroughBitcasts(Vec);
- if (BC.getOpcode() == X86ISD::PCMPEQ &&
- ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
- MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
- SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
+ if (BC.getOpcode() == X86ISD::PCMPEQ) {
+ SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(),
+ BC.getOperand(0), BC.getOperand(1));
+ V = DAG.getBitcast(TestVT, V);
+ return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
+ }
+ // Check for 256-bit split vector cases.
+ if (BC.getOpcode() == ISD::AND &&
+ BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
+ BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
+ SDValue LHS = BC.getOperand(0);
+ SDValue RHS = BC.getOperand(1);
+ LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(),
+ LHS.getOperand(0), LHS.getOperand(1));
+ RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(),
+ RHS.getOperand(0), RHS.getOperand(1));
+ LHS = DAG.getBitcast(TestVT, LHS);
+ RHS = DAG.getBitcast(TestVT, RHS);
+ SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
}
}
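Aside (not part of the patch): the PCMPEQ cases above rest on the fact that all lanes compare equal exactly when every lane-wise difference is zero, which is what PTESTZ(SUB(X,Y),SUB(X,Y)) checks. A scalar sketch of that equivalence; names are mine:

    // all_of(X[i] == Y[i])  <=>  the OR of all (X[i] - Y[i]) is zero,
    // i.e. a PTESTZ-style "is everything zero" check on the differences.
    #include <cassert>
    #include <cstdint>

    static bool allLanesEqual(const uint32_t *X, const uint32_t *Y, int N) {
      uint32_t OrOfDiffs = 0;
      for (int I = 0; I < N; ++I)
        OrOfDiffs |= X[I] - Y[I]; // wrap-around subtraction per lane
      return OrOfDiffs == 0;
    }

    int main() {
      uint32_t A[4] = {1, 2, 3, 4};
      uint32_t B[4] = {1, 2, 3, 4};
      uint32_t C[4] = {1, 2, 3, 5};
      assert(allLanesEqual(A, B, 4));  // all lanes equal
      assert(!allLanesEqual(A, C, 4)); // one lane differs
      return 0;
    }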
@@ -44162,23 +44571,28 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
// PMOVMSKB(PACKSSBW(LO(X), HI(X)))
// -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
if (CmpBits >= 16 && Subtarget.hasInt256() &&
- VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
- VecOp0.getConstantOperandAPInt(1) == 0 &&
- VecOp1.getConstantOperandAPInt(1) == 8 &&
(IsAnyOf || (SignExt0 && SignExt1))) {
- SDLoc DL(EFLAGS);
- SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
- Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
- unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
- if (!SignExt0 || !SignExt1) {
- assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
- Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
- DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+ if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = peekThroughBitcasts(Src);
+ if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ) {
+ SDValue V = DAG.getNode(ISD::SUB, DL, Result.getValueType(),
+ Result.getOperand(0), Result.getOperand(1));
+ V = DAG.getBitcast(MVT::v4i64, V);
+ return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
+ }
+ Result = DAG.getBitcast(MVT::v32i8, Result);
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
+ if (!SignExt0 || !SignExt1) {
+ assert(IsAnyOf &&
+ "Only perform v16i16 signmasks for any_of patterns");
+ Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+ DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ DAG.getConstant(CmpMask, DL, MVT::i32));
}
- return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
- DAG.getConstant(CmpMask, DL, MVT::i32));
}
}
@@ -44732,7 +45146,8 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
return SDValue();
// Sign bits must extend down to the lowest i16.
- if (DAG.ComputeMinSignedBits(N1) > 16 || DAG.ComputeMinSignedBits(N0) > 16)
+ if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
+ DAG.ComputeMaxSignificantBits(N0) > 16)
return SDValue();
// At least one of the elements must be zero in the upper 17 bits, or can be
@@ -45224,33 +45639,28 @@ static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
// truncation trees that help us avoid lane crossing shuffles.
// TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
// TODO: We don't handle vXf64 shuffles yet.
- if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
- BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- BC0.getOperand(0) == BC1.getOperand(0) &&
- BC0.getOperand(0).getValueType().is256BitVector() &&
- BC0.getConstantOperandAPInt(1) == 0 &&
- BC1.getConstantOperandAPInt(1) ==
- BC0.getValueType().getVectorNumElements()) {
- SmallVector<SDValue> ShuffleOps;
- SmallVector<int> ShuffleMask, ScaledMask;
- SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
- if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
- resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
- // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
- // shuffle to a v4X64 width - we can probably relax this in the future.
- if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
- ShuffleOps[0].getValueType().is256BitVector() &&
- scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
- SDValue Lo, Hi;
- MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
- std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
- Lo = DAG.getBitcast(SrcVT, Lo);
- Hi = DAG.getBitcast(SrcVT, Hi);
- SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
- Res = DAG.getBitcast(ShufVT, Res);
- Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
- return DAG.getBitcast(VT, Res);
+ if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
+ if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
+ SmallVector<SDValue> ShuffleOps;
+ SmallVector<int> ShuffleMask, ScaledMask;
+ SDValue Vec = peekThroughBitcasts(BCSrc);
+ if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
+ resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
+ // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
+ // shuffle to a v4X64 width - we can probably relax this in the future.
+ if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
+ ShuffleOps[0].getValueType().is256BitVector() &&
+ scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
+ SDValue Lo, Hi;
+ MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
+ std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
+ Lo = DAG.getBitcast(SrcVT, Lo);
+ Hi = DAG.getBitcast(SrcVT, Hi);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
+ Res = DAG.getBitcast(ShufVT, Res);
+ Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
+ return DAG.getBitcast(VT, Res);
+ }
}
}
}
@@ -46047,6 +46457,49 @@ static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
}
+// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
+// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
+// handles in InstCombine.
+static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opc = N->getOpcode();
+ assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
+ "Unexpected bit opcode");
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // Both operands must be single use.
+ if (!N0.hasOneUse() || !N1.hasOneUse())
+ return SDValue();
+
+ // Search for matching shifts.
+ SDValue BC0 = peekThroughOneUseBitcasts(N0);
+ SDValue BC1 = peekThroughOneUseBitcasts(N1);
+
+ unsigned BCOpc = BC0.getOpcode();
+ EVT BCVT = BC0.getValueType();
+ if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
+ return SDValue();
+
+ switch (BCOpc) {
+ case X86ISD::VSHLI:
+ case X86ISD::VSRLI:
+ case X86ISD::VSRAI: {
+ if (BC0.getOperand(1) != BC1.getOperand(1))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue BitOp =
+ DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
+ SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
+ return DAG.getBitcast(VT, Shift);
+ }
+ }
+
+ return SDValue();
+}
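Aside (not part of the patch): the fold relies on shifts by a common amount distributing over bitwise ops, since each result bit depends only on the corresponding operand bits; the same reasoning extends to the arithmetic-shift case, which only replicates the sign bit. A scalar sketch of the immediate-shift identities:

    // BITOP(SHIFT(X,S), SHIFT(Y,S)) == SHIFT(BITOP(X,Y), S) for AND/OR/XOR
    // with a shared shift amount, the identity used by the combine above.
    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t X : {0x12345678u, 0xffff0000u, 0u})
        for (uint32_t Y : {0x0f0f0f0fu, 0x80000001u})
          for (unsigned S = 0; S < 32; ++S) {
            assert(((X << S) & (Y << S)) == ((X & Y) << S));
            assert(((X << S) | (Y << S)) == ((X | Y) << S));
            assert(((X << S) ^ (Y << S)) == ((X ^ Y) << S));
            assert(((X >> S) & (Y >> S)) == ((X & Y) >> S));
            assert(((X >> S) | (Y >> S)) == ((X | Y) >> S));
            assert(((X >> S) ^ (Y >> S)) == ((X ^ Y) >> S));
          }
      return 0;
    }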
+
/// If this is a zero/all-bits result that is bitwise-anded with a low bits
/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
@@ -46350,6 +46803,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
return R;
+ if (SDValue R = combineBitOpWithShift(N, DAG))
+ return R;
+
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
return FPLogic;
@@ -46797,6 +47253,9 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
return R;
+ if (SDValue R = combineBitOpWithShift(N, DAG))
+ return R;
+
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
return FPLogic;
@@ -47837,7 +48296,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
- SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, St->getValue());
+ SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
+ St->getValue().getOperand(0));
return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
MVT::v16i8, St->getMemOperand());
}
@@ -48630,7 +49090,7 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
// originally concatenated from subvectors.
SmallVector<SDValue> ConcatOps;
if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
- return SDValue();
+ return SDValue();
}
unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
@@ -48714,7 +49174,7 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
// sequence or using AVX512 truncations. If the inputs are sext/zext then the
// truncations may actually be free by peeking through to the ext source.
auto IsSext = [&DAG](SDValue V) {
- return DAG.ComputeMinSignedBits(V) <= 16;
+ return DAG.ComputeMaxSignificantBits(V) <= 16;
};
auto IsZext = [&DAG](SDValue V) {
return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
@@ -49268,6 +49728,9 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
return R;
+ if (SDValue R = combineBitOpWithShift(N, DAG))
+ return R;
+
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
return FPLogic;
@@ -52185,6 +52648,22 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
unsigned NumOps = Ops.size();
switch (Op0.getOpcode()) {
+ case X86ISD::VBROADCAST: {
+ if (!IsSplat && VT == MVT::v4f64 && llvm::all_of(Ops, [](SDValue Op) {
+ return Op.getOperand(0).getValueType().is128BitVector();
+ }))
+ return DAG.getNode(X86ISD::MOVDDUP, DL, VT,
+ ConcatSubOperand(VT, Ops, 0));
+ break;
+ }
+ case X86ISD::MOVDDUP:
+ case X86ISD::MOVSHDUP:
+ case X86ISD::MOVSLDUP: {
+ if (!IsSplat)
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ ConcatSubOperand(VT, Ops, 0));
+ break;
+ }
case X86ISD::SHUFP: {
// Add SHUFPD support if/when necessary.
if (!IsSplat && VT.getScalarType() == MVT::f32 &&
@@ -52207,14 +52686,21 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
LLVM_FALLTHROUGH;
case X86ISD::VPERMILPI:
- // TODO - add support for vXf64/vXi64 shuffles.
if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
- Subtarget.hasAVX() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
+ Op0.getOperand(1) == Ops[1].getOperand(1)) {
SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
Op0.getOperand(1));
return DAG.getBitcast(VT, Res);
}
+ if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
+ uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
+ uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
+ uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ ConcatSubOperand(VT, Ops, 0),
+ DAG.getTargetConstant(Idx, DL, MVT::i8));
+ }
break;
case X86ISD::VPERMV3:
if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
@@ -52268,6 +52754,9 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
LLVM_FALLTHROUGH;
case X86ISD::VSRAI:
+ case X86ISD::VSHL:
+ case X86ISD::VSRL:
+ case X86ISD::VSRA:
if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
(VT.is512BitVector() && Subtarget.useAVX512Regs() &&
(EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index d1d6e319f16b..3f6d567d3f4d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1540,7 +1540,7 @@ namespace llvm {
unsigned GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG &DAG) const;
- unsigned getAddressSpace(void) const;
+ unsigned getAddressSpace() const;
SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
SDValue &Chain) const;
diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
index 6642f46e64b2..7e751a4c8811 100644
--- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -95,14 +95,45 @@ static bool IsCallReturnTwice(llvm::MachineOperand &MOp) {
return Attrs.hasFnAttr(Attribute::ReturnsTwice);
}
+// Checks whether the function should have an ENDBR in its prologue.
+static bool needsPrologueENDBR(MachineFunction &MF, const Module *M) {
+ Function &F = MF.getFunction();
+
+ if (F.doesNoCfCheck())
+ return false;
+
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF.getTarget());
+ Metadata *IBTSeal = M->getModuleFlag("ibt-seal");
+
+ switch (TM->getCodeModel()) {
+ // Large code model functions are always reachable through indirect calls.
+ case CodeModel::Large:
+ return true;
+ // Only address-taken functions in an LTO'ed kernel are reachable indirectly.
+ // IBTSeal implies LTO, so we only need to check if the function is
+ // address-taken.
+ case CodeModel::Kernel:
+ // Check if ibt-seal was enabled (implies LTO is being used).
+ if (IBTSeal) {
+ return F.hasAddressTaken();
+ }
+ // If !IBTSeal, fall through to the default case.
+ LLVM_FALLTHROUGH;
+ // Address taken or externally linked functions may be reachable.
+ default:
+ return (F.hasAddressTaken() || !F.hasLocalLinkage());
+ }
+}
+
bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &SubTarget = MF.getSubtarget<X86Subtarget>();
+ const Module *M = MF.getMMI().getModule();
// Check that the cf-protection-branch is enabled.
- Metadata *isCFProtectionSupported =
- MF.getMMI().getModule()->getModuleFlag("cf-protection-branch");
- // NB: We need to enable IBT in jitted code if JIT compiler is CET
- // enabled.
+ Metadata *isCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
+
+ // NB: We need to enable IBT in jitted code if the JIT compiler is
+ // CET-enabled.
const X86TargetMachine *TM =
static_cast<const X86TargetMachine *>(&MF.getTarget());
#ifdef __CET__
@@ -119,13 +150,8 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
TII = SubTarget.getInstrInfo();
EndbrOpcode = SubTarget.is64Bit() ? X86::ENDBR64 : X86::ENDBR32;
- // Large code model, non-internal function or function whose address
- // was taken, can be accessed through indirect calls. Mark the first
- // BB with ENDBR instruction unless nocf_check attribute is used.
- if ((TM->getCodeModel() == CodeModel::Large ||
- MF.getFunction().hasAddressTaken() ||
- !MF.getFunction().hasLocalLinkage()) &&
- !MF.getFunction().doesNoCfCheck()) {
+ // If the function is reachable indirectly, mark its first BB with ENDBR.
+ if (needsPrologueENDBR(MF, M)) {
auto MBB = MF.begin();
Changed |= addENDBR(*MBB, MBB->begin());
}
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index ecd4777c3533..bc67d1f89d7f 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -10537,13 +10537,12 @@ def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
string OpcodeStr, Predicate prd> {
-// TODO - Replace WriteMove with WriteVecTrunc?
let Predicates = [prd] in
- defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr, WriteMove>, EVEX_V512;
+ defm Z : cvt_by_vec_width<opc, VTInfo.info512, OpcodeStr, WriteVecMoveZ>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr, WriteMove>, EVEX_V256;
- defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr, WriteMove>, EVEX_V128;
+ defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr, WriteVecMoveY>, EVEX_V256;
+ defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr, WriteVecMoveX>, EVEX_V128;
}
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index c379aa8d9258..4dcd886fa3b2 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4088,8 +4088,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
Register SrcReg, Register SrcReg2,
int64_t ImmMask, int64_t ImmValue,
- const MachineInstr &OI,
- bool *IsSwapped) const {
+ const MachineInstr &OI, bool *IsSwapped,
+ int64_t *ImmDelta) const {
switch (OI.getOpcode()) {
case X86::CMP64rr:
case X86::CMP32rr:
@@ -4140,10 +4140,21 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI,
int64_t OIMask;
int64_t OIValue;
if (analyzeCompare(OI, OISrcReg, OISrcReg2, OIMask, OIValue) &&
- SrcReg == OISrcReg && ImmMask == OIMask && OIValue == ImmValue) {
- assert(SrcReg2 == X86::NoRegister && OISrcReg2 == X86::NoRegister &&
- "should not have 2nd register");
- return true;
+ SrcReg == OISrcReg && ImmMask == OIMask) {
+ if (OIValue == ImmValue) {
+ *ImmDelta = 0;
+ return true;
+ } else if (static_cast<uint64_t>(ImmValue) ==
+ static_cast<uint64_t>(OIValue) - 1) {
+ *ImmDelta = -1;
+ return true;
+ } else if (static_cast<uint64_t>(ImmValue) ==
+ static_cast<uint64_t>(OIValue) + 1) {
+ *ImmDelta = 1;
+ return true;
+ } else {
+ return false;
+ }
}
}
return FlagI.isIdenticalTo(OI);
@@ -4393,6 +4404,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
bool ShouldUpdateCC = false;
bool IsSwapped = false;
X86::CondCode NewCC = X86::COND_INVALID;
+ int64_t ImmDelta = 0;
// Search backward from CmpInstr for the next instruction defining EFLAGS.
const TargetRegisterInfo *TRI = &getRegisterInfo();
@@ -4439,7 +4451,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// ... // EFLAGS not changed
// cmp x, y // <-- can be removed
if (isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask, CmpValue,
- Inst, &IsSwapped)) {
+ Inst, &IsSwapped, &ImmDelta)) {
Sub = &Inst;
break;
}
@@ -4473,7 +4485,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// It is safe to remove CmpInstr if EFLAGS is redefined or killed.
// If we are done with the basic block, we need to check whether EFLAGS is
// live-out.
- bool IsSafe = false;
+ bool FlagsMayLiveOut = true;
SmallVector<std::pair<MachineInstr*, X86::CondCode>, 4> OpsToUpdate;
MachineBasicBlock::iterator AfterCmpInstr =
std::next(MachineBasicBlock::iterator(CmpInstr));
@@ -4483,7 +4495,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// We should check the usage if this instruction uses and updates EFLAGS.
if (!UseEFLAGS && ModifyEFLAGS) {
// It is safe to remove CmpInstr if EFLAGS is updated again.
- IsSafe = true;
+ FlagsMayLiveOut = false;
break;
}
if (!UseEFLAGS && !ModifyEFLAGS)
@@ -4491,7 +4503,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
// EFLAGS is used by this instruction.
X86::CondCode OldCC = X86::COND_INVALID;
- if (MI || IsSwapped) {
+ if (MI || IsSwapped || ImmDelta != 0) {
// We decode the condition code from opcode.
if (Instr.isBranch())
OldCC = X86::getCondFromBranch(Instr);
@@ -4545,9 +4557,59 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
ReplacementCC = getSwappedCondition(OldCC);
if (ReplacementCC == X86::COND_INVALID)
return false;
+ ShouldUpdateCC = true;
+ } else if (ImmDelta != 0) {
+ unsigned BitWidth = TRI->getRegSizeInBits(*MRI->getRegClass(SrcReg));
+ // BitWidth gives the min/max constants used below to make sure the +/-1
+ // adjusted immediate did not wrap around for the compared register size.
+ switch (OldCC) {
+ case X86::COND_L: // x <s (C + 1) --> x <=s C
+ if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
+ return false;
+ ReplacementCC = X86::COND_LE;
+ break;
+ case X86::COND_B: // x <u (C + 1) --> x <=u C
+ if (ImmDelta != 1 || CmpValue == 0)
+ return false;
+ ReplacementCC = X86::COND_BE;
+ break;
+ case X86::COND_GE: // x >=s (C + 1) --> x >s C
+ if (ImmDelta != 1 || APInt::getSignedMinValue(BitWidth) == CmpValue)
+ return false;
+ ReplacementCC = X86::COND_G;
+ break;
+ case X86::COND_AE: // x >=u (C + 1) --> x >u C
+ if (ImmDelta != 1 || CmpValue == 0)
+ return false;
+ ReplacementCC = X86::COND_A;
+ break;
+ case X86::COND_G: // x >s (C - 1) --> x >=s C
+ if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
+ return false;
+ ReplacementCC = X86::COND_GE;
+ break;
+ case X86::COND_A: // x >u (C - 1) --> x >=u C
+ if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
+ return false;
+ ReplacementCC = X86::COND_AE;
+ break;
+ case X86::COND_LE: // x <=s (C - 1) --> x <s C
+ if (ImmDelta != -1 || APInt::getSignedMaxValue(BitWidth) == CmpValue)
+ return false;
+ ReplacementCC = X86::COND_L;
+ break;
+ case X86::COND_BE: // x <=u (C - 1) --> x <u C
+ if (ImmDelta != -1 || APInt::getMaxValue(BitWidth) == CmpValue)
+ return false;
+ ReplacementCC = X86::COND_B;
+ break;
+ default:
+ return false;
+ }
+ ShouldUpdateCC = true;
}
- if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) {
+ if (ShouldUpdateCC && ReplacementCC != OldCC) {
// Push the MachineInstr to OpsToUpdate.
// If it is safe to remove CmpInstr, the condition code of these
// instructions will be modified.
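Aside (not part of the patch): the table above applies textbook compare rewrites such as x <s (C + 1) --> x <=s C, which hold only when the +/-1 adjustment did not wrap, hence the signed-min/max and zero/unsigned-max guards. A scalar sketch, exhaustively checked over the 8-bit range:

    // x <s (C + 1) <=> x <=s C, and x <u (C + 1) <=> x <=u C, as long as the
    // +1 does not wrap.
    #include <cassert>
    #include <cstdint>

    int main() {
      for (int X = INT8_MIN; X <= INT8_MAX; ++X)
        for (int C = INT8_MIN; C < INT8_MAX; ++C) // C + 1 stays in range
          assert((X < C + 1) == (X <= C));
      for (int X = 0; X <= UINT8_MAX; ++X)
        for (int C = 0; C < UINT8_MAX; ++C)       // C + 1 stays in range
          assert((X < C + 1) == (X <= C));
      // For C == INT8_MAX (signed) or C == UINT8_MAX (unsigned) the adjusted
      // immediate would wrap and the equivalence breaks, which is exactly
      // what the min/max guards in the switch above reject.
      return 0;
    }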
@@ -4555,14 +4617,14 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
}
if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
// It is safe to remove CmpInstr if EFLAGS is updated again or killed.
- IsSafe = true;
+ FlagsMayLiveOut = false;
break;
}
}
- // If EFLAGS is not killed nor re-defined, we should check whether it is
- // live-out. If it is live-out, do not optimize.
- if ((MI || IsSwapped) && !IsSafe) {
+ // If we have to update users but EFLAGS is live-out, abort, since we
+ // cannot easily find all of the users.
+ if ((MI != nullptr || ShouldUpdateCC) && FlagsMayLiveOut) {
for (MachineBasicBlock *Successor : CmpMBB.successors())
if (Successor->isLiveIn(X86::EFLAGS))
return false;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 537ada6222bf..33ce55bbdb2b 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -643,7 +643,8 @@ private:
/// CMP %1, %2 and %3 = SUB %2, %1 ; IsSwapped=true
bool isRedundantFlagInstr(const MachineInstr &FlagI, Register SrcReg,
Register SrcReg2, int64_t ImmMask, int64_t ImmValue,
- const MachineInstr &OI, bool *IsSwapped) const;
+ const MachineInstr &OI, bool *IsSwapped,
+ int64_t *ImmDelta) const;
};
} // namespace llvm
diff --git a/llvm/lib/Target/X86/X86InstructionSelector.cpp b/llvm/lib/Target/X86/X86InstructionSelector.cpp
index 8abbaa92c8cf..28d57ca9ae3c 100644
--- a/llvm/lib/Target/X86/X86InstructionSelector.cpp
+++ b/llvm/lib/Target/X86/X86InstructionSelector.cpp
@@ -153,8 +153,8 @@ private:
X86InstructionSelector::X86InstructionSelector(const X86TargetMachine &TM,
const X86Subtarget &STI,
const X86RegisterBankInfo &RBI)
- : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
- TRI(*STI.getRegisterInfo()), RBI(RBI),
+ : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
+ RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "X86GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index 6967a96ce83b..d0562214a025 100644
--- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -610,7 +610,7 @@ MachineInstr *X86OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
auto replaceOldReg = [OldReg, NewReg](const MachineOperand &Op) {
if (Op.isReg() && Op.getReg() == OldReg)
return MachineOperand::CreateReg(NewReg, false, false, false, false,
- false, false, false, false, 0,
+ false, false, false, false, false,
/*IsRenamable*/ true);
return Op;
};
diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp
index 47ae517ae76d..e92b1b002bb0 100644
--- a/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -129,10 +129,9 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
bool MadeChange = false;
// Pad the identified basic blocks with NOOPs
- for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
- I != ReturnBBs.end(); ++I) {
- MachineBasicBlock *MBB = I->first;
- unsigned Cycles = I->second;
+ for (const auto &ReturnBB : ReturnBBs) {
+ MachineBasicBlock *MBB = ReturnBB.first;
+ unsigned Cycles = ReturnBB.second;
// Function::hasOptSize is already checked above.
bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
index babd923e7496..4342ac089cae 100644
--- a/llvm/lib/Target/X86/X86PartialReduction.cpp
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -13,15 +13,16 @@
//===----------------------------------------------------------------------===//
#include "X86.h"
+#include "X86TargetMachine.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsX86.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Operator.h"
#include "llvm/Pass.h"
-#include "X86TargetMachine.h"
+#include "llvm/Support/KnownBits.h"
using namespace llvm;
@@ -49,7 +50,7 @@ public:
}
private:
- bool tryMAddReplacement(Instruction *Op);
+ bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB);
bool trySADReplacement(Instruction *Op);
};
}
@@ -63,7 +64,43 @@ char X86PartialReduction::ID = 0;
INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
"X86 Partial Reduction", false, false)
-bool X86PartialReduction::tryMAddReplacement(Instruction *Op) {
+// This function should be kept in sync with detectExtMul() in X86ISelLowering.cpp.
+static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul,
+ const DataLayout *DL) {
+ if (!ST->hasVNNI() && !ST->hasAVXVNNI())
+ return false;
+
+ Value *LHS = Mul->getOperand(0);
+ Value *RHS = Mul->getOperand(1);
+
+ if (isa<SExtInst>(LHS))
+ std::swap(LHS, RHS);
+
+ auto IsFreeTruncation = [&](Value *Op) {
+ if (auto *Cast = dyn_cast<CastInst>(Op)) {
+ if (Cast->getParent() == Mul->getParent() &&
+ (Cast->getOpcode() == Instruction::SExt ||
+ Cast->getOpcode() == Instruction::ZExt) &&
+ Cast->getOperand(0)->getType()->getScalarSizeInBits() <= 8)
+ return true;
+ }
+
+ return isa<Constant>(Op);
+ };
+
+ // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
+ // value, we need to check that LHS is a zero-extended value. RHS should be a
+ // signed value, so we just check its sign bits.
+ if ((IsFreeTruncation(LHS) &&
+ computeKnownBits(LHS, *DL).countMaxActiveBits() <= 8) &&
+ (IsFreeTruncation(RHS) && ComputeMaxSignificantBits(RHS, *DL) <= 8))
+ return true;
+
+ return false;
+}
+
+bool X86PartialReduction::tryMAddReplacement(Instruction *Op,
+ bool ReduceInOneBB) {
if (!ST->hasSSE2())
return false;
@@ -82,6 +119,13 @@ bool X86PartialReduction::tryMAddReplacement(Instruction *Op) {
Value *LHS = Mul->getOperand(0);
Value *RHS = Mul->getOperand(1);
+ // If the target supports VNNI, leave it to ISel to combine the reduction
+ // into a VNNI instruction.
+ // TODO: we could also transform cross-block reductions to the VNNI
+ // intrinsic in this pass.
+ if (ReduceInOneBB && matchVPDPBUSDPattern(ST, Mul, DL))
+ return false;
+
// LHS and RHS should be only used once or if they are the same then only
// used twice. Only check this when SSE4.1 is enabled and we have zext/sext
// instructions, otherwise we use punpck to emulate zero extend in stages. The
@@ -300,7 +344,9 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) {
// Walk backwards from the ExtractElementInst and determine if it is the end of
// a horizontal reduction. Return the input to the reduction if we find one.
-static Value *matchAddReduction(const ExtractElementInst &EE) {
+static Value *matchAddReduction(const ExtractElementInst &EE,
+ bool &ReduceInOneBB) {
+ ReduceInOneBB = true;
// Make sure we're extracting index 0.
auto *Index = dyn_cast<ConstantInt>(EE.getIndexOperand());
if (!Index || !Index->isNullValue())
@@ -309,6 +355,8 @@ static Value *matchAddReduction(const ExtractElementInst &EE) {
const auto *BO = dyn_cast<BinaryOperator>(EE.getVectorOperand());
if (!BO || BO->getOpcode() != Instruction::Add || !BO->hasOneUse())
return nullptr;
+ if (EE.getParent() != BO->getParent())
+ ReduceInOneBB = false;
unsigned NumElems = cast<FixedVectorType>(BO->getType())->getNumElements();
// Ensure the reduction size is a power of 2.
@@ -321,6 +369,8 @@ static Value *matchAddReduction(const ExtractElementInst &EE) {
const auto *BO = dyn_cast<BinaryOperator>(Op);
if (!BO || BO->getOpcode() != Instruction::Add)
return nullptr;
+ if (EE.getParent() != BO->getParent())
+ ReduceInOneBB = false;
// If this isn't the first add, then it should only have 2 users, the
// shuffle and another add which we checked in the previous iteration.
@@ -460,9 +510,10 @@ bool X86PartialReduction::runOnFunction(Function &F) {
if (!EE)
continue;
+ bool ReduceInOneBB;
// First find a reduction tree.
// FIXME: Do we need to handle other opcodes than Add?
- Value *Root = matchAddReduction(*EE);
+ Value *Root = matchAddReduction(*EE, ReduceInOneBB);
if (!Root)
continue;
@@ -470,7 +521,7 @@ bool X86PartialReduction::runOnFunction(Function &F) {
collectLeaves(Root, Leaves);
for (Instruction *I : Leaves) {
- if (tryMAddReplacement(I)) {
+ if (tryMAddReplacement(I, ReduceInOneBB)) {
MadeChange = true;
continue;
}
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index a6ff472aac6f..8e317dc22bd6 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -255,6 +255,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5
defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>;
+defm : X86WriteResUnsupported<WriteFMoveZ>;
defm : X86WriteRes<WriteEMMS, [BWPort01,BWPort15,BWPort015,BWPort0156], 31, [8,1,21,1], 31>;
defm : BWWriteResPair<WriteFAdd, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub.
@@ -418,6 +419,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15],
defm : X86WriteRes<WriteVecMove, [BWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [BWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [BWPort015], 1, [1], 1>;
+defm : X86WriteResUnsupported<WriteVecMoveZ>;
defm : X86WriteRes<WriteVecMoveToGpr, [BWPort0], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [BWPort5], 1, [1], 1>;
@@ -1741,4 +1743,40 @@ def BWSETA_SETBErm : SchedWriteVariant<[
def : InstRW<[BWSETA_SETBErr], (instrs SETCCr)>;
def : InstRW<[BWSETA_SETBErm], (instrs SETCCm)>;
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
+
+ // SSE Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr,
+
+ // int variants.
+ PXORrr,
+ PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+ PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX Zero-idioms.
+ DepBreakingClass<[
+ // xmm fp variants.
+ VXORPSrr, VXORPDrr,
+
+ // xmm int variants.
+ VPXORrr,
+ VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+
+ // ymm variants.
+ VXORPSYrr, VXORPDYrr, VPXORYrr,
+ VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+ VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
+ ], ZeroIdiomPredicate>,
+]>;
+
} // SchedModel
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 371a9571ae39..1cd0b3379684 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -257,6 +257,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5
defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>;
+defm : X86WriteResUnsupported<WriteFMoveZ>;
defm : X86WriteRes<WriteEMMS, [HWPort01,HWPort15,HWPort015,HWPort0156], 31, [8,1,21,1], 31>;
defm : HWWriteResPair<WriteFAdd, [HWPort1], 3, [1], 1, 5>;
@@ -416,6 +417,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15],
defm : X86WriteRes<WriteVecMove, [HWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [HWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [HWPort015], 1, [1], 1>;
+defm : X86WriteResUnsupported<WriteVecMoveZ>;
defm : X86WriteRes<WriteVecMoveToGpr, [HWPort0], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [HWPort5], 1, [1], 1>;
@@ -2030,4 +2032,40 @@ def HWSETA_SETBErm : SchedWriteVariant<[
def : InstRW<[HWSETA_SETBErr], (instrs SETCCr)>;
def : InstRW<[HWSETA_SETBErm], (instrs SETCCm)>;
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
+
+ // SSE Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr,
+
+ // int variants.
+ PXORrr,
+ PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+ PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX Zero-idioms.
+ DepBreakingClass<[
+ // xmm fp variants.
+ VXORPSrr, VXORPDrr,
+
+ // xmm int variants.
+ VPXORrr,
+ VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+
+ // ymm variants.
+ VXORPSYrr, VXORPDYrr, VPXORYrr,
+ VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+ VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr
+ ], ZeroIdiomPredicate>,
+]>;
+
} // SchedModel
diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td
index 789de9eb5751..9fd986e34181 100644
--- a/llvm/lib/Target/X86/X86SchedIceLake.td
+++ b/llvm/lib/Target/X86/X86SchedIceLake.td
@@ -252,6 +252,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [ICXPort237,ICXPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMove, [ICXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [ICXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [ICXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveZ, [ICXPort05], 1, [1], 1>;
defm : X86WriteRes<WriteEMMS, [ICXPort05,ICXPort0156], 10, [9,1], 10>;
defm : ICXWriteResPair<WriteFAdd, [ICXPort01], 4, [1], 1, 5>; // Floating point add/sub.
@@ -367,6 +368,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [ICXPort237,ICXPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteVecMove, [ICXPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [ICXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [ICXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveZ, [ICXPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveToGpr, [ICXPort0], 2, [1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [ICXPort5], 1, [1], 1>;
@@ -2630,4 +2632,48 @@ def ICXSETA_SETBErm : SchedWriteVariant<[
def : InstRW<[ICXSETA_SETBErr], (instrs SETCCr)>;
def : InstRW<[ICXSETA_SETBErm], (instrs SETCCm)>;
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
+
+ // SSE Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr,
+
+ // int variants.
+ PXORrr,
+ PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+ PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX Zero-idioms.
+ DepBreakingClass<[
+ // xmm fp variants.
+ VXORPSrr, VXORPDrr,
+
+ // xmm int variants.
+ VPXORrr,
+ VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+
+ // ymm variants.
+ VXORPSYrr, VXORPDYrr, VPXORYrr,
+ VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+ VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr,
+
+ // zmm variants.
+ VXORPSZrr, VXORPDZrr, VPXORDZrr, VPXORQZrr,
+ VXORPSZ128rr, VXORPDZ128rr, VPXORDZ128rr, VPXORQZ128rr,
+ VXORPSZ256rr, VXORPDZ256rr, VPXORDZ256rr, VPXORQZ256rr,
+ VPSUBBZrr, VPSUBWZrr, VPSUBDZrr, VPSUBQZrr,
+ VPSUBBZ128rr, VPSUBWZ128rr, VPSUBDZ128rr, VPSUBQZ128rr,
+ VPSUBBZ256rr, VPSUBWZ256rr, VPSUBDZ256rr, VPSUBQZ256rr,
+ ], ZeroIdiomPredicate>,
+]>;
+
} // SchedModel
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index af5c0540deb5..7e619a3a8722 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -223,6 +223,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1]
defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveZ, [SBPort5], 1, [1], 1>;
defm : X86WriteRes<WriteEMMS, [SBPort015], 31, [31], 31>;
defm : SBWriteResPair<WriteFAdd, [SBPort1], 3, [1], 1, 6>;
@@ -380,6 +381,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,
defm : X86WriteRes<WriteVecMove, [SBPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [SBPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [SBPort05], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveZ, [SBPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveToGpr, [SBPort0], 2, [1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [SBPort5], 1, [1], 1>;
@@ -1230,4 +1232,35 @@ def SBSETA_SETBErm : SchedWriteVariant<[
def : InstRW<[SBSETA_SETBErr], (instrs SETCCr)>;
def : InstRW<[SBSETA_SETBErm], (instrs SETCCm)>;
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
+
+ // SSE Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr,
+
+ // int variants.
+ PXORrr,
+ PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+ PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX Zero-idioms.
+ DepBreakingClass<[
+ // xmm fp variants.
+ VXORPSrr, VXORPDrr,
+
+ // xmm int variants.
+ VPXORrr,
+ VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+ ], ZeroIdiomPredicate>,
+]>;
+
} // SchedModel
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index b3c13c72dd01..0a88bac5aa66 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -244,6 +244,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>;
+defm : X86WriteResUnsupported<WriteFMoveZ>;
defm : X86WriteRes<WriteEMMS, [SKLPort05,SKLPort0156], 10, [9,1], 10>;
defm : SKLWriteResPair<WriteFAdd, [SKLPort01], 4, [1], 1, 5>; // Floating point add/sub.
@@ -359,6 +360,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteVecMove, [SKLPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [SKLPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [SKLPort015], 1, [1], 1>;
+defm : X86WriteResUnsupported<WriteVecMoveZ>;
defm : X86WriteRes<WriteVecMoveToGpr, [SKLPort0], 2, [1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [SKLPort5], 1, [1], 1>;
@@ -1901,4 +1903,40 @@ def SKLSETA_SETBErm : SchedWriteVariant<[
def : InstRW<[SKLSETA_SETBErr], (instrs SETCCr)>;
def : InstRW<[SKLSETA_SETBErm], (instrs SETCCm)>;
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
+
+ // SSE Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr,
+
+ // int variants.
+ PXORrr,
+ PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+ PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX Zero-idioms.
+ DepBreakingClass<[
+ // xmm fp variants.
+ VXORPSrr, VXORPDrr,
+
+ // xmm int variants.
+ VPXORrr,
+ VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+
+ // ymm variants.
+ VXORPSYrr, VXORPDYrr, VPXORYrr,
+ VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+ VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
+ ], ZeroIdiomPredicate>,
+]>;
+
} // SchedModel
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 74f9da158353..b28a18f0dcd7 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -244,6 +244,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveZ, [SKXPort05], 1, [1], 1>;
defm : X86WriteRes<WriteEMMS, [SKXPort05,SKXPort0156], 10, [9,1], 10>;
defm : SKXWriteResPair<WriteFAdd, [SKXPort01], 4, [1], 1, 5>; // Floating point add/sub.
@@ -359,6 +360,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteVecMove, [SKXPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [SKXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [SKXPort015], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveZ, [SKXPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveToGpr, [SKXPort0], 2, [1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [SKXPort5], 1, [1], 1>;
@@ -2613,4 +2615,48 @@ def SKXSETA_SETBErm : SchedWriteVariant<[
def : InstRW<[SKXSETA_SETBErr], (instrs SETCCr)>;
def : InstRW<[SKXSETA_SETBErm], (instrs SETCCm)>;
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[ SUB32rr, SUB64rr, XOR32rr, XOR64rr ], ZeroIdiomPredicate>,
+
+ // SSE Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr,
+
+ // int variants.
+ PXORrr,
+ PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+ PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX Zero-idioms.
+ DepBreakingClass<[
+ // xmm fp variants.
+ VXORPSrr, VXORPDrr,
+
+ // xmm int variants.
+ VPXORrr,
+ VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr,
+
+ // ymm variants.
+ VXORPSYrr, VXORPDYrr, VPXORYrr,
+ VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+ VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr,
+
+ // zmm variants.
+ VXORPSZrr, VXORPDZrr, VPXORDZrr, VPXORQZrr,
+ VXORPSZ128rr, VXORPDZ128rr, VPXORDZ128rr, VPXORQZ128rr,
+ VXORPSZ256rr, VXORPDZ256rr, VPXORDZ256rr, VPXORQZ256rr,
+ VPSUBBZrr, VPSUBWZrr, VPSUBDZrr, VPSUBQZrr,
+ VPSUBBZ128rr, VPSUBWZ128rr, VPSUBDZ128rr, VPSUBQZ128rr,
+ VPSUBBZ256rr, VPSUBWZ256rr, VPSUBDZ256rr, VPSUBQZ256rr,
+ ], ZeroIdiomPredicate>,
+]>;
+
} // SchedModel
diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td
index 1cb48175260a..d57e14715a4e 100644
--- a/llvm/lib/Target/X86/X86Schedule.td
+++ b/llvm/lib/Target/X86/X86Schedule.td
@@ -239,6 +239,7 @@ def WriteFMaskedStore64Y : SchedWrite;
def WriteFMove : SchedWrite;
def WriteFMoveX : SchedWrite;
def WriteFMoveY : SchedWrite;
+def WriteFMoveZ : SchedWrite;
defm WriteFAdd : X86SchedWritePair<ReadAfterVecLd>; // Floating point add/sub.
defm WriteFAddX : X86SchedWritePair<ReadAfterVecXLd>; // Floating point add/sub (XMM).
@@ -354,6 +355,7 @@ def WriteVecMaskedStore64Y : SchedWrite;
def WriteVecMove : SchedWrite;
def WriteVecMoveX : SchedWrite;
def WriteVecMoveY : SchedWrite;
+def WriteVecMoveZ : SchedWrite;
def WriteVecMoveToGpr : SchedWrite;
def WriteVecMoveFromGpr : SchedWrite;
@@ -516,9 +518,11 @@ def WriteFMoveLSX
: X86SchedWriteMoveLS<WriteFMoveX, WriteFLoadX, WriteFStoreX>;
def WriteFMoveLSY
: X86SchedWriteMoveLS<WriteFMoveY, WriteFLoadY, WriteFStoreY>;
+def WriteFMoveLSZ
+ : X86SchedWriteMoveLS<WriteFMoveZ, WriteFLoadY, WriteFStoreY>;
def SchedWriteFMoveLS
: X86SchedWriteMoveLSWidths<WriteFMoveLS, WriteFMoveLSX,
- WriteFMoveLSY, WriteFMoveLSY>;
+ WriteFMoveLSY, WriteFMoveLSZ>;
def WriteFMoveLSNT
: X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStoreNT>;
@@ -536,9 +540,11 @@ def WriteVecMoveLSX
: X86SchedWriteMoveLS<WriteVecMoveX, WriteVecLoadX, WriteVecStoreX>;
def WriteVecMoveLSY
: X86SchedWriteMoveLS<WriteVecMoveY, WriteVecLoadY, WriteVecStoreY>;
+def WriteVecMoveLSZ
+ : X86SchedWriteMoveLS<WriteVecMoveZ, WriteVecLoadY, WriteVecStoreY>;
def SchedWriteVecMoveLS
: X86SchedWriteMoveLSWidths<WriteVecMoveLS, WriteVecMoveLSX,
- WriteVecMoveLSY, WriteVecMoveLSY>;
+ WriteVecMoveLSY, WriteVecMoveLSZ>;
def WriteVecMoveLSNT
: X86SchedWriteMoveLS<WriteVecMove, WriteVecLoadNT, WriteVecStoreNT>;
diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td
index 0fedfc01092c..8ae8e574f87a 100644
--- a/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -229,6 +229,7 @@ defm : X86WriteResUnsupported<WriteFMaskedStore64Y>;
def : WriteRes<WriteFMove, [AtomPort01]>;
def : WriteRes<WriteFMoveX, [AtomPort01]>;
defm : X86WriteResUnsupported<WriteFMoveY>;
+defm : X86WriteResUnsupported<WriteFMoveZ>;
defm : X86WriteRes<WriteEMMS, [AtomPort01], 5, [5], 1>;
@@ -382,6 +383,7 @@ defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
def : WriteRes<WriteVecMove, [AtomPort0]>;
def : WriteRes<WriteVecMoveX, [AtomPort01]>;
defm : X86WriteResUnsupported<WriteVecMoveY>;
+defm : X86WriteResUnsupported<WriteVecMoveZ>;
defm : X86WriteRes<WriteVecMoveToGpr, [AtomPort0], 3, [3], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [AtomPort0], 1, [1], 1>;
diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
index 0f6f24f9f1fe..cb75c3660728 100644
--- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -772,6 +772,7 @@ defm : PdWriteRes<WriteFMaskedStore64Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2
defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>;
defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>;
defm : PdWriteRes<WriteFMoveY, [PdFPU01, PdFPFMA], 2, [2, 2], 2>;
+defm : X86WriteResUnsupported<WriteFMoveZ>;
defm : PdWriteRes<WriteEMMS, [PdFPU01, PdFPFMA], 2>;
@@ -1107,6 +1108,7 @@ defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
defm : PdWriteRes<WriteVecMove, [PdFPU01, PdFPMAL], 2>;
defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 1, [1, 2]>;
defm : PdWriteRes<WriteVecMoveY, [PdFPU01, PdFPMAL], 2, [2, 2], 2>;
+defm : X86WriteResUnsupported<WriteVecMoveZ>;
def PdWriteMOVDQArr : SchedWriteRes<[PdFPU01, PdFPMAL]> {
}
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index a070da34cab5..4b2fa87a25b5 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -525,6 +525,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU
defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveY, [JFPU01, JFPX], 1, [2, 2], 2>;
+defm : X86WriteResUnsupported<WriteFMoveZ>;
defm : X86WriteRes<WriteEMMS, [JFPU01, JFPX], 2, [1, 1], 1>;
@@ -682,6 +683,7 @@ defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
defm : X86WriteRes<WriteVecMove, [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveX, [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveY, [JFPU01, JVALU], 1, [2, 2], 2>;
+defm : X86WriteResUnsupported<WriteVecMoveZ>;
defm : X86WriteRes<WriteVecMoveToGpr, [JFPU0, JFPA, JALU0], 4, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [JFPU01, JFPX], 8, [1, 1], 2>;
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
index 36e5b55a4194..52605c031617 100644
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -200,6 +200,7 @@ def : WriteRes<WriteFMaskedStore64Y, [SLM_MEC_RSV]>;
def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>;
def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>;
def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>;
+defm : X86WriteResUnsupported<WriteFMoveZ>;
defm : X86WriteRes<WriteEMMS, [SLM_FPC_RSV01], 10, [10], 9>;
defm : SLMWriteResPair<WriteFAdd, [SLM_FPC_RSV1], 3>;
@@ -345,6 +346,7 @@ def : WriteRes<WriteVecMaskedStore64Y, [SLM_MEC_RSV]>;
def : WriteRes<WriteVecMove, [SLM_FPC_RSV01]>;
def : WriteRes<WriteVecMoveX, [SLM_FPC_RSV01]>;
def : WriteRes<WriteVecMoveY, [SLM_FPC_RSV01]>;
+defm : X86WriteResUnsupported<WriteVecMoveZ>;
def : WriteRes<WriteVecMoveToGpr, [SLM_IEC_RSV01]>;
def : WriteRes<WriteVecMoveFromGpr, [SLM_IEC_RSV01]>;
@@ -480,4 +482,22 @@ def: InstRW<[SLMWriteResGroup1rm], (instrs MMX_PADDQrm, PADDQrm,
MMX_PSUBQrm, PSUBQrm,
PCMPEQQrm)>;
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[ XOR32rr ], ZeroIdiomPredicate>,
+
+ // SSE Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr,
+
+ // int variants.
+ PXORrr,
+ ], ZeroIdiomPredicate>,
+]>;
+
} // SchedModel
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
index 4343e1ed45d1..fe0484afd227 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -286,6 +286,7 @@ defm : X86WriteRes<WriteFMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>;
+defm : X86WriteResUnsupported<WriteFMoveZ>;
defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>;
defm : ZnWriteResFpuPair<WriteFAddX, [ZnFPU0], 3>;
@@ -404,6 +405,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
defm : X86WriteRes<WriteVecMove, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [ZnFPU], 2, [1], 2>;
+defm : X86WriteResUnsupported<WriteVecMoveZ>;
defm : X86WriteRes<WriteVecMoveToGpr, [ZnFPU2], 2, [1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [ZnFPU2], 3, [1], 1>;
defm : X86WriteRes<WriteEMMS, [ZnFPU], 2, [1], 1>;
@@ -1541,4 +1543,83 @@ def : InstRW<[WriteMicrocoded], (instrs VZEROUPPER)>;
// VZEROALL.
def : InstRW<[WriteMicrocoded], (instrs VZEROALL)>;
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[
+ SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr
+ ], ZeroIdiomPredicate>,
+
+ // MMX Zero-idioms.
+ DepBreakingClass<[
+ MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr,
+ MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr,
+ MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr,
+ MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr
+ ], ZeroIdiomPredicate>,
+
+ // SSE Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
+
+ // int variants.
+ PXORrr, PANDNrr,
+ PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+ PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX XMM Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
+
+ // int variants.
+ VPXORrr, VPANDNrr,
+ VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX YMM Zero-idioms.
+ DepBreakingClass<[
+ // fp variants
+ VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr,
+
+ // int variants
+ VPXORYrr, VPANDNYrr,
+ VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+ VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
+ ], ZeroIdiomPredicate>
+]>;
+
+def : IsDepBreakingFunction<[
+ // GPR
+ DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
+ DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
+
+ // MMX
+ DepBreakingClass<[
+ MMX_PCMPEQBrr, MMX_PCMPEQWrr, MMX_PCMPEQDrr
+ ], ZeroIdiomPredicate>,
+
+ // SSE
+ DepBreakingClass<[
+ PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX XMM
+ DepBreakingClass<[
+ VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX YMM
+ DepBreakingClass<[
+ VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
+ ], ZeroIdiomPredicate>,
+]>;
+
} // SchedModel
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td
index 96d2837880c7..38908a987595 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver2.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td
@@ -274,6 +274,7 @@ defm : X86WriteRes<WriteFStoreNTY, [Zn2AGU], 1, [1], 1>;
defm : X86WriteRes<WriteFMove, [Zn2FPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [Zn2FPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [Zn2FPU], 1, [1], 1>;
+defm : X86WriteResUnsupported<WriteFMoveZ>;
defm : Zn2WriteResFpuPair<WriteFAdd, [Zn2FPU0], 3>;
defm : Zn2WriteResFpuPair<WriteFAddX, [Zn2FPU0], 3>;
@@ -388,6 +389,7 @@ defm : X86WriteRes<WriteVecMaskedStore64Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
defm : X86WriteRes<WriteVecMove, [Zn2FPU], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [Zn2FPU], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [Zn2FPU], 2, [1], 2>;
+defm : X86WriteResUnsupported<WriteVecMoveZ>;
defm : X86WriteRes<WriteVecMoveToGpr, [Zn2FPU2], 2, [1], 1>;
defm : X86WriteRes<WriteVecMoveFromGpr, [Zn2FPU2], 3, [1], 1>;
defm : X86WriteRes<WriteEMMS, [Zn2FPU], 2, [1], 1>;
@@ -1530,4 +1532,83 @@ def : InstRW<[WriteALU], (instrs VZEROUPPER)>;
// VZEROALL.
def : InstRW<[WriteMicrocoded], (instrs VZEROALL)>;
+///////////////////////////////////////////////////////////////////////////////
+// Dependency breaking instructions.
+///////////////////////////////////////////////////////////////////////////////
+
+def : IsZeroIdiomFunction<[
+ // GPR Zero-idioms.
+ DepBreakingClass<[
+ SUB32rr, SUB64rr,
+ XOR32rr, XOR64rr
+ ], ZeroIdiomPredicate>,
+
+ // MMX Zero-idioms.
+ DepBreakingClass<[
+ MMX_PXORrr, MMX_PANDNrr, MMX_PSUBBrr,
+ MMX_PSUBDrr, MMX_PSUBQrr, MMX_PSUBWrr,
+ MMX_PSUBSBrr, MMX_PSUBSWrr, MMX_PSUBUSBrr, MMX_PSUBUSWrr,
+ MMX_PCMPGTBrr, MMX_PCMPGTDrr, MMX_PCMPGTWrr
+ ], ZeroIdiomPredicate>,
+
+ // SSE Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ XORPSrr, XORPDrr, ANDNPSrr, ANDNPDrr,
+
+ // int variants.
+ PXORrr, PANDNrr,
+ PSUBBrr, PSUBWrr, PSUBDrr, PSUBQrr,
+ PCMPGTBrr, PCMPGTDrr, PCMPGTQrr, PCMPGTWrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX XMM Zero-idioms.
+ DepBreakingClass<[
+ // fp variants.
+ VXORPSrr, VXORPDrr, VANDNPSrr, VANDNPDrr,
+
+ // int variants.
+ VPXORrr, VPANDNrr,
+ VPSUBBrr, VPSUBWrr, VPSUBDrr, VPSUBQrr,
+ VPCMPGTBrr, VPCMPGTWrr, VPCMPGTDrr, VPCMPGTQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX YMM Zero-idioms.
+ DepBreakingClass<[
+ // fp variants
+ VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr,
+
+ // int variants
+ VPXORYrr, VPANDNYrr,
+ VPSUBBYrr, VPSUBWYrr, VPSUBDYrr, VPSUBQYrr,
+ VPCMPGTBYrr, VPCMPGTWYrr, VPCMPGTDYrr, VPCMPGTQYrr
+ ], ZeroIdiomPredicate>
+]>;
+
+def : IsDepBreakingFunction<[
+ // GPR
+ DepBreakingClass<[ SBB32rr, SBB64rr ], ZeroIdiomPredicate>,
+ DepBreakingClass<[ CMP32rr, CMP64rr ], CheckSameRegOperand<0, 1> >,
+
+ // MMX
+ DepBreakingClass<[
+ MMX_PCMPEQBrr, MMX_PCMPEQWrr, MMX_PCMPEQDrr
+ ], ZeroIdiomPredicate>,
+
+ // SSE
+ DepBreakingClass<[
+ PCMPEQBrr, PCMPEQWrr, PCMPEQDrr, PCMPEQQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX XMM
+ DepBreakingClass<[
+ VPCMPEQBrr, VPCMPEQWrr, VPCMPEQDrr, VPCMPEQQrr
+ ], ZeroIdiomPredicate>,
+
+ // AVX YMM
+ DepBreakingClass<[
+ VPCMPEQBYrr, VPCMPEQWYrr, VPCMPEQDYrr, VPCMPEQQYrr
+ ], ZeroIdiomPredicate>,
+]>;
+
} // SchedModel
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver3.td b/llvm/lib/Target/X86/X86ScheduleZnver3.td
index f4e03ac11f0b..02f7f8376fdb 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver3.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver3.td
@@ -1446,10 +1446,12 @@ defm : Zn3WriteResInt<WriteXCHG, [Zn3ALU0123], 0, [8], 2>; // Compare+Exc
defm : Zn3WriteResXMM<WriteFMove, [Zn3FPVMisc0123], 1, [1], 1>; // Empty sched class
defm : Zn3WriteResXMM<WriteFMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteFMoveY, [], 0, [], 1>;
+defm : X86WriteResUnsupported<WriteFMoveZ>;
defm : Zn3WriteResXMM<WriteVecMove, [Zn3FPFMisc0123], 1, [1], 1>; // MMX
defm : Zn3WriteResXMM<WriteVecMoveX, [], 0, [], 1>;
defm : Zn3WriteResYMM<WriteVecMoveY, [], 0, [], 1>;
+defm : X86WriteResUnsupported<WriteVecMoveZ>;
def : IsOptimizableRegisterMove<[
InstructionEquivalenceClass<[
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index 83a4a025f518..dba11e8b4000 100644
--- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -1139,7 +1139,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
// branch back to itself. We can do this here because at this point, every
// predecessor of this block has an available value. This is basically just
// automating the construction of a PHI node for this target.
- unsigned TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB);
+ Register TargetReg = TargetAddrSSA.GetValueInMiddleOfBlock(&MBB);
// Insert a comparison of the incoming target register with this block's
// address. This also requires us to mark the block as having its address
@@ -1642,7 +1642,7 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
return;
// Compute the current predicate state.
- unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+ Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
auto InsertPt = MI.getIterator();
@@ -1913,7 +1913,7 @@ unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
auto *RC = MRI->getRegClass(Reg);
int Bytes = TRI->getRegSizeInBits(*RC) / 8;
- unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+ Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
assert((Bytes == 1 || Bytes == 2 || Bytes == 4 || Bytes == 8) &&
"Unknown register size");
@@ -2078,7 +2078,7 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
// First, we transfer the predicate state into the called function by merging
// it into the stack pointer. This will kill the current def of the state.
- unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+ Register StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
// If this call is also a return, it is a tail call and we don't need anything
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 78bc5519c23f..e3d0128dd73d 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -127,7 +127,7 @@ static std::string computeDataLayout(const Triple &TT) {
// Some ABIs align long double to 128 bits, others to 32.
if (TT.isOSNaCl() || TT.isOSIAMCU())
; // No f80
- else if (TT.isArch64Bit() || TT.isOSDarwin())
+ else if (TT.isArch64Bit() || TT.isOSDarwin() || TT.isWindowsMSVCEnvironment())
Ret += "-f80:128";
else
Ret += "-f80:32";
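A rough standalone sketch of the data-layout choice above (the helper name and plain boolean parameters are assumptions for illustration, not part of the patch): NaCl and IAMCU targets get no f80 entry at all, while 64-bit, Darwin, and now Windows/MSVC targets align f80 to 128 bits; everything else keeps 32-bit alignment.

#include <string>

// Hypothetical helper mirroring the f80 branch in computeDataLayout().
std::string f80LayoutPiece(bool IsNaClOrIAMCU, bool Is64BitOrDarwinOrMSVC) {
  if (IsNaClOrIAMCU)
    return "";                                    // no f80 entry
  return Is64BitOrDarwinOrMSVC ? "-f80:128" : "-f80:32";
}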
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index d8cd7311a0d5..5b95c10332dc 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -43,6 +43,7 @@
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
@@ -3429,6 +3430,20 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ { ISD::ROTL, MVT::v32i16, 2 },
+ { ISD::ROTL, MVT::v16i16, 2 },
+ { ISD::ROTL, MVT::v8i16, 2 },
+ { ISD::ROTL, MVT::v64i8, 5 },
+ { ISD::ROTL, MVT::v32i8, 5 },
+ { ISD::ROTL, MVT::v16i8, 5 },
+ { ISD::ROTR, MVT::v32i16, 2 },
+ { ISD::ROTR, MVT::v16i16, 2 },
+ { ISD::ROTR, MVT::v8i16, 2 },
+ { ISD::ROTR, MVT::v64i8, 5 },
+ { ISD::ROTR, MVT::v32i8, 5 },
+ { ISD::ROTR, MVT::v16i8, 5 }
+ };
static const CostTblEntry AVX512CostTbl[] = {
{ ISD::ROTL, MVT::v8i64, 1 },
{ ISD::ROTL, MVT::v4i64, 1 },
@@ -3506,6 +3521,10 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
MVT MTy = LT.second;
// Attempt to lookup cost.
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
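As a worked example of the new AVX512BW rotate entries (assuming a legalization factor of 1 for types that are already legal on such a target): a rotate of <32 x i16> hits the { ISD::ROTL, MVT::v32i16, 2 } entry, so the reported cost is 1 * 2 = 2, while a <64 x i8> rotate costs 1 * 5 = 5. A minimal sketch of the final multiply step, for illustration only:

// TableCost comes from AVX512BWCostTbl; LegalizationFactor is LT.first.
unsigned rotateIntrinsicCost(unsigned LegalizationFactor, unsigned TableCost) {
  return LegalizationFactor * TableCost;  // e.g. 1 * 2 for ROTL of v32i16
}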
@@ -4976,9 +4995,13 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost(
const Instruction *I = nullptr) {
if (CostKind != TTI::TCK_RecipThroughput) {
if ((Opcode == Instruction::Load &&
- isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
+ isLegalMaskedGather(SrcVTy, Align(Alignment)) &&
+ !forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
+ Align(Alignment))) ||
(Opcode == Instruction::Store &&
- isLegalMaskedScatter(SrcVTy, Align(Alignment))))
+ isLegalMaskedScatter(SrcVTy, Align(Alignment)) &&
+ !forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
+ Align(Alignment))))
return 1;
return BaseT::getGatherScatterOpCost(Opcode, SrcVTy, Ptr, VariableMask,
Alignment, CostKind, I);
@@ -4993,9 +5016,13 @@ InstructionCost X86TTIImpl::getGatherScatterOpCost(
unsigned AddressSpace = PtrTy->getAddressSpace();
if ((Opcode == Instruction::Load &&
- !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
+ (!isLegalMaskedGather(SrcVTy, Align(Alignment)) ||
+ forceScalarizeMaskedGather(cast<VectorType>(SrcVTy),
+ Align(Alignment)))) ||
(Opcode == Instruction::Store &&
- !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
+ (!isLegalMaskedScatter(SrcVTy, Align(Alignment)) ||
+ forceScalarizeMaskedScatter(cast<VectorType>(SrcVTy),
+ Align(Alignment)))))
return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment,
AddressSpace);
@@ -5118,35 +5145,21 @@ bool X86TTIImpl::supportsGather() const {
return ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2());
}
+bool X86TTIImpl::forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
+ // Gather / Scatter for vector 2 is not profitable on KNL / SKX
+ // Vector-4 of gather/scatter instruction does not exist on KNL. We can extend
+ // it to 8 elements, but zeroing upper bits of the mask vector will add more
+ // instructions. Right now we give the scalar cost of vector-4 for KNL. TODO:
+ // Check, maybe the gather/scatter instruction is better in the VariableMask
+ // case.
+ unsigned NumElts = cast<FixedVectorType>(VTy)->getNumElements();
+ return NumElts == 1 ||
+ (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())));
+}
+
bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
if (!supportsGather())
return false;
-
- // This function is called now in two cases: from the Loop Vectorizer
- // and from the Scalarizer.
- // When the Loop Vectorizer asks about legality of the feature,
- // the vectorization factor is not calculated yet. The Loop Vectorizer
- // sends a scalar type and the decision is based on the width of the
- // scalar element.
- // Later on, the cost model will estimate usage this intrinsic based on
- // the vector type.
- // The Scalarizer asks again about legality. It sends a vector type.
- // In this case we can reject non-power-of-2 vectors.
- // We also reject single element vectors as the type legalizer can't
- // scalarize it.
- if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
- unsigned NumElts = DataVTy->getNumElements();
- if (NumElts == 1)
- return false;
- // Gather / Scatter for vector 2 is not profitable on KNL / SKX
- // Vector-4 of gather/scatter instruction does not exist on KNL.
- // We can extend it to 8 elements, but zeroing upper bits of
- // the mask vector will add more instructions. Right now we give the scalar
- // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter
- // instruction is better in the VariableMask case.
- if (ST->hasAVX512() && (NumElts == 2 || (NumElts == 4 && !ST->hasVLX())))
- return false;
- }
Type *ScalarTy = DataTy->getScalarType();
if (ScalarTy->isPointerTy())
return true;
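A self-contained restatement of the heuristic hoisted into forceScalarizeMaskedGather above (a hypothetical free function with the subtarget queries replaced by plain booleans, not an in-tree API): single-element gathers are always scalarized, and on AVX512 targets 2-element gathers, plus 4-element gathers without AVX512VL, are scalarized as well.

// Sketch only; mirrors the predicate above.
bool forceScalarizeGather(unsigned NumElts, bool HasAVX512, bool HasVLX) {
  return NumElts == 1 ||
         (HasAVX512 && (NumElts == 2 || (NumElts == 4 && !HasVLX)));
}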
@@ -5187,9 +5200,48 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
const FeatureBitset &CalleeBits =
TM.getSubtargetImpl(*Callee)->getFeatureBits();
+ // Check whether features are the same (apart from the ignore list).
FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
- return (RealCallerBits & RealCalleeBits) == RealCalleeBits;
+ if (RealCallerBits == RealCalleeBits)
+ return true;
+
+ // If the features are a subset, we need to additionally check for calls
+ // that may become ABI-incompatible as a result of inlining.
+ if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
+ return false;
+
+ for (const Instruction &I : instructions(Callee)) {
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ SmallVector<Type *, 8> Types;
+ for (Value *Arg : CB->args())
+ Types.push_back(Arg->getType());
+ if (!CB->getType()->isVoidTy())
+ Types.push_back(CB->getType());
+
+ // Simple types are always ABI compatible.
+ auto IsSimpleTy = [](Type *Ty) {
+ return !Ty->isVectorTy() && !Ty->isAggregateType();
+ };
+ if (all_of(Types, IsSimpleTy))
+ continue;
+
+ if (Function *NestedCallee = CB->getCalledFunction()) {
+ // Assume that intrinsics are always ABI compatible.
+ if (NestedCallee->isIntrinsic())
+ continue;
+
+ // Do a precise compatibility check.
+ if (!areTypesABICompatible(Caller, NestedCallee, Types))
+ return false;
+ } else {
+        // We don't know the target features of the callee, so
+ // assume it is incompatible.
+ return false;
+ }
+ }
+ }
+ return true;
}
bool X86TTIImpl::areTypesABICompatible(const Function *Caller,
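The nested-call scan added to areInlineCompatible above boils down to the decision below; this is a hedged, standalone restatement (the enum and function are invented for illustration, not the in-tree API): identical feature sets always inline, and a strict feature subset inlines only if every nested call that involves vectors or aggregates is an intrinsic or passed the precise ABI check.

#include <vector>

// Outcome of inspecting one call site inside the prospective inline candidate.
enum class NestedCall { ScalarOnly, Intrinsic, ABIChecked, Unknown };

bool canInline(unsigned long long CallerFeats, unsigned long long CalleeFeats,
               const std::vector<NestedCall> &CalleeCalls) {
  if (CallerFeats == CalleeFeats)
    return true;                                   // same features: always safe
  if ((CallerFeats & CalleeFeats) != CalleeFeats)
    return false;                                  // callee needs extra features
  for (NestedCall C : CalleeCalls)                 // subset: vet nested calls
    if (C == NestedCall::Unknown)
      return false;
  return true;
}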
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 11e9cb09c7d5..69715072426f 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -226,6 +226,10 @@ public:
bool isLegalMaskedStore(Type *DataType, Align Alignment);
bool isLegalNTLoad(Type *DataType, Align Alignment);
bool isLegalNTStore(Type *DataType, Align Alignment);
+ bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment);
+ bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
+ return forceScalarizeMaskedGather(VTy, Alignment);
+ }
bool isLegalMaskedGather(Type *DataType, Align Alignment);
bool isLegalMaskedScatter(Type *DataType, Align Alignment);
bool isLegalMaskedExpandLoad(Type *DataType);
diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
index f2f89f4269ed..19ebcb3ea3e8 100644
--- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -428,7 +428,7 @@ bool XCoreFrameLowering::spillCalleeSavedRegisters(
DL = MI->getDebugLoc();
for (const CalleeSavedInfo &I : CSI) {
- unsigned Reg = I.getReg();
+ Register Reg = I.getReg();
assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) &&
"LR & FP are always handled in emitPrologue");
@@ -455,7 +455,7 @@ bool XCoreFrameLowering::restoreCalleeSavedRegisters(
if (!AtStart)
--BeforeI;
for (const CalleeSavedInfo &CSR : CSI) {
- unsigned Reg = CSR.getReg();
+ Register Reg = CSR.getReg();
assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) &&
"LR & FP are always handled in emitEpilogue");
diff --git a/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp b/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
index 6799823f6fcb..0d1ba39b8b10 100644
--- a/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -97,7 +97,7 @@ static void InsertFPConstInst(MachineBasicBlock::iterator II,
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc dl = MI.getDebugLoc();
- unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
+ Register ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
RS->setRegUsed(ScratchOffset);
TII.loadImmediate(MBB, II, ScratchOffset, Offset);
@@ -174,7 +174,7 @@ static void InsertSPConstInst(MachineBasicBlock::iterator II,
} else
ScratchBase = Reg;
BuildMI(MBB, II, dl, TII.get(XCore::LDAWSP_ru6), ScratchBase).addImm(0);
- unsigned ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
+ Register ScratchOffset = RS->scavengeRegister(&XCore::GRRegsRegClass, II, 0);
RS->setRegUsed(ScratchOffset);
TII.loadImmediate(MBB, II, ScratchOffset, Offset);
diff --git a/llvm/lib/Target/XCore/XCoreSubtarget.cpp b/llvm/lib/Target/XCore/XCoreSubtarget.cpp
index 1be707cb488c..d4b777ef447f 100644
--- a/llvm/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/llvm/lib/Target/XCore/XCoreSubtarget.cpp
@@ -26,5 +26,5 @@ void XCoreSubtarget::anchor() { }
XCoreSubtarget::XCoreSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const TargetMachine &TM)
- : XCoreGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), InstrInfo(),
- FrameLowering(*this), TLInfo(TM, *this), TSInfo() {}
+ : XCoreGenSubtargetInfo(TT, CPU, /*TuneCPU*/ CPU, FS), FrameLowering(*this),
+ TLInfo(TM, *this) {}
diff --git a/llvm/lib/TextAPI/Architecture.cpp b/llvm/lib/TextAPI/Architecture.cpp
index e1901d5c0ce5..bb349b21774e 100644
--- a/llvm/lib/TextAPI/Architecture.cpp
+++ b/llvm/lib/TextAPI/Architecture.cpp
@@ -15,7 +15,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/TextAPI/ArchitectureSet.h"
+#include "llvm/Support/ErrorHandling.h"
namespace llvm {
namespace MachO {
diff --git a/llvm/lib/TextAPI/PackedVersion.cpp b/llvm/lib/TextAPI/PackedVersion.cpp
index f8171e02b6d3..67fb30aeb127 100644
--- a/llvm/lib/TextAPI/PackedVersion.cpp
+++ b/llvm/lib/TextAPI/PackedVersion.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/TextAPI/PackedVersion.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Format.h"
diff --git a/llvm/lib/TextAPI/Platform.cpp b/llvm/lib/TextAPI/Platform.cpp
index a2ce6d0cac86..c3c74252301e 100644
--- a/llvm/lib/TextAPI/Platform.cpp
+++ b/llvm/lib/TextAPI/Platform.cpp
@@ -18,120 +18,118 @@
namespace llvm {
namespace MachO {
-PlatformKind mapToPlatformKind(PlatformKind Platform, bool WantSim) {
+PlatformType mapToPlatformType(PlatformType Platform, bool WantSim) {
switch (Platform) {
default:
return Platform;
- case PlatformKind::iOS:
- return WantSim ? PlatformKind::iOSSimulator : PlatformKind::iOS;
- case PlatformKind::tvOS:
- return WantSim ? PlatformKind::tvOSSimulator : PlatformKind::tvOS;
- case PlatformKind::watchOS:
- return WantSim ? PlatformKind::watchOSSimulator : PlatformKind::watchOS;
+ case PLATFORM_IOS:
+ return WantSim ? PLATFORM_IOSSIMULATOR : PLATFORM_IOS;
+ case PLATFORM_TVOS:
+ return WantSim ? PLATFORM_TVOSSIMULATOR : PLATFORM_TVOS;
+ case PLATFORM_WATCHOS:
+ return WantSim ? PLATFORM_WATCHOSSIMULATOR : PLATFORM_WATCHOS;
}
- llvm_unreachable("Unknown llvm::MachO::PlatformKind enum");
}
-PlatformKind mapToPlatformKind(const Triple &Target) {
+PlatformType mapToPlatformType(const Triple &Target) {
switch (Target.getOS()) {
default:
- return PlatformKind::unknown;
+ return PLATFORM_UNKNOWN;
case Triple::MacOSX:
- return PlatformKind::macOS;
+ return PLATFORM_MACOS;
case Triple::IOS:
if (Target.isSimulatorEnvironment())
- return PlatformKind::iOSSimulator;
+ return PLATFORM_IOSSIMULATOR;
if (Target.getEnvironment() == Triple::MacABI)
- return PlatformKind::macCatalyst;
- return PlatformKind::iOS;
+ return PLATFORM_MACCATALYST;
+ return PLATFORM_IOS;
case Triple::TvOS:
- return Target.isSimulatorEnvironment() ? PlatformKind::tvOSSimulator
- : PlatformKind::tvOS;
+ return Target.isSimulatorEnvironment() ? PLATFORM_TVOSSIMULATOR
+ : PLATFORM_TVOS;
case Triple::WatchOS:
- return Target.isSimulatorEnvironment() ? PlatformKind::watchOSSimulator
- : PlatformKind::watchOS;
+ return Target.isSimulatorEnvironment() ? PLATFORM_WATCHOSSIMULATOR
+ : PLATFORM_WATCHOS;
// TODO: add bridgeOS & driverKit once in llvm::Triple
}
- llvm_unreachable("Unknown Target Triple");
}
PlatformSet mapToPlatformSet(ArrayRef<Triple> Targets) {
PlatformSet Result;
for (const auto &Target : Targets)
- Result.insert(mapToPlatformKind(Target));
+ Result.insert(mapToPlatformType(Target));
return Result;
}
-StringRef getPlatformName(PlatformKind Platform) {
+StringRef getPlatformName(PlatformType Platform) {
switch (Platform) {
- case PlatformKind::unknown:
+ case PLATFORM_UNKNOWN:
return "unknown";
- case PlatformKind::macOS:
+ case PLATFORM_MACOS:
return "macOS";
- case PlatformKind::iOS:
+ case PLATFORM_IOS:
return "iOS";
- case PlatformKind::tvOS:
+ case PLATFORM_TVOS:
return "tvOS";
- case PlatformKind::watchOS:
+ case PLATFORM_WATCHOS:
return "watchOS";
- case PlatformKind::bridgeOS:
+ case PLATFORM_BRIDGEOS:
return "bridgeOS";
- case PlatformKind::macCatalyst:
+ case PLATFORM_MACCATALYST:
return "macCatalyst";
- case PlatformKind::iOSSimulator:
+ case PLATFORM_IOSSIMULATOR:
return "iOS Simulator";
- case PlatformKind::tvOSSimulator:
+ case PLATFORM_TVOSSIMULATOR:
return "tvOS Simulator";
- case PlatformKind::watchOSSimulator:
+ case PLATFORM_WATCHOSSIMULATOR:
return "watchOS Simulator";
- case PlatformKind::driverKit:
+ case PLATFORM_DRIVERKIT:
return "DriverKit";
}
- llvm_unreachable("Unknown llvm::MachO::PlatformKind enum");
+ llvm_unreachable("Unknown llvm::MachO::PlatformType enum");
}
-PlatformKind getPlatformFromName(StringRef Name) {
- return StringSwitch<PlatformKind>(Name)
- .Case("macos", PlatformKind::macOS)
- .Case("ios", PlatformKind::iOS)
- .Case("tvos", PlatformKind::tvOS)
- .Case("watchos", PlatformKind::watchOS)
- .Case("bridgeos", PlatformKind::macOS)
- .Case("ios-macabi", PlatformKind::macCatalyst)
- .Case("ios-simulator", PlatformKind::iOSSimulator)
- .Case("tvos-simulator", PlatformKind::tvOSSimulator)
- .Case("watchos-simulator", PlatformKind::watchOSSimulator)
- .Case("driverkit", PlatformKind::driverKit)
- .Default(PlatformKind::unknown);
+PlatformType getPlatformFromName(StringRef Name) {
+ return StringSwitch<PlatformType>(Name)
+ .Case("macos", PLATFORM_MACOS)
+ .Case("ios", PLATFORM_IOS)
+ .Case("tvos", PLATFORM_TVOS)
+ .Case("watchos", PLATFORM_WATCHOS)
+ .Case("bridgeos", PLATFORM_BRIDGEOS)
+ .Case("ios-macabi", PLATFORM_MACCATALYST)
+ .Case("ios-simulator", PLATFORM_IOSSIMULATOR)
+ .Case("tvos-simulator", PLATFORM_TVOSSIMULATOR)
+ .Case("watchos-simulator", PLATFORM_WATCHOSSIMULATOR)
+ .Case("driverkit", PLATFORM_DRIVERKIT)
+ .Default(PLATFORM_UNKNOWN);
}
-std::string getOSAndEnvironmentName(PlatformKind Platform,
+std::string getOSAndEnvironmentName(PlatformType Platform,
std::string Version) {
switch (Platform) {
- case PlatformKind::unknown:
+ case PLATFORM_UNKNOWN:
return "darwin" + Version;
- case PlatformKind::macOS:
+ case PLATFORM_MACOS:
return "macos" + Version;
- case PlatformKind::iOS:
+ case PLATFORM_IOS:
return "ios" + Version;
- case PlatformKind::tvOS:
+ case PLATFORM_TVOS:
return "tvos" + Version;
- case PlatformKind::watchOS:
+ case PLATFORM_WATCHOS:
return "watchos" + Version;
- case PlatformKind::bridgeOS:
+ case PLATFORM_BRIDGEOS:
return "bridgeos" + Version;
- case PlatformKind::macCatalyst:
+ case PLATFORM_MACCATALYST:
return "ios" + Version + "-macabi";
- case PlatformKind::iOSSimulator:
+ case PLATFORM_IOSSIMULATOR:
return "ios" + Version + "-simulator";
- case PlatformKind::tvOSSimulator:
+ case PLATFORM_TVOSSIMULATOR:
return "tvos" + Version + "-simulator";
- case PlatformKind::watchOSSimulator:
+ case PLATFORM_WATCHOSSIMULATOR:
return "watchos" + Version + "-simulator";
- case PlatformKind::driverKit:
+ case PLATFORM_DRIVERKIT:
return "driverkit" + Version;
}
- llvm_unreachable("Unknown llvm::MachO::PlatformKind enum");
+ llvm_unreachable("Unknown llvm::MachO::PlatformType enum");
}
} // end namespace MachO.
diff --git a/llvm/lib/TextAPI/Target.cpp b/llvm/lib/TextAPI/Target.cpp
index 35fe1bf65e6f..c54c3bd66b9d 100644
--- a/llvm/lib/TextAPI/Target.cpp
+++ b/llvm/lib/TextAPI/Target.cpp
@@ -7,11 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/TextAPI/Target.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/Support/Format.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Support/raw_ostream.h"
namespace llvm {
@@ -22,26 +19,26 @@ Expected<Target> Target::create(StringRef TargetValue) {
auto ArchitectureStr = Result.first;
auto Architecture = getArchitectureFromName(ArchitectureStr);
auto PlatformStr = Result.second;
- PlatformKind Platform;
- Platform = StringSwitch<PlatformKind>(PlatformStr)
- .Case("macos", PlatformKind::macOS)
- .Case("ios", PlatformKind::iOS)
- .Case("tvos", PlatformKind::tvOS)
- .Case("watchos", PlatformKind::watchOS)
- .Case("bridgeos", PlatformKind::bridgeOS)
- .Case("maccatalyst", PlatformKind::macCatalyst)
- .Case("ios-simulator", PlatformKind::iOSSimulator)
- .Case("tvos-simulator", PlatformKind::tvOSSimulator)
- .Case("watchos-simulator", PlatformKind::watchOSSimulator)
- .Case("driverkit", PlatformKind::driverKit)
- .Default(PlatformKind::unknown);
+ PlatformType Platform;
+ Platform = StringSwitch<PlatformType>(PlatformStr)
+ .Case("macos", PLATFORM_MACOS)
+ .Case("ios", PLATFORM_IOS)
+ .Case("tvos", PLATFORM_TVOS)
+ .Case("watchos", PLATFORM_WATCHOS)
+ .Case("bridgeos", PLATFORM_BRIDGEOS)
+ .Case("maccatalyst", PLATFORM_MACCATALYST)
+ .Case("ios-simulator", PLATFORM_IOSSIMULATOR)
+ .Case("tvos-simulator", PLATFORM_TVOSSIMULATOR)
+ .Case("watchos-simulator", PLATFORM_WATCHOSSIMULATOR)
+ .Case("driverkit", PLATFORM_DRIVERKIT)
+ .Default(PLATFORM_UNKNOWN);
- if (Platform == PlatformKind::unknown) {
+ if (Platform == PLATFORM_UNKNOWN) {
if (PlatformStr.startswith("<") && PlatformStr.endswith(">")) {
PlatformStr = PlatformStr.drop_front().drop_back();
unsigned long long RawValue;
if (!PlatformStr.getAsInteger(10, RawValue))
- Platform = (PlatformKind)RawValue;
+ Platform = (PlatformType)RawValue;
}
}
diff --git a/llvm/lib/TextAPI/TextStub.cpp b/llvm/lib/TextAPI/TextStub.cpp
index b64f19ab65cc..ff93e43356f7 100644
--- a/llvm/lib/TextAPI/TextStub.cpp
+++ b/llvm/lib/TextAPI/TextStub.cpp
@@ -380,34 +380,34 @@ template <> struct ScalarTraits<Target> {
default:
OS << "unknown";
break;
- case PlatformKind::macOS:
+ case PLATFORM_MACOS:
OS << "macos";
break;
- case PlatformKind::iOS:
+ case PLATFORM_IOS:
OS << "ios";
break;
- case PlatformKind::tvOS:
+ case PLATFORM_TVOS:
OS << "tvos";
break;
- case PlatformKind::watchOS:
+ case PLATFORM_WATCHOS:
OS << "watchos";
break;
- case PlatformKind::bridgeOS:
+ case PLATFORM_BRIDGEOS:
OS << "bridgeos";
break;
- case PlatformKind::macCatalyst:
+ case PLATFORM_MACCATALYST:
OS << "maccatalyst";
break;
- case PlatformKind::iOSSimulator:
+ case PLATFORM_IOSSIMULATOR:
OS << "ios-simulator";
break;
- case PlatformKind::tvOSSimulator:
+ case PLATFORM_TVOSSIMULATOR:
OS << "tvos-simulator";
break;
- case PlatformKind::watchOSSimulator:
+ case PLATFORM_WATCHOSSIMULATOR:
OS << "watchos-simulator";
break;
- case PlatformKind::driverKit:
+ case PLATFORM_DRIVERKIT:
OS << "driverkit";
break;
}
@@ -423,7 +423,7 @@ template <> struct ScalarTraits<Target> {
Value = *Result;
if (Value.Arch == AK_unknown)
return "unknown architecture";
- if (Value.Platform == PlatformKind::unknown)
+ if (Value.Platform == PLATFORM_UNKNOWN)
return "unknown platform";
return {};
@@ -597,11 +597,10 @@ template <> struct MappingTraits<const InterfaceFile *> {
TargetList Targets;
for (auto Platform : Platforms) {
- Platform = mapToPlatformKind(Platform, Architectures.hasX86());
+ Platform = mapToPlatformType(Platform, Architectures.hasX86());
for (const auto &&Architecture : Architectures) {
- if ((Architecture == AK_i386) &&
- (Platform == PlatformKind::macCatalyst))
+ if ((Architecture == AK_i386) && (Platform == PLATFORM_MACCATALYST))
continue;
Targets.emplace_back(Architecture, Platform);
diff --git a/llvm/lib/TextAPI/TextStubCommon.cpp b/llvm/lib/TextAPI/TextStubCommon.cpp
index c2713b9b5203..29b74f981a91 100644
--- a/llvm/lib/TextAPI/TextStubCommon.cpp
+++ b/llvm/lib/TextAPI/TextStubCommon.cpp
@@ -49,8 +49,8 @@ void ScalarTraits<PlatformSet>::output(const PlatformSet &Values, void *IO,
assert((!Ctx || Ctx->FileKind != FileType::Invalid) &&
"File type is not set in context");
- if (Ctx && Ctx->FileKind == TBD_V3 && Values.count(PlatformKind::macOS) &&
- Values.count(PlatformKind::macCatalyst)) {
+ if (Ctx && Ctx->FileKind == TBD_V3 && Values.count(PLATFORM_MACOS) &&
+ Values.count(PLATFORM_MACCATALYST)) {
OS << "zippered";
return;
}
@@ -60,31 +60,31 @@ void ScalarTraits<PlatformSet>::output(const PlatformSet &Values, void *IO,
default:
llvm_unreachable("unexpected platform");
break;
- case PlatformKind::macOS:
+ case PLATFORM_MACOS:
OS << "macosx";
break;
- case PlatformKind::iOSSimulator:
+ case PLATFORM_IOSSIMULATOR:
LLVM_FALLTHROUGH;
- case PlatformKind::iOS:
+ case PLATFORM_IOS:
OS << "ios";
break;
- case PlatformKind::watchOSSimulator:
+ case PLATFORM_WATCHOSSIMULATOR:
LLVM_FALLTHROUGH;
- case PlatformKind::watchOS:
+ case PLATFORM_WATCHOS:
OS << "watchos";
break;
- case PlatformKind::tvOSSimulator:
+ case PLATFORM_TVOSSIMULATOR:
LLVM_FALLTHROUGH;
- case PlatformKind::tvOS:
+ case PLATFORM_TVOS:
OS << "tvos";
break;
- case PlatformKind::bridgeOS:
+ case PLATFORM_BRIDGEOS:
OS << "bridgeos";
break;
- case PlatformKind::macCatalyst:
+ case PLATFORM_MACCATALYST:
OS << "iosmac";
break;
- case PlatformKind::driverKit:
+ case PLATFORM_DRIVERKIT:
OS << "driverkit";
break;
}
@@ -98,28 +98,27 @@ StringRef ScalarTraits<PlatformSet>::input(StringRef Scalar, void *IO,
if (Scalar == "zippered") {
if (Ctx && Ctx->FileKind == FileType::TBD_V3) {
- Values.insert(PlatformKind::macOS);
- Values.insert(PlatformKind::macCatalyst);
+ Values.insert(PLATFORM_MACOS);
+ Values.insert(PLATFORM_MACCATALYST);
return {};
}
return "invalid platform";
}
- auto Platform = StringSwitch<PlatformKind>(Scalar)
- .Case("unknown", PlatformKind::unknown)
- .Case("macosx", PlatformKind::macOS)
- .Case("ios", PlatformKind::iOS)
- .Case("watchos", PlatformKind::watchOS)
- .Case("tvos", PlatformKind::tvOS)
- .Case("bridgeos", PlatformKind::bridgeOS)
- .Case("iosmac", PlatformKind::macCatalyst)
- .Default(PlatformKind::unknown);
-
- if (Platform == PlatformKind::macCatalyst)
+ auto Platform = StringSwitch<PlatformType>(Scalar)
+ .Case("macosx", PLATFORM_MACOS)
+ .Case("ios", PLATFORM_IOS)
+ .Case("watchos", PLATFORM_WATCHOS)
+ .Case("tvos", PLATFORM_TVOS)
+ .Case("bridgeos", PLATFORM_BRIDGEOS)
+ .Case("iosmac", PLATFORM_MACCATALYST)
+ .Default(PLATFORM_UNKNOWN);
+
+ if (Platform == PLATFORM_MACCATALYST)
if (Ctx && Ctx->FileKind != FileType::TBD_V3)
return "invalid platform";
- if (Platform == PlatformKind::unknown)
+ if (Platform == PLATFORM_UNKNOWN)
return "unknown platform";
Values.insert(Platform);
@@ -226,7 +225,7 @@ StringRef ScalarTraits<UUID>::input(StringRef Scalar, void *, UUID &Value) {
if (UUID.empty())
return "invalid uuid string pair";
Value.second = std::string(UUID);
- Value.first = Target{getArchitectureFromName(Arch), PlatformKind::unknown};
+ Value.first = Target{getArchitectureFromName(Arch), PLATFORM_UNKNOWN};
return {};
}
diff --git a/llvm/lib/TextAPI/TextStubCommon.h b/llvm/lib/TextAPI/TextStubCommon.h
index 89ae5d56297c..aac27221b5ff 100644
--- a/llvm/lib/TextAPI/TextStubCommon.h
+++ b/llvm/lib/TextAPI/TextStubCommon.h
@@ -16,9 +16,9 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/YAMLTraits.h"
#include "llvm/TextAPI/Architecture.h"
-#include "llvm/TextAPI/ArchitectureSet.h"
#include "llvm/TextAPI/InterfaceFile.h"
-#include "llvm/TextAPI/PackedVersion.h"
+#include "llvm/TextAPI/Platform.h"
+#include "llvm/TextAPI/Target.h"
using UUID = std::pair<llvm::MachO::Target, std::string>;
@@ -28,6 +28,11 @@ LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(UUID)
LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(FlowStringRef)
namespace llvm {
+
+namespace MachO {
+ class ArchitectureSet;
+ class PackedVersion;
+}
namespace yaml {
template <> struct ScalarTraits<FlowStringRef> {
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index 68a34bdcb1cd..1533e1805f17 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -176,11 +176,14 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
lowerCoroNoop(cast<IntrinsicInst>(&I));
break;
case Intrinsic::coro_id:
- // Mark a function that comes out of the frontend that has a coro.id
- // with a coroutine attribute.
if (auto *CII = cast<CoroIdInst>(&I)) {
if (CII->getInfo().isPreSplit()) {
- F.addFnAttr(CORO_PRESPLIT_ATTR, UNPREPARED_FOR_SPLIT);
+ assert(F.hasFnAttribute(CORO_PRESPLIT_ATTR) &&
+ F.getFnAttribute(CORO_PRESPLIT_ATTR).getValueAsString() ==
+ UNPREPARED_FOR_SPLIT &&
+ "The frontend uses Swtich-Resumed ABI should emit "
+ "\"coroutine.presplit\" attribute with value \"0\" for the "
+ "coroutine.");
setCannotDuplicate(CII);
CII->setCoroutineSelf();
CoroId = cast<CoroIdInst>(&I);
@@ -190,6 +193,8 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
case Intrinsic::coro_id_retcon:
case Intrinsic::coro_id_retcon_once:
case Intrinsic::coro_id_async:
+ // TODO: Remove the line once we support it in the corresponding
+ // frontend.
F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT);
break;
case Intrinsic::coro_resume:
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index a0d12865bd3a..92acfb93057a 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -587,7 +587,7 @@ void FrameTypeBuilder::addFieldForAllocas(const Function &F,
}
});
- if (!Shape.ReuseFrameSlot && !EnableReuseStorageInFrame) {
+ if (!Shape.OptimizeFrame && !EnableReuseStorageInFrame) {
for (const auto &A : FrameData.Allocas) {
AllocaInst *Alloca = A.Alloca;
NonOverlapedAllocas.emplace_back(AllocaSetType(1, Alloca));
@@ -808,7 +808,7 @@ static StringRef solveTypeName(Type *Ty) {
if (Ty->isPointerTy()) {
auto *PtrTy = cast<PointerType>(Ty);
- Type *PointeeTy = PtrTy->getElementType();
+ Type *PointeeTy = PtrTy->getPointerElementType();
auto Name = solveTypeName(PointeeTy);
if (Name == "UnknownType")
return "PointerType";
@@ -1659,7 +1659,7 @@ static Instruction *insertSpills(const FrameDataInfo &FrameData,
&*Builder.GetInsertPoint());
// This dbg.declare is for the main function entry point. It
// will be deleted in all coro-split functions.
- coro::salvageDebugInfo(DbgPtrAllocaCache, DDI, Shape.ReuseFrameSlot);
+ coro::salvageDebugInfo(DbgPtrAllocaCache, DDI, Shape.OptimizeFrame);
}
}
@@ -2278,7 +2278,7 @@ static void eliminateSwiftErrorArgument(Function &F, Argument &Arg,
IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHIOrDbg());
auto ArgTy = cast<PointerType>(Arg.getType());
- auto ValueTy = ArgTy->getElementType();
+ auto ValueTy = ArgTy->getPointerElementType();
// Reduce to the alloca case:
@@ -2506,7 +2506,7 @@ static void collectFrameAllocas(Function &F, coro::Shape &Shape,
void coro::salvageDebugInfo(
SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> &DbgPtrAllocaCache,
- DbgVariableIntrinsic *DVI, bool ReuseFrameSlot) {
+ DbgVariableIntrinsic *DVI, bool OptimizeFrame) {
Function *F = DVI->getFunction();
IRBuilder<> Builder(F->getContext());
auto InsertPt = F->getEntryBlock().getFirstInsertionPt();
@@ -2558,7 +2558,7 @@ void coro::salvageDebugInfo(
//
  // Avoid creating an alloca that would be eliminated by optimization
  // passes, which would leave the corresponding dbg.declares invalid.
- if (!ReuseFrameSlot && !EnableReuseStorageInFrame)
+ if (!OptimizeFrame && !EnableReuseStorageInFrame)
if (auto *Arg = dyn_cast<llvm::Argument>(Storage)) {
auto &Cached = DbgPtrAllocaCache[Storage];
if (!Cached) {
diff --git a/llvm/lib/Transforms/Coroutines/CoroInstr.h b/llvm/lib/Transforms/Coroutines/CoroInstr.h
index bf3d781ba43e..014938c15a0a 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInstr.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInstr.h
@@ -599,6 +599,18 @@ public:
}
};
+/// This represents the llvm.coro.align instruction.
+class LLVM_LIBRARY_VISIBILITY CoroAlignInst : public IntrinsicInst {
+public:
+ // Methods to support type inquiry through isa, cast, and dyn_cast:
+ static bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::coro_align;
+ }
+ static bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+};
+
class LLVM_LIBRARY_VISIBILITY AnyCoroEndInst : public IntrinsicInst {
enum { FrameArg, UnwindArg };
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 27ba8524f975..9a17068df3a9 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -36,6 +36,11 @@ void initializeCoroCleanupLegacyPass(PassRegistry &);
// adds coroutine subfunctions to the SCC to be processed by IPO pipeline.
// Async lowering similarly triggers a restart of the pipeline after it has
// split the coroutine.
+//
+// FIXME: Refactor these attributes as LLVM attributes instead of string
+// attributes since these attributes are already used outside LLVM's
+// coroutine module.
+// FIXME: Remove these values once we remove the Legacy PM.
#define CORO_PRESPLIT_ATTR "coroutine.presplit"
#define UNPREPARED_FOR_SPLIT "0"
#define PREPARED_FOR_SPLIT "1"
@@ -54,7 +59,7 @@ void updateCallGraph(Function &Caller, ArrayRef<Function *> Funcs,
/// holding a pointer to the coroutine frame.
void salvageDebugInfo(
SmallDenseMap<llvm::Value *, llvm::AllocaInst *, 4> &DbgPtrAllocaCache,
- DbgVariableIntrinsic *DVI, bool ReuseFrameSlot);
+ DbgVariableIntrinsic *DVI, bool OptimizeFrame);
// Keeps data and helper functions for lowering coroutine intrinsics.
struct LowererBase {
@@ -99,6 +104,7 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
CoroBeginInst *CoroBegin;
SmallVector<AnyCoroEndInst *, 4> CoroEnds;
SmallVector<CoroSizeInst *, 2> CoroSizes;
+ SmallVector<CoroAlignInst *, 2> CoroAligns;
SmallVector<AnyCoroSuspendInst *, 4> CoroSuspends;
SmallVector<CallInst*, 2> SwiftErrorOps;
@@ -126,7 +132,7 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
BasicBlock *AllocaSpillBlock;
/// This would only be true if optimization are enabled.
- bool ReuseFrameSlot;
+ bool OptimizeFrame;
struct SwitchLoweringStorage {
SwitchInst *ResumeSwitch;
@@ -272,8 +278,8 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const;
Shape() = default;
- explicit Shape(Function &F, bool ReuseFrameSlot = false)
- : ReuseFrameSlot(ReuseFrameSlot) {
+ explicit Shape(Function &F, bool OptimizeFrame = false)
+ : OptimizeFrame(OptimizeFrame) {
buildFrom(F);
}
void buildFrom(Function &F);
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 12c1829524ef..b5129809c6a6 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -29,6 +29,7 @@
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
@@ -617,7 +618,8 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape,
Value *CachedSlot = nullptr;
auto getSwiftErrorSlot = [&](Type *ValueTy) -> Value * {
if (CachedSlot) {
- assert(CachedSlot->getType()->getPointerElementType() == ValueTy &&
+ assert(cast<PointerType>(CachedSlot->getType())
+ ->isOpaqueOrPointeeTypeMatches(ValueTy) &&
"multiple swifterror slots in function with different types");
return CachedSlot;
}
@@ -626,7 +628,8 @@ static void replaceSwiftErrorOps(Function &F, coro::Shape &Shape,
for (auto &Arg : F.args()) {
if (Arg.isSwiftError()) {
CachedSlot = &Arg;
- assert(Arg.getType()->getPointerElementType() == ValueTy &&
+ assert(cast<PointerType>(Arg.getType())
+ ->isOpaqueOrPointeeTypeMatches(ValueTy) &&
"swifterror argument does not have expected type");
return &Arg;
}
@@ -682,7 +685,7 @@ void CoroCloner::salvageDebugInfo() {
if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
Worklist.push_back(DVI);
for (DbgVariableIntrinsic *DVI : Worklist)
- coro::salvageDebugInfo(DbgPtrAllocaCache, DVI, Shape.ReuseFrameSlot);
+ coro::salvageDebugInfo(DbgPtrAllocaCache, DVI, Shape.OptimizeFrame);
// Remove all salvaged dbg.declare intrinsics that became
// either unreachable or stale due to the CoroSplit transformation.
@@ -835,7 +838,7 @@ Value *CoroCloner::deriveNewFramePointer() {
static void addFramePointerAttrs(AttributeList &Attrs, LLVMContext &Context,
unsigned ParamIndex,
uint64_t Size, Align Alignment) {
- AttrBuilder ParamAttrs;
+ AttrBuilder ParamAttrs(Context);
ParamAttrs.addAttribute(Attribute::NonNull);
ParamAttrs.addAttribute(Attribute::NoAlias);
ParamAttrs.addAlignmentAttr(Alignment);
@@ -845,14 +848,14 @@ static void addFramePointerAttrs(AttributeList &Attrs, LLVMContext &Context,
static void addAsyncContextAttrs(AttributeList &Attrs, LLVMContext &Context,
unsigned ParamIndex) {
- AttrBuilder ParamAttrs;
+ AttrBuilder ParamAttrs(Context);
ParamAttrs.addAttribute(Attribute::SwiftAsync);
Attrs = Attrs.addParamAttributes(Context, ParamIndex, ParamAttrs);
}
static void addSwiftSelfAttrs(AttributeList &Attrs, LLVMContext &Context,
unsigned ParamIndex) {
- AttrBuilder ParamAttrs;
+ AttrBuilder ParamAttrs(Context);
ParamAttrs.addAttribute(Attribute::SwiftSelf);
Attrs = Attrs.addParamAttributes(Context, ParamIndex, ParamAttrs);
}
@@ -929,7 +932,7 @@ void CoroCloner::create() {
case coro::ABI::Switch:
// Bootstrap attributes by copying function attributes from the
// original function. This should include optimization settings and so on.
- NewAttrs = NewAttrs.addFnAttributes(Context, OrigAttrs.getFnAttrs());
+ NewAttrs = NewAttrs.addFnAttributes(Context, AttrBuilder(Context, OrigAttrs.getFnAttrs()));
addFramePointerAttrs(NewAttrs, Context, 0,
Shape.FrameSize, Shape.FrameAlign);
@@ -952,7 +955,7 @@ void CoroCloner::create() {
// Transfer the original function's attributes.
auto FnAttrs = OrigF.getAttributes().getFnAttrs();
- NewAttrs = NewAttrs.addFnAttributes(Context, FnAttrs);
+ NewAttrs = NewAttrs.addFnAttributes(Context, AttrBuilder(Context, FnAttrs));
break;
}
case coro::ABI::Retcon:
@@ -1082,10 +1085,16 @@ static void updateAsyncFuncPointerContextSize(coro::Shape &Shape) {
Shape.AsyncLowering.AsyncFuncPointer->setInitializer(NewFuncPtrStruct);
}
-static void replaceFrameSize(coro::Shape &Shape) {
+static void replaceFrameSizeAndAlignment(coro::Shape &Shape) {
if (Shape.ABI == coro::ABI::Async)
updateAsyncFuncPointerContextSize(Shape);
+ for (CoroAlignInst *CA : Shape.CoroAligns) {
+ CA->replaceAllUsesWith(
+ ConstantInt::get(CA->getType(), Shape.FrameAlign.value()));
+ CA->eraseFromParent();
+ }
+
if (Shape.CoroSizes.empty())
return;
@@ -1197,10 +1206,34 @@ scanPHIsAndUpdateValueMap(Instruction *Prev, BasicBlock *NewBlock,
static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) {
DenseMap<Value *, Value *> ResolvedValues;
BasicBlock *UnconditionalSucc = nullptr;
+ assert(InitialInst->getModule());
+ const DataLayout &DL = InitialInst->getModule()->getDataLayout();
+
+ auto GetFirstValidInstruction = [](Instruction *I) {
+ while (I) {
+ // BitCastInst wouldn't generate actual code, so we can skip it.
+ if (isa<BitCastInst>(I) || I->isDebugOrPseudoInst() ||
+ I->isLifetimeStartOrEnd())
+ I = I->getNextNode();
+ else if (isInstructionTriviallyDead(I))
+ // Since we are in the middle of the transformation, we need to erase
+ // the dead instruction manually.
+ I = &*I->eraseFromParent();
+ else
+ break;
+ }
+ return I;
+ };
+
+ auto TryResolveConstant = [&ResolvedValues](Value *V) {
+ auto It = ResolvedValues.find(V);
+ if (It != ResolvedValues.end())
+ V = It->second;
+ return dyn_cast<ConstantInt>(V);
+ };
Instruction *I = InitialInst;
- while (I->isTerminator() ||
- (isa<CmpInst>(I) && I->getNextNode()->isTerminator())) {
+ while (I->isTerminator() || isa<CmpInst>(I)) {
if (isa<ReturnInst>(I)) {
if (I != InitialInst) {
// If InitialInst is an unconditional branch,
@@ -1213,48 +1246,68 @@ static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) {
}
if (auto *BR = dyn_cast<BranchInst>(I)) {
if (BR->isUnconditional()) {
- BasicBlock *BB = BR->getSuccessor(0);
+ BasicBlock *Succ = BR->getSuccessor(0);
if (I == InitialInst)
- UnconditionalSucc = BB;
- scanPHIsAndUpdateValueMap(I, BB, ResolvedValues);
- I = BB->getFirstNonPHIOrDbgOrLifetime();
+ UnconditionalSucc = Succ;
+ scanPHIsAndUpdateValueMap(I, Succ, ResolvedValues);
+ I = GetFirstValidInstruction(Succ->getFirstNonPHIOrDbgOrLifetime());
continue;
}
- } else if (auto *CondCmp = dyn_cast<CmpInst>(I)) {
- auto *BR = dyn_cast<BranchInst>(I->getNextNode());
- if (BR && BR->isConditional() && CondCmp == BR->getCondition()) {
- // If the case number of suspended switch instruction is reduced to
- // 1, then it is simplified to CmpInst in llvm::ConstantFoldTerminator.
- // And the comparsion looks like : %cond = icmp eq i8 %V, constant.
- ConstantInt *CondConst = dyn_cast<ConstantInt>(CondCmp->getOperand(1));
- if (CondConst && CondCmp->getPredicate() == CmpInst::ICMP_EQ) {
- Value *V = CondCmp->getOperand(0);
- auto it = ResolvedValues.find(V);
- if (it != ResolvedValues.end())
- V = it->second;
-
- if (ConstantInt *Cond0 = dyn_cast<ConstantInt>(V)) {
- BasicBlock *BB = Cond0->equalsInt(CondConst->getZExtValue())
- ? BR->getSuccessor(0)
- : BR->getSuccessor(1);
- scanPHIsAndUpdateValueMap(I, BB, ResolvedValues);
- I = BB->getFirstNonPHIOrDbgOrLifetime();
- continue;
- }
- }
- }
- } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
- Value *V = SI->getCondition();
- auto it = ResolvedValues.find(V);
- if (it != ResolvedValues.end())
- V = it->second;
- if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) {
- BasicBlock *BB = SI->findCaseValue(Cond)->getCaseSuccessor();
- scanPHIsAndUpdateValueMap(I, BB, ResolvedValues);
- I = BB->getFirstNonPHIOrDbgOrLifetime();
+
+ BasicBlock *BB = BR->getParent();
+ // Handle the case where the condition of the conditional branch is constant.
+ // e.g.,
+ //
+ // br i1 false, label %cleanup, label %CoroEnd
+ //
+ // This is possible during the transformation; we can continue
+ // simplifying in this case.
+ if (ConstantFoldTerminator(BB, /*DeleteDeadConditions=*/true)) {
+ // Handle this branch in the next iteration.
+ I = BB->getTerminator();
continue;
}
+ } else if (auto *CondCmp = dyn_cast<CmpInst>(I)) {
+ // If the number of cases in the suspended switch instruction is reduced
+ // to 1, it is simplified to a CmpInst by llvm::ConstantFoldTerminator.
+ auto *BR = dyn_cast<BranchInst>(
+ GetFirstValidInstruction(CondCmp->getNextNode()));
+ if (!BR || !BR->isConditional() || CondCmp != BR->getCondition())
+ return false;
+
+ // The comparison looks like: %cond = icmp eq i8 %V, constant.
+ // So we try to resolve the constant for the first operand only, since the
+ // second operand should be a literal constant by design.
+ ConstantInt *Cond0 = TryResolveConstant(CondCmp->getOperand(0));
+ auto *Cond1 = dyn_cast<ConstantInt>(CondCmp->getOperand(1));
+ if (!Cond0 || !Cond1)
+ return false;
+
+ // Both operands of the CmpInst are constant, so we can evaluate it
+ // immediately to get the destination.
+ auto *ConstResult =
+ dyn_cast_or_null<ConstantInt>(ConstantFoldCompareInstOperands(
+ CondCmp->getPredicate(), Cond0, Cond1, DL));
+ if (!ConstResult)
+ return false;
+
+ CondCmp->replaceAllUsesWith(ConstResult);
+ CondCmp->eraseFromParent();
+
+ // Handle this branch in the next iteration.
+ I = BR;
+ continue;
+ } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
+ ConstantInt *Cond = TryResolveConstant(SI->getCondition());
+ if (!Cond)
+ return false;
+
+ BasicBlock *BB = SI->findCaseValue(Cond)->getCaseSuccessor();
+ scanPHIsAndUpdateValueMap(I, BB, ResolvedValues);
+ I = GetFirstValidInstruction(BB->getFirstNonPHIOrDbgOrLifetime());
+ continue;
}
+
return false;
}
return false;
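A minimal standalone sketch of the folding step the new CmpInst handling in the hunk above relies on (the scaffolding, such as the empty data-layout string, is assumed and not taken from the patch): once both operands resolve to ConstantInts, ConstantFoldCompareInstOperands returns an i1 constant whose value selects which branch successor to follow.

#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Casting.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::DataLayout DL("");                              // a default layout suffices here
  llvm::Type *I8 = llvm::Type::getInt8Ty(Ctx);
  llvm::Constant *LHS = llvm::ConstantInt::get(I8, 0);  // plays the role of the resolved %V
  llvm::Constant *RHS = llvm::ConstantInt::get(I8, 0);  // the literal constant operand
  // Fold `icmp eq i8 0, 0`; with two ConstantInts this always produces a constant.
  llvm::Constant *Res = llvm::ConstantFoldCompareInstOperands(
      llvm::CmpInst::ICMP_EQ, LHS, RHS, DL);
  auto *CI = llvm::dyn_cast_or_null<llvm::ConstantInt>(Res);
  // A true (i1 1) result corresponds to taking BR->getSuccessor(0).
  return (CI && CI->isOne()) ? 0 : 1;
}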
@@ -1826,20 +1879,20 @@ namespace {
static coro::Shape splitCoroutine(Function &F,
SmallVectorImpl<Function *> &Clones,
- bool ReuseFrameSlot) {
+ bool OptimizeFrame) {
PrettyStackTraceFunction prettyStackTrace(F);
// The suspend-crossing algorithm in buildCoroutineFrame gets tripped
// up by uses in unreachable blocks, so remove them as a first pass.
removeUnreachableBlocks(F);
- coro::Shape Shape(F, ReuseFrameSlot);
+ coro::Shape Shape(F, OptimizeFrame);
if (!Shape.CoroBegin)
return Shape;
simplifySuspendPoints(Shape);
buildCoroutineFrame(F, Shape);
- replaceFrameSize(Shape);
+ replaceFrameSizeAndAlignment(Shape);
// If there are no suspend points, no split required, just remove
// the allocation and deallocation blocks, they are not needed.
@@ -2165,7 +2218,7 @@ PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C,
F.removeFnAttr(CORO_PRESPLIT_ATTR);
SmallVector<Function *, 4> Clones;
- const coro::Shape Shape = splitCoroutine(F, Clones, ReuseFrameSlot);
+ const coro::Shape Shape = splitCoroutine(F, Clones, OptimizeFrame);
updateCallGraphAfterCoroutineSplit(*N, Shape, Clones, C, CG, AM, UR, FAM);
if (!Shape.CoroSuspends.empty()) {
@@ -2198,13 +2251,13 @@ namespace {
struct CoroSplitLegacy : public CallGraphSCCPass {
static char ID; // Pass identification, replacement for typeid
- CoroSplitLegacy(bool ReuseFrameSlot = false)
- : CallGraphSCCPass(ID), ReuseFrameSlot(ReuseFrameSlot) {
+ CoroSplitLegacy(bool OptimizeFrame = false)
+ : CallGraphSCCPass(ID), OptimizeFrame(OptimizeFrame) {
initializeCoroSplitLegacyPass(*PassRegistry::getPassRegistry());
}
bool Run = false;
- bool ReuseFrameSlot;
+ bool OptimizeFrame;
// A coroutine is identified by the presence of coro.begin intrinsic, if
// we don't have any, this pass has nothing to do.
@@ -2263,7 +2316,7 @@ struct CoroSplitLegacy : public CallGraphSCCPass {
F->removeFnAttr(CORO_PRESPLIT_ATTR);
SmallVector<Function *, 4> Clones;
- const coro::Shape Shape = splitCoroutine(*F, Clones, ReuseFrameSlot);
+ const coro::Shape Shape = splitCoroutine(*F, Clones, OptimizeFrame);
updateCallGraphAfterCoroutineSplit(*F, Shape, Clones, CG, SCC);
if (Shape.ABI == coro::ABI::Async) {
// Restart SCC passes.
@@ -2300,6 +2353,6 @@ INITIALIZE_PASS_END(
"Split coroutine into a set of functions driving its state machine", false,
false)
-Pass *llvm::createCoroSplitLegacyPass(bool ReuseFrameSlot) {
- return new CoroSplitLegacy(ReuseFrameSlot);
+Pass *llvm::createCoroSplitLegacyPass(bool OptimizeFrame) {
+ return new CoroSplitLegacy(OptimizeFrame);
}
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index fba8b03e44ba..965a146c143f 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -123,6 +123,7 @@ Value *coro::LowererBase::makeSubFnCall(Value *Arg, int Index,
static bool isCoroutineIntrinsicName(StringRef Name) {
// NOTE: Must be sorted!
static const char *const CoroIntrinsics[] = {
+ "llvm.coro.align",
"llvm.coro.alloc",
"llvm.coro.async.context.alloc",
"llvm.coro.async.context.dealloc",
@@ -268,6 +269,9 @@ void coro::Shape::buildFrom(Function &F) {
case Intrinsic::coro_size:
CoroSizes.push_back(cast<CoroSizeInst>(II));
break;
+ case Intrinsic::coro_align:
+ CoroAligns.push_back(cast<CoroAlignInst>(II));
+ break;
case Intrinsic::coro_frame:
CoroFrames.push_back(cast<CoroFrameInst>(II));
break;
@@ -672,8 +676,11 @@ static void checkAsyncFuncPointer(const Instruction *I, Value *V) {
if (!AsyncFuncPtrAddr)
fail(I, "llvm.coro.id.async async function pointer not a global", V);
- auto *StructTy =
- cast<StructType>(AsyncFuncPtrAddr->getType()->getPointerElementType());
+ if (AsyncFuncPtrAddr->getType()->isOpaquePointerTy())
+ return;
+
+ auto *StructTy = cast<StructType>(
+ AsyncFuncPtrAddr->getType()->getNonOpaquePointerElementType());
if (StructTy->isOpaque() || !StructTy->isPacked() ||
StructTy->getNumElements() != 2 ||
!StructTy->getElementType(0)->isIntegerTy(32) ||
@@ -697,14 +704,16 @@ void CoroIdAsyncInst::checkWellFormed() const {
static void checkAsyncContextProjectFunction(const Instruction *I,
Function *F) {
auto *FunTy = cast<FunctionType>(F->getValueType());
- if (!FunTy->getReturnType()->isPointerTy() ||
- !FunTy->getReturnType()->getPointerElementType()->isIntegerTy(8))
+ Type *Int8Ty = Type::getInt8Ty(F->getContext());
+ auto *RetPtrTy = dyn_cast<PointerType>(FunTy->getReturnType());
+ if (!RetPtrTy || !RetPtrTy->isOpaqueOrPointeeTypeMatches(Int8Ty))
fail(I,
"llvm.coro.suspend.async resume function projection function must "
"return an i8* type",
F);
if (FunTy->getNumParams() != 1 || !FunTy->getParamType(0)->isPointerTy() ||
- !FunTy->getParamType(0)->getPointerElementType()->isIntegerTy(8))
+ !cast<PointerType>(FunTy->getParamType(0))
+ ->isOpaqueOrPointeeTypeMatches(Int8Ty))
fail(I,
"llvm.coro.suspend.async resume function projection function must "
"take one i8* type as parameter",
@@ -719,8 +728,7 @@ void CoroAsyncEndInst::checkWellFormed() const {
auto *MustTailCallFunc = getMustTailCallFunction();
if (!MustTailCallFunc)
return;
- auto *FnTy =
- cast<FunctionType>(MustTailCallFunc->getType()->getPointerElementType());
+ auto *FnTy = MustTailCallFunc->getFunctionType();
if (FnTy->getNumParams() != (arg_size() - 3))
fail(this,
"llvm.coro.end.async must tail call function argument type must "
diff --git a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
index 01e724e22dcf..a6d9ce1033f3 100644
--- a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
+++ b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
@@ -54,13 +54,13 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
if (F.isPresplitCoroutine())
continue;
- if (!F.isDeclaration() && F.hasFnAttribute(Attribute::AlwaysInline) &&
- isInlineViable(F).isSuccess()) {
+ if (!F.isDeclaration() && isInlineViable(F).isSuccess()) {
Calls.clear();
for (User *U : F.users())
if (auto *CB = dyn_cast<CallBase>(U))
- if (CB->getCalledFunction() == &F)
+ if (CB->getCalledFunction() == &F &&
+ CB->hasFnAttr(Attribute::AlwaysInline))
Calls.insert(CB);
for (CallBase *CB : Calls) {
@@ -92,10 +92,12 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
Changed = true;
}
- // Remember to try and delete this function afterward. This both avoids
- // re-walking the rest of the module and avoids dealing with any iterator
- // invalidation issues while deleting functions.
- InlinedFunctions.push_back(&F);
+ if (F.hasFnAttribute(Attribute::AlwaysInline)) {
+ // Remember to try and delete this function afterward. This both avoids
+ // re-walking the rest of the module and avoids dealing with any
+ // iterator invalidation issues while deleting functions.
+ InlinedFunctions.push_back(&F);
+ }
}
}
@@ -117,7 +119,7 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
if (!InlinedFunctions.empty()) {
// Now we just have the comdat functions. Filter out the ones whose comdats
// are not actually dead.
- filterDeadComdatFunctions(M, InlinedFunctions);
+ filterDeadComdatFunctions(InlinedFunctions);
// The remaining functions are actually dead.
for (Function *F : InlinedFunctions) {
M.getFunctionList().erase(F);
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index 3a42a2cac928..ce3c5153bde2 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -196,8 +196,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
for (const auto &ArgIndex : ArgIndices) {
// not allowed to dereference ->begin() if size() is 0
Params.push_back(GetElementPtrInst::getIndexedType(
- cast<PointerType>(I->getType())->getElementType(),
- ArgIndex.second));
+ I->getType()->getPointerElementType(), ArgIndex.second));
ArgAttrVec.push_back(AttributeSet());
assert(Params.back());
}
@@ -298,7 +297,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
Ops.push_back(ConstantInt::get(IdxTy, II));
// Keep track of the type we're currently indexing.
if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
- ElTy = ElPTy->getElementType();
+ ElTy = ElPTy->getPointerElementType();
else
ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II);
}
@@ -928,7 +927,7 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
SmallPtrSet<Argument *, 8> ArgsToPromote;
SmallPtrSet<Argument *, 8> ByValArgsToTransform;
for (Argument *PtrArg : PointerArgs) {
- Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
+ Type *AgTy = PtrArg->getType()->getPointerElementType();
// Replace sret attribute with noalias. This reduces register pressure by
// avoiding a register copy.
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 7e729e57153c..12b8a0ef9d00 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -202,9 +203,12 @@ bool AA::isDynamicallyUnique(Attributor &A, const AbstractAttribute &QueryingAA,
return NoRecurseAA.isAssumedNoRecurse();
}
-Constant *AA::getInitialValueForObj(Value &Obj, Type &Ty) {
+Constant *AA::getInitialValueForObj(Value &Obj, Type &Ty,
+ const TargetLibraryInfo *TLI) {
if (isa<AllocaInst>(Obj))
return UndefValue::get(&Ty);
+ if (isAllocationFn(&Obj, TLI))
+ return getInitialValueOfAllocation(&cast<CallBase>(Obj), TLI, &Ty);
auto *GV = dyn_cast<GlobalVariable>(&Obj);
if (!GV || !GV->hasLocalLinkage())
return nullptr;
@@ -316,7 +320,8 @@ bool AA::getPotentialCopiesOfStoredValue(
dbgs() << "Underlying object is a valid nullptr, giving up.\n";);
return false;
}
- if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj)) {
+ if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj) &&
+ !isNoAliasCall(Obj)) {
LLVM_DEBUG(dbgs() << "Underlying object is not supported yet: " << *Obj
<< "\n";);
return false;
@@ -741,6 +746,7 @@ void IRPosition::verify() {
assert((CBContext == nullptr) &&
"'call site argument' position must not have CallBaseContext!");
Use *U = getAsUsePtr();
+ (void)U; // Silence unused variable warning.
assert(U && "Expected use for a 'call site argument' position!");
assert(isa<CallBase>(U->getUser()) &&
"Expected call base user for a 'call site argument' position!");
@@ -999,10 +1005,11 @@ bool Attributor::isAssumedDead(const BasicBlock &BB,
return false;
}
-bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred,
- const AbstractAttribute &QueryingAA,
- const Value &V, bool CheckBBLivenessOnly,
- DepClassTy LivenessDepClass) {
+bool Attributor::checkForAllUses(
+ function_ref<bool(const Use &, bool &)> Pred,
+ const AbstractAttribute &QueryingAA, const Value &V,
+ bool CheckBBLivenessOnly, DepClassTy LivenessDepClass,
+ function_ref<bool(const Use &OldU, const Use &NewU)> EquivalentUseCB) {
// Check the trivial case first as it catches void values.
if (V.use_empty())
@@ -1053,8 +1060,15 @@ bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred,
<< PotentialCopies.size()
<< " potential copies instead!\n");
for (Value *PotentialCopy : PotentialCopies)
- for (const Use &U : PotentialCopy->uses())
- Worklist.push_back(&U);
+ for (const Use &CopyUse : PotentialCopy->uses()) {
+ if (EquivalentUseCB && !EquivalentUseCB(*U, CopyUse)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Potential copy was "
+ "rejected by the equivalence call back: "
+ << *CopyUse << "!\n");
+ return false;
+ }
+ Worklist.push_back(&CopyUse);
+ }
continue;
}
}
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index b977821bcaa6..76420783b2d1 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -417,12 +417,10 @@ const Value *stripAndAccumulateMinimalOffsets(
AttributorAnalysis);
}
-static const Value *getMinimalBaseOfAccessPointerOperand(
- Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I,
- int64_t &BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) {
- const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
- if (!Ptr)
- return nullptr;
+static const Value *
+getMinimalBaseOfPointer(Attributor &A, const AbstractAttribute &QueryingAA,
+ const Value *Ptr, int64_t &BytesOffset,
+ const DataLayout &DL, bool AllowNonInbounds = false) {
APInt OffsetAPInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
const Value *Base = stripAndAccumulateMinimalOffsets(
A, QueryingAA, Ptr, DL, OffsetAPInt, AllowNonInbounds);
@@ -431,18 +429,6 @@ static const Value *getMinimalBaseOfAccessPointerOperand(
return Base;
}
-static const Value *
-getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset,
- const DataLayout &DL,
- bool AllowNonInbounds = false) {
- const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
- if (!Ptr)
- return nullptr;
-
- return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL,
- AllowNonInbounds);
-}
-
/// Clamp the information known for all returned values of a function
/// (identified by \p QueryingAA) into \p S.
template <typename AAType, typename StateType = typename AAType::StateType>
@@ -810,14 +796,17 @@ struct AA::PointerInfo::OffsetAndSize : public std::pair<int64_t, int64_t> {
int64_t getSize() const { return second; }
static OffsetAndSize getUnknown() { return OffsetAndSize(Unknown, Unknown); }
+ /// Return true if the offset or size is unknown.
+ bool offsetOrSizeAreUnknown() const {
+ return getOffset() == OffsetAndSize::Unknown ||
+ getSize() == OffsetAndSize::Unknown;
+ }
+
/// Return true if this offset and size pair might describe an address that
/// overlaps with \p OAS.
bool mayOverlap(const OffsetAndSize &OAS) const {
// Any unknown value and we are giving up -> overlap.
- if (OAS.getOffset() == OffsetAndSize::Unknown ||
- OAS.getSize() == OffsetAndSize::Unknown ||
- getOffset() == OffsetAndSize::Unknown ||
- getSize() == OffsetAndSize::Unknown)
+ if (offsetOrSizeAreUnknown() || OAS.offsetOrSizeAreUnknown())
return true;
// Check if one offset point is in the other interval [offset, offset+size].
@@ -1024,8 +1013,9 @@ protected:
OffsetAndSize ItOAS = It.getFirst();
if (!OAS.mayOverlap(ItOAS))
continue;
+ bool IsExact = OAS == ItOAS && !OAS.offsetOrSizeAreUnknown();
for (auto &Access : It.getSecond())
- if (!CB(Access, OAS == ItOAS))
+ if (!CB(Access, IsExact))
return false;
}
return true;
@@ -1161,27 +1151,34 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
return true;
};
+ const auto *TLI = getAnchorScope()
+ ? A.getInfoCache().getTargetLibraryInfoForFunction(
+ *getAnchorScope())
+ : nullptr;
auto UsePred = [&](const Use &U, bool &Follow) -> bool {
Value *CurPtr = U.get();
User *Usr = U.getUser();
LLVM_DEBUG(dbgs() << "[AAPointerInfo] Analyze " << *CurPtr << " in "
<< *Usr << "\n");
-
- OffsetInfo &PtrOI = OffsetInfoMap[CurPtr];
+ assert(OffsetInfoMap.count(CurPtr) &&
+ "The current pointer offset should have been seeded!");
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Usr)) {
if (CE->isCast())
- return HandlePassthroughUser(Usr, PtrOI, Follow);
+ return HandlePassthroughUser(Usr, OffsetInfoMap[CurPtr], Follow);
if (CE->isCompare())
return true;
- if (!CE->isGEPWithNoNotionalOverIndexing()) {
+ if (!isa<GEPOperator>(CE)) {
LLVM_DEBUG(dbgs() << "[AAPointerInfo] Unhandled constant user " << *CE
<< "\n");
return false;
}
}
if (auto *GEP = dyn_cast<GEPOperator>(Usr)) {
+ // Note the order here: the Usr access might change the map; CurPtr is
+ // already in it, though.
OffsetInfo &UsrOI = OffsetInfoMap[Usr];
+ OffsetInfo &PtrOI = OffsetInfoMap[CurPtr];
UsrOI = PtrOI;
// TODO: Use range information.
@@ -1205,19 +1202,22 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
}
UsrOI.Offset = PtrOI.Offset +
DL.getIndexedOffsetInType(
- CurPtr->getType()->getPointerElementType(), Indices);
+ GEP->getSourceElementType(), Indices);
Follow = true;
return true;
}
if (isa<CastInst>(Usr) || isa<SelectInst>(Usr))
- return HandlePassthroughUser(Usr, PtrOI, Follow);
+ return HandlePassthroughUser(Usr, OffsetInfoMap[CurPtr], Follow);
// For PHIs we need to take care of the recurrence explicitly as the value
// might change while we iterate through a loop. For now, we give up if
// the PHI is not invariant.
if (isa<PHINode>(Usr)) {
- // Check if the PHI is invariant (so far).
+ // Note the order here: the Usr access might change the map; CurPtr is
+ // already in it, though.
OffsetInfo &UsrOI = OffsetInfoMap[Usr];
+ OffsetInfo &PtrOI = OffsetInfoMap[CurPtr];
+ // Check if the PHI is invariant (so far).
if (UsrOI == PtrOI)
return true;
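The two "Note the order here" comments above guard against a DenseMap reference-invalidation pitfall. Here is a tiny standalone sketch (the map and keys are purely illustrative) of why the possibly-inserting lookup has to come first:

#include "llvm/ADT/DenseMap.h"

void orderMatters() {
  llvm::DenseMap<int, int> OffsetInfoMap;
  OffsetInfoMap[1] = 42;            // the CurPtr entry is already present (seeded earlier)
  // Take the lookup that may insert -- and therefore rehash -- first ...
  int &UsrOI = OffsetInfoMap[2];
  // ... then take the reference to the pre-existing entry; this lookup does
  // not insert, so neither reference is invalidated by a rehash.
  int &PtrOI = OffsetInfoMap[1];
  UsrOI = PtrOI;                    // safe in this order
}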
@@ -1257,8 +1257,8 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
if (auto *LoadI = dyn_cast<LoadInst>(Usr))
return handleAccess(A, *LoadI, *CurPtr, /* Content */ nullptr,
- AccessKind::AK_READ, PtrOI.Offset, Changed,
- LoadI->getType());
+ AccessKind::AK_READ, OffsetInfoMap[CurPtr].Offset,
+ Changed, LoadI->getType());
if (auto *StoreI = dyn_cast<StoreInst>(Usr)) {
if (StoreI->getValueOperand() == CurPtr) {
LLVM_DEBUG(dbgs() << "[AAPointerInfo] Escaping use in store "
@@ -1269,18 +1269,21 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
Optional<Value *> Content = A.getAssumedSimplified(
*StoreI->getValueOperand(), *this, UsedAssumedInformation);
return handleAccess(A, *StoreI, *CurPtr, Content, AccessKind::AK_WRITE,
- PtrOI.Offset, Changed,
+ OffsetInfoMap[CurPtr].Offset, Changed,
StoreI->getValueOperand()->getType());
}
if (auto *CB = dyn_cast<CallBase>(Usr)) {
if (CB->isLifetimeStartOrEnd())
return true;
+ if (TLI && isFreeCall(CB, TLI))
+ return true;
if (CB->isArgOperand(&U)) {
unsigned ArgNo = CB->getArgOperandNo(&U);
const auto &CSArgPI = A.getAAFor<AAPointerInfo>(
*this, IRPosition::callsite_argument(*CB, ArgNo),
DepClassTy::REQUIRED);
- Changed = translateAndAddCalleeState(A, CSArgPI, PtrOI.Offset, *CB) |
+ Changed = translateAndAddCalleeState(
+ A, CSArgPI, OffsetInfoMap[CurPtr].Offset, *CB) |
Changed;
return true;
}
@@ -1293,8 +1296,15 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
LLVM_DEBUG(dbgs() << "[AAPointerInfo] User not handled " << *Usr << "\n");
return false;
};
+ auto EquivalentUseCB = [&](const Use &OldU, const Use &NewU) {
+ if (OffsetInfoMap.count(NewU))
+ return OffsetInfoMap[NewU] == OffsetInfoMap[OldU];
+ OffsetInfoMap[NewU] = OffsetInfoMap[OldU];
+ return true;
+ };
if (!A.checkForAllUses(UsePred, *this, AssociatedValue,
- /* CheckBBLivenessOnly */ true))
+ /* CheckBBLivenessOnly */ true, DepClassTy::OPTIONAL,
+ EquivalentUseCB))
return indicatePessimisticFixpoint();
LLVM_DEBUG({
@@ -2127,31 +2137,26 @@ static int64_t getKnownNonNullAndDerefBytesForUse(
return DerefAA.getKnownDereferenceableBytes();
}
+ Optional<MemoryLocation> Loc = MemoryLocation::getOrNone(I);
+ if (!Loc || Loc->Ptr != UseV || !Loc->Size.isPrecise() || I->isVolatile())
+ return 0;
+
int64_t Offset;
const Value *Base =
- getMinimalBaseOfAccessPointerOperand(A, QueryingAA, I, Offset, DL);
- if (Base) {
- if (Base == &AssociatedValue &&
- getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- int64_t DerefBytes =
- (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset;
-
- IsNonNull |= !NullPointerIsDefined;
- return std::max(int64_t(0), DerefBytes);
- }
+ getMinimalBaseOfPointer(A, QueryingAA, Loc->Ptr, Offset, DL);
+ if (Base && Base == &AssociatedValue) {
+ int64_t DerefBytes = Loc->Size.getValue() + Offset;
+ IsNonNull |= !NullPointerIsDefined;
+ return std::max(int64_t(0), DerefBytes);
}
/// Corner case when an offset is 0.
- Base = getBasePointerOfAccessPointerOperand(I, Offset, DL,
- /*AllowNonInbounds*/ true);
- if (Base) {
- if (Offset == 0 && Base == &AssociatedValue &&
- getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- int64_t DerefBytes =
- (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType());
- IsNonNull |= !NullPointerIsDefined;
- return std::max(int64_t(0), DerefBytes);
- }
+ Base = GetPointerBaseWithConstantOffset(Loc->Ptr, Offset, DL,
+ /*AllowNonInbounds*/ true);
+ if (Base && Base == &AssociatedValue && Offset == 0) {
+ int64_t DerefBytes = Loc->Size.getValue();
+ IsNonNull |= !NullPointerIsDefined;
+ return std::max(int64_t(0), DerefBytes);
}
return 0;
@@ -2325,6 +2330,8 @@ struct AANoRecurseFunction final : AANoRecurseImpl {
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
AANoRecurseImpl::initialize(A);
+ // TODO: We should build a call graph ourselves to enable this in the module
+ // pass as well.
if (const Function *F = getAnchorScope())
if (A.getInfoCache().getSccSize(*F) != 1)
indicatePessimisticFixpoint();
@@ -4057,17 +4064,15 @@ struct AADereferenceableImpl : AADereferenceable {
if (!UseV->getType()->isPointerTy())
return;
- Type *PtrTy = UseV->getType();
- const DataLayout &DL = A.getDataLayout();
+ Optional<MemoryLocation> Loc = MemoryLocation::getOrNone(I);
+ if (!Loc || Loc->Ptr != UseV || !Loc->Size.isPrecise() || I->isVolatile())
+ return;
+
int64_t Offset;
- if (const Value *Base = getBasePointerOfAccessPointerOperand(
- I, Offset, DL, /*AllowNonInbounds*/ true)) {
- if (Base == &getAssociatedValue() &&
- getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType());
- State.addAccessedBytes(Offset, Size);
- }
- }
+ const Value *Base = GetPointerBaseWithConstantOffset(
+ Loc->Ptr, Offset, A.getDataLayout(), /*AllowNonInbounds*/ true);
+ if (Base && Base == &getAssociatedValue())
+ State.addAccessedBytes(Offset, Loc->Size.getValue());
}
/// See followUsesInMBEC
@@ -5236,6 +5241,8 @@ struct AAValueSimplifyImpl : AAValueSimplify {
if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, AA, &L))
return false;
+ const auto *TLI =
+ A.getInfoCache().getTargetLibraryInfoForFunction(*L.getFunction());
for (Value *Obj : Objects) {
LLVM_DEBUG(dbgs() << "Visit underlying object " << *Obj << "\n");
if (isa<UndefValue>(Obj))
@@ -5250,9 +5257,7 @@ struct AAValueSimplifyImpl : AAValueSimplify {
continue;
return false;
}
- if (!isa<AllocaInst>(Obj) && !isa<GlobalVariable>(Obj))
- return false;
- Constant *InitialVal = AA::getInitialValueForObj(*Obj, *L.getType());
+ Constant *InitialVal = AA::getInitialValueForObj(*Obj, *L.getType(), TLI);
if (!InitialVal || !Union(*InitialVal))
return false;
@@ -5745,13 +5750,6 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
/// The call that allocates the memory.
CallBase *const CB;
- /// The kind of allocation.
- const enum class AllocationKind {
- MALLOC,
- CALLOC,
- ALIGNED_ALLOC,
- } Kind;
-
/// The library function id for the allocation.
LibFunc LibraryFunctionId = NotLibFunc;
@@ -5808,20 +5806,17 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
DeallocationInfos[CB] = new (A.Allocator) DeallocationInfo{CB};
return true;
}
- bool IsMalloc = isMallocLikeFn(CB, TLI);
- bool IsAlignedAllocLike = !IsMalloc && isAlignedAllocLikeFn(CB, TLI);
- bool IsCalloc =
- !IsMalloc && !IsAlignedAllocLike && isCallocLikeFn(CB, TLI);
- if (!IsMalloc && !IsAlignedAllocLike && !IsCalloc)
- return true;
- auto Kind =
- IsMalloc ? AllocationInfo::AllocationKind::MALLOC
- : (IsCalloc ? AllocationInfo::AllocationKind::CALLOC
- : AllocationInfo::AllocationKind::ALIGNED_ALLOC);
-
- AllocationInfo *AI = new (A.Allocator) AllocationInfo{CB, Kind};
- AllocationInfos[CB] = AI;
- TLI->getLibFunc(*CB, AI->LibraryFunctionId);
+ // To do heap to stack, we need to know that the allocation itself is
+ // removable once uses are rewritten, and that we can initialize the
+ // alloca to the same pattern as the original allocation result.
+ if (isAllocationFn(CB, TLI) && isAllocRemovable(CB, TLI)) {
+ auto *I8Ty = Type::getInt8Ty(CB->getParent()->getContext());
+ if (nullptr != getInitialValueOfAllocation(CB, TLI, I8Ty)) {
+ AllocationInfo *AI = new (A.Allocator) AllocationInfo{CB};
+ AllocationInfos[CB] = AI;
+ TLI->getLibFunc(*CB, AI->LibraryFunctionId);
+ }
+ }
return true;
};
@@ -5917,21 +5912,22 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
Optional<APInt> SizeAPI = getSize(A, *this, AI);
if (SizeAPI.hasValue()) {
Size = ConstantInt::get(AI.CB->getContext(), *SizeAPI);
- } else if (AI.Kind == AllocationInfo::AllocationKind::CALLOC) {
- auto *Num = AI.CB->getOperand(0);
- auto *SizeT = AI.CB->getOperand(1);
- IRBuilder<> B(AI.CB);
- Size = B.CreateMul(Num, SizeT, "h2s.calloc.size");
- } else if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) {
- Size = AI.CB->getOperand(1);
} else {
- Size = AI.CB->getOperand(0);
+ LLVMContext &Ctx = AI.CB->getContext();
+ auto &DL = A.getInfoCache().getDL();
+ ObjectSizeOpts Opts;
+ ObjectSizeOffsetEvaluator Eval(DL, TLI, Ctx, Opts);
+ SizeOffsetEvalType SizeOffsetPair = Eval.compute(AI.CB);
+ assert(SizeOffsetPair != ObjectSizeOffsetEvaluator::unknown() &&
+ cast<ConstantInt>(SizeOffsetPair.second)->isZero());
+ Size = SizeOffsetPair.first;
}
Align Alignment(1);
- if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC) {
- Optional<APInt> AlignmentAPI =
- getAPInt(A, *this, *AI.CB->getArgOperand(0));
+ if (MaybeAlign RetAlign = AI.CB->getRetAlign())
+ Alignment = max(Alignment, RetAlign);
+ if (Value *Align = getAllocAlignment(AI.CB, TLI)) {
+ Optional<APInt> AlignmentAPI = getAPInt(A, *this, *Align);
assert(AlignmentAPI.hasValue() &&
"Expected an alignment during manifest!");
Alignment =
@@ -5947,6 +5943,11 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
Alloca = new BitCastInst(Alloca, AI.CB->getType(), "malloc_bc",
Alloca->getNextNode());
+ auto *I8Ty = Type::getInt8Ty(F->getContext());
+ auto *InitVal = getInitialValueOfAllocation(AI.CB, TLI, I8Ty);
+ assert(InitVal &&
+ "Must be able to materialize initial memory state of allocation");
+
A.changeValueAfterManifest(*AI.CB, *Alloca);
if (auto *II = dyn_cast<InvokeInst>(AI.CB)) {
@@ -5957,18 +5958,13 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
A.deleteAfterManifest(*AI.CB);
}
- // Zero out the allocated memory if it was a calloc.
- if (AI.Kind == AllocationInfo::AllocationKind::CALLOC) {
- auto *BI = new BitCastInst(Alloca, AI.CB->getType(), "calloc_bc",
- Alloca->getNextNode());
- Value *Ops[] = {
- BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size,
- ConstantInt::get(Type::getInt1Ty(F->getContext()), false)};
-
- Type *Tys[] = {BI->getType(), AI.CB->getOperand(0)->getType()};
- Module *M = F->getParent();
- Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys);
- CallInst::Create(Fn, Ops, "", BI->getNextNode());
+ // Initialize the alloca with the same value as used by the allocation
+ // function. We can skip undef as the initial value of an alloc is
+ // undef, and the memset would simply end up being DSEd.
+ if (!isa<UndefValue>(InitVal)) {
+ IRBuilder<> Builder(Alloca->getNextNode());
+ // TODO: Use alignment above if align!=1
+ Builder.CreateMemSet(Alloca, InitVal, Size, None);
}
HasChanged = ChangeStatus::CHANGED;
}
@@ -5990,25 +5986,18 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
Optional<APInt> getSize(Attributor &A, const AbstractAttribute &AA,
AllocationInfo &AI) {
+ auto Mapper = [&](const Value *V) -> const Value * {
+ bool UsedAssumedInformation = false;
+ if (Optional<Constant *> SimpleV =
+ A.getAssumedConstant(*V, AA, UsedAssumedInformation))
+ if (*SimpleV)
+ return *SimpleV;
+ return V;
+ };
- if (AI.Kind == AllocationInfo::AllocationKind::MALLOC)
- return getAPInt(A, AA, *AI.CB->getArgOperand(0));
-
- if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC)
- // Only if the alignment is also constant we return a size.
- return getAPInt(A, AA, *AI.CB->getArgOperand(0)).hasValue()
- ? getAPInt(A, AA, *AI.CB->getArgOperand(1))
- : llvm::None;
-
- assert(AI.Kind == AllocationInfo::AllocationKind::CALLOC &&
- "Expected only callocs are left");
- Optional<APInt> Num = getAPInt(A, AA, *AI.CB->getArgOperand(0));
- Optional<APInt> Size = getAPInt(A, AA, *AI.CB->getArgOperand(1));
- if (!Num.hasValue() || !Size.hasValue())
- return llvm::None;
- bool Overflow = false;
- Size = Size.getValue().umul_ov(Num.getValue(), Overflow);
- return Overflow ? llvm::None : Size;
+ const Function *F = getAnchorScope();
+ const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
+ return getAllocSize(AI.CB, TLI, Mapper);
}
/// Collection of all malloc-like calls in a function with associated
@@ -6025,6 +6014,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) {
ChangeStatus Changed = ChangeStatus::UNCHANGED;
const Function *F = getAnchorScope();
+ const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
const auto &LivenessAA =
A.getAAFor<AAIsDead>(*this, IRPosition::function(*F), DepClassTy::NONE);
@@ -6239,22 +6229,24 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) {
if (AI.Status == AllocationInfo::INVALID)
continue;
- if (MaxHeapToStackSize == -1) {
- if (AI.Kind == AllocationInfo::AllocationKind::ALIGNED_ALLOC)
- if (!getAPInt(A, *this, *AI.CB->getArgOperand(0)).hasValue()) {
- LLVM_DEBUG(dbgs() << "[H2S] Unknown allocation alignment: " << *AI.CB
- << "\n");
- AI.Status = AllocationInfo::INVALID;
- Changed = ChangeStatus::CHANGED;
- continue;
- }
- } else {
+ if (Value *Align = getAllocAlignment(AI.CB, TLI)) {
+ if (!getAPInt(A, *this, *Align)) {
+ // Can't generate an alloca which respects the required alignment
+ // on the allocation.
+ LLVM_DEBUG(dbgs() << "[H2S] Unknown allocation alignment: " << *AI.CB
+ << "\n");
+ AI.Status = AllocationInfo::INVALID;
+ Changed = ChangeStatus::CHANGED;
+ continue;
+ }
+ }
+
+ if (MaxHeapToStackSize != -1) {
Optional<APInt> Size = getSize(A, *this, AI);
if (!Size.hasValue() || Size.getValue().ugt(MaxHeapToStackSize)) {
LLVM_DEBUG({
if (!Size.hasValue())
- dbgs() << "[H2S] Unknown allocation size (or alignment): " << *AI.CB
- << "\n";
+ dbgs() << "[H2S] Unknown allocation size: " << *AI.CB << "\n";
else
dbgs() << "[H2S] Allocation size too large: " << *AI.CB << " vs. "
<< MaxHeapToStackSize << "\n";
@@ -6637,9 +6629,10 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
IRBuilder<NoFolder> IRB(IP);
const DataLayout &DL = IP->getModule()->getDataLayout();
- if (Base->getType()->getPointerElementType() != PrivType)
- Base = BitCastInst::CreateBitOrPointerCast(Base, PrivType->getPointerTo(),
- "", ACS.getInstruction());
+ Type *PrivPtrType = PrivType->getPointerTo();
+ if (Base->getType() != PrivPtrType)
+ Base = BitCastInst::CreateBitOrPointerCast(Base, PrivPtrType, "",
+ ACS.getInstruction());
// Traverse the type, build GEPs and loads.
if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
@@ -6781,7 +6774,7 @@ struct AAPrivatizablePtrFloating : public AAPrivatizablePtrImpl {
if (auto *AI = dyn_cast<AllocaInst>(Obj))
if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
if (CI->isOne())
- return Obj->getType()->getPointerElementType();
+ return AI->getAllocatedType();
if (auto *Arg = dyn_cast<Argument>(Obj)) {
auto &PrivArgAA = A.getAAFor<AAPrivatizablePtr>(
*this, IRPosition::argument(*Arg), DepClassTy::REQUIRED);
@@ -7675,7 +7668,6 @@ void AAMemoryLocationImpl::categorizePtrValue(
for (Value *Obj : Objects) {
// TODO: recognize the TBAA used for constant accesses.
MemoryLocationsKind MLK = NO_LOCATIONS;
- assert(!isa<GEPOperator>(Obj) && "GEPs should have been stripped.");
if (isa<UndefValue>(Obj))
continue;
if (isa<Argument>(Obj)) {
@@ -8485,13 +8477,30 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
/* UseValueSimplify */ false))
return indicatePessimisticFixpoint();
- return clampStateAndIndicateChange(getState(), T);
+ // Ensure that long def-use chains can't cause circular reasoning either by
+ // introducing a cutoff below.
+ if (clampStateAndIndicateChange(getState(), T) == ChangeStatus::UNCHANGED)
+ return ChangeStatus::UNCHANGED;
+ if (++NumChanges > MaxNumChanges) {
+ LLVM_DEBUG(dbgs() << "[AAValueConstantRange] performed " << NumChanges
+ << " but only " << MaxNumChanges
+ << " are allowed to avoid cyclic reasoning.");
+ return indicatePessimisticFixpoint();
+ }
+ return ChangeStatus::CHANGED;
}
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override {
STATS_DECLTRACK_FLOATING_ATTR(value_range)
}
+
+ /// Tracker to bail after too many widening steps of the constant range.
+ int NumChanges = 0;
+
+ /// Upper bound for the number of allowed changes (=widening steps) for the
+ /// constant range before we give up.
+ static constexpr int MaxNumChanges = 5;
};
struct AAValueConstantRangeFunction : AAValueConstantRangeImpl {
diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index fb9ab7954e36..2a6e38b0437f 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -287,7 +287,8 @@ bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) {
SmallVector<unsigned, 8> UnusedArgs;
bool Changed = false;
- AttrBuilder UBImplyingAttributes = AttributeFuncs::getUBImplyingAttributes();
+ AttributeMask UBImplyingAttributes =
+ AttributeFuncs::getUBImplyingAttributes();
for (Argument &Arg : Fn.args()) {
if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() &&
!Arg.hasPassPointeeByValueCopyAttr()) {
@@ -838,7 +839,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
assert(NRetTy && "No new return type found?");
// The existing function return attributes.
- AttrBuilder RAttrs(PAL.getRetAttrs());
+ AttrBuilder RAttrs(F->getContext(), PAL.getRetAttrs());
// Remove any incompatible attributes, but only if we removed all return
// values. Otherwise, ensure that we don't have any conflicting attributes
@@ -889,7 +890,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// Adjust the call return attributes in case the function was changed to
// return void.
- AttrBuilder RAttrs(CallPAL.getRetAttrs());
+ AttrBuilder RAttrs(F->getContext(), CallPAL.getRetAttrs());
RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
@@ -912,7 +913,7 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// this is not an expected case anyway
ArgAttrVec.push_back(AttributeSet::get(
F->getContext(),
- AttrBuilder(Attrs).removeAttribute(Attribute::Returned)));
+ AttrBuilder(F->getContext(), Attrs).removeAttribute(Attribute::Returned)));
} else {
// Otherwise, use the original attributes.
ArgAttrVec.push_back(Attrs);
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index 321d4a19a585..213a998d5bba 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -133,7 +133,7 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
if (AliasAnalysis::onlyReadsMemory(MRB))
return MAK_ReadOnly;
- if (AliasAnalysis::doesNotReadMemory(MRB))
+ if (AliasAnalysis::onlyWritesMemory(MRB))
return MAK_WriteOnly;
// Conservatively assume it reads and writes to memory.
@@ -295,13 +295,13 @@ static void addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter,
// No change.
continue;
- if (F->doesNotReadMemory() && WritesMemory)
+ if (F->onlyWritesMemory() && WritesMemory)
continue;
Changed.insert(F);
// Clear out any existing attributes.
- AttrBuilder AttrsToRemove;
+ AttributeMask AttrsToRemove;
AttrsToRemove.addAttribute(Attribute::ReadOnly);
AttrsToRemove.addAttribute(Attribute::ReadNone);
AttrsToRemove.addAttribute(Attribute::WriteOnly);
@@ -720,10 +720,16 @@ determinePointerAccessAttrs(Argument *A,
// The accessors used on call site here do the right thing for calls and
// invokes with operand bundles.
- if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex))
- return Attribute::None;
- if (!CB.doesNotAccessMemory(UseIndex))
+ if (CB.doesNotAccessMemory(UseIndex)) {
+ /* nop */
+ } else if (CB.onlyReadsMemory() || CB.onlyReadsMemory(UseIndex)) {
IsRead = true;
+ } else if (CB.hasFnAttr(Attribute::WriteOnly) ||
+ CB.dataOperandHasImpliedAttr(UseIndex, Attribute::WriteOnly)) {
+ IsWrite = true;
+ } else {
+ return Attribute::None;
+ }
break;
}
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 2425646455bd..6c3cc3914337 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -6,15 +6,24 @@
//
//===----------------------------------------------------------------------===//
//
-// This specialises functions with constant parameters (e.g. functions,
-// globals). Constant parameters like function pointers and constant globals
-// are propagated to the callee by specializing the function.
+// This specialises functions with constant parameters. Constant parameters
+// like function pointers and constant globals are propagated to the callee by
+// specializing the function. The main benefit of this pass at the moment is
+// that indirect calls are transformed into direct calls, which provides inline
+// opportunities that the inliner would not have been able to achieve. That's
+// why function specialisation is run before the inliner in the optimisation
+// pipeline; that is by design. Otherwise, we would only benefit from constant
+// passing, which is a valid use-case too, but hasn't been explored much in
+// terms of performance uplifts, cost-model and compile-time impact.
//
// Current limitations:
-// - It does not yet handle integer ranges.
+// - It does not yet handle integer ranges. We do support "literal constants",
+// but that's off by default under an option.
// - Only 1 argument per function is specialised,
-// - The cost-model could be further looked into,
-// - We are not yet caching analysis results.
+// - The cost-model could be further looked into (it mainly focuses on inlining
+// benefits),
+// - We are not yet caching analysis results, but profiling and checking where
+// extra compile time is spent didn't suggest this to be a problem.
//
// Ideas:
// - With a function specialization attribute for arguments, we could have
@@ -30,8 +39,12 @@
// https://reviews.llvm.org/D106426 for details. Perhaps there is a
// compile-time friendlier way to control/limit the number of specialisations
// for recursive functions.
-// - Don't transform the function if there is no function specialization
-// happens.
+// - Don't transform the function if function specialization does not trigger;
+// the SCCPSolver may make IR changes.
+//
+// References:
+// - 2021 LLVM Dev Mtg “Introducing function specialisation, and can we enable
+// it by default?”, https://www.youtube.com/watch?v=zJiCjeXgV5Q
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index b1f3ff15c97b..d3cac3efce86 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -303,11 +303,11 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
else if (auto *GEP = dyn_cast<GEPOperator>(U))
append_range(WorkList, GEP->users());
else if (auto *LI = dyn_cast<LoadInst>(U)) {
- // A load from zeroinitializer is always zeroinitializer, regardless of
- // any applied offset.
+ // A load from a uniform value is always the same, regardless of any
+ // applied offset.
Type *Ty = LI->getType();
- if (Init->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) {
- LI->replaceAllUsesWith(Constant::getNullValue(Ty));
+ if (Constant *Res = ConstantFoldLoadFromUniformValue(Init, Ty)) {
+ LI->replaceAllUsesWith(Res);
EraseFromParent(LI);
continue;
}
@@ -337,107 +337,68 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
return Changed;
}
-static bool isSafeSROAElementUse(Value *V);
-
-/// Return true if the specified GEP is a safe user of a derived
-/// expression from a global that we want to SROA.
-static bool isSafeSROAGEP(User *U) {
- // Check to see if this ConstantExpr GEP is SRA'able. In particular, we
- // don't like < 3 operand CE's, and we don't like non-constant integer
- // indices. This enforces that all uses are 'gep GV, 0, C, ...' for some
- // value of C.
- if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) ||
- !cast<Constant>(U->getOperand(1))->isNullValue())
- return false;
-
- gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U);
- ++GEPI; // Skip over the pointer index.
-
- // For all other level we require that the indices are constant and inrange.
- // In particular, consider: A[0][i]. We cannot know that the user isn't doing
- // invalid things like allowing i to index an out-of-range subscript that
- // accesses A[1]. This can also happen between different members of a struct
- // in llvm IR.
- for (; GEPI != E; ++GEPI) {
- if (GEPI.isStruct())
+/// Look at all uses of the global and determine which (offset, type) pairs it
+/// can be split into.
+static bool collectSRATypes(DenseMap<uint64_t, Type *> &Types, GlobalValue *GV,
+ const DataLayout &DL) {
+ SmallVector<Use *, 16> Worklist;
+ SmallPtrSet<Use *, 16> Visited;
+ auto AppendUses = [&](Value *V) {
+ for (Use &U : V->uses())
+ if (Visited.insert(&U).second)
+ Worklist.push_back(&U);
+ };
+ AppendUses(GV);
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
+ User *V = U->getUser();
+ if (isa<BitCastOperator>(V) || isa<AddrSpaceCastOperator>(V)) {
+ AppendUses(V);
continue;
+ }
- ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
- if (!IdxVal || (GEPI.isBoundedSequential() &&
- IdxVal->getZExtValue() >= GEPI.getSequentialNumElements()))
- return false;
- }
-
- return llvm::all_of(U->users(), isSafeSROAElementUse);
-}
-
-/// Return true if the specified instruction is a safe user of a derived
-/// expression from a global that we want to SROA.
-static bool isSafeSROAElementUse(Value *V) {
- // We might have a dead and dangling constant hanging off of here.
- if (Constant *C = dyn_cast<Constant>(V))
- return isSafeToDestroyConstant(C);
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
+ if (auto *GEP = dyn_cast<GEPOperator>(V)) {
+ if (!GEP->hasAllConstantIndices())
+ return false;
+ AppendUses(V);
+ continue;
+ }
- // Loads are ok.
- if (isa<LoadInst>(I)) return true;
+ if (Value *Ptr = getLoadStorePointerOperand(V)) {
+ // This is storing the global address into somewhere, not storing into
+ // the global.
+ if (isa<StoreInst>(V) && U->getOperandNo() == 0)
+ return false;
- // Stores *to* the pointer are ok.
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getOperand(0) != V;
+ APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+ Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset,
+ /* AllowNonInbounds */ true);
+ if (Ptr != GV || Offset.getActiveBits() >= 64)
+ return false;
- // Otherwise, it must be a GEP. Check it and its users are safe to SRA.
- return isa<GetElementPtrInst>(I) && isSafeSROAGEP(I);
-}
+ // TODO: We currently require that all accesses at a given offset use
+ // the same type. This could be relaxed.
+ Type *Ty = getLoadStoreType(V);
+ auto It = Types.try_emplace(Offset.getZExtValue(), Ty).first;
+ if (Ty != It->second)
+ return false;
+ continue;
+ }
-/// Look at all uses of the global and decide whether it is safe for us to
-/// perform this transformation.
-static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
- for (User *U : GV->users()) {
- // The user of the global must be a GEP Inst or a ConstantExpr GEP.
- if (!isa<GetElementPtrInst>(U) &&
- (!isa<ConstantExpr>(U) ||
- cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr))
- return false;
+ // Ignore dead constant users.
+ if (auto *C = dyn_cast<Constant>(V)) {
+ if (!isSafeToDestroyConstant(C))
+ return false;
+ continue;
+ }
- // Check the gep and it's users are safe to SRA
- if (!isSafeSROAGEP(U))
- return false;
+ // Unknown user.
+ return false;
}
return true;
}
-static bool IsSRASequential(Type *T) {
- return isa<ArrayType>(T) || isa<VectorType>(T);
-}
-static uint64_t GetSRASequentialNumElements(Type *T) {
- if (ArrayType *AT = dyn_cast<ArrayType>(T))
- return AT->getNumElements();
- return cast<FixedVectorType>(T)->getNumElements();
-}
-static Type *GetSRASequentialElementType(Type *T) {
- if (ArrayType *AT = dyn_cast<ArrayType>(T))
- return AT->getElementType();
- return cast<VectorType>(T)->getElementType();
-}
-static bool CanDoGlobalSRA(GlobalVariable *GV) {
- Constant *Init = GV->getInitializer();
-
- if (isa<StructType>(Init->getType())) {
- // nothing to check
- } else if (IsSRASequential(Init->getType())) {
- if (GetSRASequentialNumElements(Init->getType()) > 16 &&
- GV->hasNUsesOrMore(16))
- return false; // It's not worth it.
- } else
- return false;
-
- return GlobalUsersSafeToSRA(GV);
-}
-
/// Copy over the debug info for a variable to its SRA replacements.
static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV,
uint64_t FragmentOffsetInBits,
@@ -468,161 +429,140 @@ static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV,
/// transformation is safe already. We return the first global variable we
/// insert so that the caller can reprocess it.
static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
- // Make sure this global only has simple uses that we can SRA.
- if (!CanDoGlobalSRA(GV))
+ assert(GV->hasLocalLinkage());
+
+ // Collect types to split into.
+ DenseMap<uint64_t, Type *> Types;
+ if (!collectSRATypes(Types, GV, DL) || Types.empty())
return nullptr;
- assert(GV->hasLocalLinkage());
- Constant *Init = GV->getInitializer();
- Type *Ty = Init->getType();
- uint64_t VarSize = DL.getTypeSizeInBits(Ty);
+ // Make sure we don't SRA back to the same type.
+ if (Types.size() == 1 && Types.begin()->second == GV->getValueType())
+ return nullptr;
- std::map<unsigned, GlobalVariable *> NewGlobals;
+ // Don't perform SRA if we would have to split into many globals.
+ if (Types.size() > 16)
+ return nullptr;
- // Get the alignment of the global, either explicit or target-specific.
- Align StartAlignment =
- DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getType());
-
- // Loop over all users and create replacement variables for used aggregate
- // elements.
- for (User *GEP : GV->users()) {
- assert(((isa<ConstantExpr>(GEP) && cast<ConstantExpr>(GEP)->getOpcode() ==
- Instruction::GetElementPtr) ||
- isa<GetElementPtrInst>(GEP)) &&
- "NonGEP CE's are not SRAable!");
-
- // Ignore the 1th operand, which has to be zero or else the program is quite
- // broken (undefined). Get the 2nd operand, which is the structure or array
- // index.
- unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
- if (NewGlobals.count(ElementIdx) == 1)
- continue; // we`ve already created replacement variable
- assert(NewGlobals.count(ElementIdx) == 0);
-
- Type *ElTy = nullptr;
- if (StructType *STy = dyn_cast<StructType>(Ty))
- ElTy = STy->getElementType(ElementIdx);
- else
- ElTy = GetSRASequentialElementType(Ty);
- assert(ElTy);
+ // Sort by offset.
+ SmallVector<std::pair<uint64_t, Type *>, 16> TypesVector;
+ append_range(TypesVector, Types);
+ sort(TypesVector,
+ [](const auto &A, const auto &B) { return A.first < B.first; });
- Constant *In = Init->getAggregateElement(ElementIdx);
- assert(In && "Couldn't get element of initializer?");
+ // Check that the types are non-overlapping.
+ uint64_t Offset = 0;
+ for (const auto &Pair : TypesVector) {
+ // Overlaps with previous type.
+ if (Pair.first < Offset)
+ return nullptr;
- GlobalVariable *NGV = new GlobalVariable(
- ElTy, false, GlobalVariable::InternalLinkage, In,
- GV->getName() + "." + Twine(ElementIdx), GV->getThreadLocalMode(),
- GV->getType()->getAddressSpace());
- NGV->setExternallyInitialized(GV->isExternallyInitialized());
- NGV->copyAttributesFrom(GV);
- NewGlobals.insert(std::make_pair(ElementIdx, NGV));
-
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout &Layout = *DL.getStructLayout(STy);
-
- // Calculate the known alignment of the field. If the original aggregate
- // had 256 byte alignment for example, something might depend on that:
- // propagate info to each field.
- uint64_t FieldOffset = Layout.getElementOffset(ElementIdx);
- Align NewAlign = commonAlignment(StartAlignment, FieldOffset);
- if (NewAlign > DL.getABITypeAlign(STy->getElementType(ElementIdx)))
- NGV->setAlignment(NewAlign);
-
- // Copy over the debug info for the variable.
- uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType());
- uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx);
- transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, VarSize);
- } else {
- uint64_t EltSize = DL.getTypeAllocSize(ElTy);
- Align EltAlign = DL.getABITypeAlign(ElTy);
- uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy);
-
- // Calculate the known alignment of the field. If the original aggregate
- // had 256 byte alignment for example, something might depend on that:
- // propagate info to each field.
- Align NewAlign = commonAlignment(StartAlignment, EltSize * ElementIdx);
- if (NewAlign > EltAlign)
- NGV->setAlignment(NewAlign);
- transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx,
- FragmentSizeInBits, VarSize);
- }
+ Offset = Pair.first + DL.getTypeAllocSize(Pair.second);
}
- if (NewGlobals.empty())
+ // Some accesses go beyond the end of the global; don't bother.
+ if (Offset > DL.getTypeAllocSize(GV->getValueType()))
return nullptr;
- Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
- for (auto NewGlobalVar : NewGlobals)
- Globals.push_back(NewGlobalVar.second);
+ // Collect initializers for new globals.
+ Constant *OrigInit = GV->getInitializer();
+ DenseMap<uint64_t, Constant *> Initializers;
+ for (const auto &Pair : Types) {
+ Constant *NewInit = ConstantFoldLoadFromConst(OrigInit, Pair.second,
+ APInt(64, Pair.first), DL);
+ if (!NewInit) {
+ LLVM_DEBUG(dbgs() << "Global SRA: Failed to evaluate initializer of "
+ << *GV << " with type " << *Pair.second << " at offset "
+ << Pair.first << "\n");
+ return nullptr;
+ }
+ Initializers.insert({Pair.first, NewInit});
+ }
LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");
- Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
+ // Get the alignment of the global, either explicit or target-specific.
+ Align StartAlignment =
+ DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
+ uint64_t VarSize = DL.getTypeSizeInBits(GV->getValueType());
+
+ // Create replacement globals.
+ DenseMap<uint64_t, GlobalVariable *> NewGlobals;
+ unsigned NameSuffix = 0;
+ for (auto &Pair : TypesVector) {
+ uint64_t Offset = Pair.first;
+ Type *Ty = Pair.second;
+ GlobalVariable *NGV = new GlobalVariable(
+ *GV->getParent(), Ty, false, GlobalVariable::InternalLinkage,
+ Initializers[Offset], GV->getName() + "." + Twine(NameSuffix++), GV,
+ GV->getThreadLocalMode(), GV->getAddressSpace());
+ NGV->copyAttributesFrom(GV);
+ NewGlobals.insert({Offset, NGV});
+
+ // Calculate the known alignment of the field. If the original aggregate
+ // had 256 byte alignment for example, something might depend on that:
+ // propagate info to each field.
+ Align NewAlign = commonAlignment(StartAlignment, Offset);
+ if (NewAlign > DL.getABITypeAlign(Ty))
+ NGV->setAlignment(NewAlign);
+
+ // Copy over the debug info for the variable.
+ transferSRADebugInfo(GV, NGV, Offset * 8, DL.getTypeAllocSizeInBits(Ty),
+ VarSize);
+ }
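Note (not part of the patch): the alignment propagation above follows from the fact that the known alignment at byte Offset inside an object aligned to StartAlignment is the largest power of two dividing both. A hedged standalone approximation of that calculation, illustrative only and not a substitute for llvm::commonAlignment:

#include <algorithm>
#include <cstdint>

// Known alignment of (Base + Offset) when Base is aligned to BaseAlign.
// BaseAlign is assumed to be a power of two; Offset == 0 keeps the full
// alignment of the base object.
static uint64_t knownAlignmentAtOffset(uint64_t BaseAlign, uint64_t Offset) {
  if (Offset == 0)
    return BaseAlign;
  uint64_t OffsetAlign = Offset & (~Offset + 1); // lowest set bit of Offset
  return std::min(BaseAlign, OffsetAlign);
}

For example, a field at offset 8 of a 256-byte-aligned global is known to be 8-byte aligned, which is why the patch only raises the alignment when this value exceeds the ABI alignment of the field type.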
+
+  // Replace uses of the original global with uses of the new globals.
+ SmallVector<Value *, 16> Worklist;
+ SmallPtrSet<Value *, 16> Visited;
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ auto AppendUsers = [&](Value *V) {
+ for (User *U : V->users())
+ if (Visited.insert(U).second)
+ Worklist.push_back(U);
+ };
+ AppendUsers(GV);
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+ if (isa<BitCastOperator>(V) || isa<AddrSpaceCastOperator>(V) ||
+ isa<GEPOperator>(V)) {
+ AppendUsers(V);
+ if (isa<Instruction>(V))
+ DeadInsts.push_back(V);
+ continue;
+ }
- // Loop over all of the uses of the global, replacing the constantexpr geps,
- // with smaller constantexpr geps or direct references.
- while (!GV->use_empty()) {
- User *GEP = GV->user_back();
- assert(((isa<ConstantExpr>(GEP) &&
- cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)||
- isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");
-
- // Ignore the 1th operand, which has to be zero or else the program is quite
- // broken (undefined). Get the 2nd operand, which is the structure or array
- // index.
- unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
- assert(NewGlobals.count(ElementIdx) == 1);
-
- Value *NewPtr = NewGlobals[ElementIdx];
- Type *NewTy = NewGlobals[ElementIdx]->getValueType();
-
- // Form a shorter GEP if needed.
- if (GEP->getNumOperands() > 3) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) {
- SmallVector<Constant*, 8> Idxs;
- Idxs.push_back(NullInt);
- for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
- Idxs.push_back(CE->getOperand(i));
- NewPtr =
- ConstantExpr::getGetElementPtr(NewTy, cast<Constant>(NewPtr), Idxs);
+ if (Value *Ptr = getLoadStorePointerOperand(V)) {
+ APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+ Ptr = Ptr->stripAndAccumulateConstantOffsets(DL, Offset,
+ /* AllowNonInbounds */ true);
+ assert(Ptr == GV && "Load/store must be from/to global");
+ GlobalVariable *NGV = NewGlobals[Offset.getZExtValue()];
+ assert(NGV && "Must have replacement global for this offset");
+
+ // Update the pointer operand and recalculate alignment.
+ Align PrefAlign = DL.getPrefTypeAlign(getLoadStoreType(V));
+ Align NewAlign =
+ getOrEnforceKnownAlignment(NGV, PrefAlign, DL, cast<Instruction>(V));
+
+ if (auto *LI = dyn_cast<LoadInst>(V)) {
+ LI->setOperand(0, NGV);
+ LI->setAlignment(NewAlign);
} else {
- GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
- SmallVector<Value*, 8> Idxs;
- Idxs.push_back(NullInt);
- for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
- Idxs.push_back(GEPI->getOperand(i));
- NewPtr = GetElementPtrInst::Create(
- NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(ElementIdx),
- GEPI);
- }
- }
- GEP->replaceAllUsesWith(NewPtr);
-
- // We changed the pointer of any memory access user. Recalculate alignments.
- for (User *U : NewPtr->users()) {
- if (auto *Load = dyn_cast<LoadInst>(U)) {
- Align PrefAlign = DL.getPrefTypeAlign(Load->getType());
- Align NewAlign = getOrEnforceKnownAlignment(Load->getPointerOperand(),
- PrefAlign, DL, Load);
- Load->setAlignment(NewAlign);
- }
- if (auto *Store = dyn_cast<StoreInst>(U)) {
- Align PrefAlign =
- DL.getPrefTypeAlign(Store->getValueOperand()->getType());
- Align NewAlign = getOrEnforceKnownAlignment(Store->getPointerOperand(),
- PrefAlign, DL, Store);
- Store->setAlignment(NewAlign);
+ auto *SI = cast<StoreInst>(V);
+ SI->setOperand(1, NGV);
+ SI->setAlignment(NewAlign);
}
+ continue;
}
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP))
- GEPI->eraseFromParent();
- else
- cast<ConstantExpr>(GEP)->destroyConstant();
+ assert(isa<Constant>(V) && isSafeToDestroyConstant(cast<Constant>(V)) &&
+ "Other users can only be dead constants");
}
- // Delete the old global, now that it is dead.
- Globals.erase(GV);
+ // Delete old instructions and global.
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
+ GV->removeDeadConstantUsers();
+ GV->eraseFromParent();
++NumSRA;
assert(NewGlobals.size() > 0);
@@ -677,7 +617,7 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V,
"Should be GlobalVariable");
// This and only this kind of non-signed ICmpInst is to be replaced with
// the comparing of the value of the created global init bool later in
- // optimizeGlobalAddressOfMalloc for the global variable.
+ // optimizeGlobalAddressOfAllocation for the global variable.
} else {
//cerr << "NONTRAPPING USE: " << *U;
return false;
@@ -895,29 +835,36 @@ static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
/// to actually DO the malloc. Instead, turn the malloc into a global, and any
/// loads of GV as uses of the new global.
static GlobalVariable *
-OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
- ConstantInt *NElements, const DataLayout &DL,
- TargetLibraryInfo *TLI) {
+OptimizeGlobalAddressOfAllocation(GlobalVariable *GV, CallInst *CI,
+ uint64_t AllocSize, Constant *InitVal,
+ const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
LLVM_DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI
<< '\n');
- Type *GlobalType;
- if (NElements->getZExtValue() == 1)
- GlobalType = AllocTy;
- else
- // If we have an array allocation, the global variable is of an array.
- GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue());
+ // Create global of type [AllocSize x i8].
+ Type *GlobalType = ArrayType::get(Type::getInt8Ty(GV->getContext()),
+ AllocSize);
- // Create the new global variable. The contents of the malloc'd memory is
- // undefined, so initialize with an undef value.
+ // Create the new global variable. The contents of the allocated memory are
+ // undefined initially, so initialize with an undef value.
GlobalVariable *NewGV = new GlobalVariable(
*GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage,
UndefValue::get(GlobalType), GV->getName() + ".body", nullptr,
GV->getThreadLocalMode());
- // If there are bitcast users of the malloc (which is typical, usually we have
- // a malloc + bitcast) then replace them with uses of the new global. Update
- // other users to use the global as well.
+ // Initialize the global at the point of the original call. Note that this
+ // is a different point from the initialization referred to below for the
+ // nullability handling. Subtlety: We have not proven the original global was
+ // only initialized once. As such, we cannot fold this into the initializer
+ // of the new global, as we may need to re-init the storage multiple times.
+ if (!isa<UndefValue>(InitVal)) {
+ IRBuilder<> Builder(CI->getNextNode());
+ // TODO: Use alignment above if align!=1
+ Builder.CreateMemSet(NewGV, InitVal, AllocSize, None);
+ }
+
+ // Update users of the allocation to use the new global instead.
BitCastInst *TheBC = nullptr;
while (!CI->use_empty()) {
Instruction *User = cast<Instruction>(CI->user_back());
@@ -1009,7 +956,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
} else
GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool);
- // Now the GV is dead, nuke it and the malloc..
+ // Now the GV is dead, nuke it and the allocation.
GV->eraseFromParent();
CI->eraseFromParent();
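Note (not part of the patch): in source-level terms the effect of this hunk is, roughly, that a pointer global stored exactly once with a small fixed-size allocation becomes internal global storage, and the allocation's known initial value is re-emitted as a memset at the former call site, since the store has not been proven to happen only once. A purely illustrative C++ analogy, with hypothetical names:

#include <cstdint>
#include <cstring>

// Before (conceptually): G = (uint8_t *)calloc(1, 32);
// After: the heap allocation becomes module-local storage and calloc's
// zero-fill is redone with a memset where the call used to be.
static uint8_t GBody[32];    // plays the role of @g.body, [AllocSize x i8]
static uint8_t *G = nullptr; // the original pointer global

void formerAllocationSite() {
  std::memset(GBody, 0, sizeof(GBody)); // InitVal is 0 for calloc-like allocations
  G = GBody;                            // the single store that received the allocation
}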
@@ -1066,15 +1013,33 @@ valueIsOnlyUsedLocallyOrStoredToOneGlobal(const CallInst *CI,
return true;
}
-/// This function is called when we see a pointer global variable with a single
-/// value stored it that is a malloc or cast of malloc.
-static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
- Type *AllocTy,
- AtomicOrdering Ordering,
- const DataLayout &DL,
- TargetLibraryInfo *TLI) {
- // If this is a malloc of an abstract type, don't touch it.
- if (!AllocTy->isSized())
+/// If we have a global that is only initialized with a fixed size allocation,
+/// try to transform the program to use global memory instead of heap
+/// allocated memory. This eliminates dynamic allocation, avoids an indirection
+/// accessing the data, and exposes the resultant global to further GlobalOpt.
+static bool tryToOptimizeStoreOfAllocationToGlobal(GlobalVariable *GV,
+ CallInst *CI,
+ AtomicOrdering Ordering,
+ const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ if (!isAllocRemovable(CI, TLI))
+    // Must be able to remove the call when we are done.
+ return false;
+
+ Type *Int8Ty = Type::getInt8Ty(CI->getFunction()->getContext());
+ Constant *InitVal = getInitialValueOfAllocation(CI, TLI, Int8Ty);
+ if (!InitVal)
+ // Must be able to emit a memset for initialization
+ return false;
+
+ uint64_t AllocSize;
+ if (!getObjectSize(CI, AllocSize, DL, TLI, ObjectSizeOpts()))
+ return false;
+
+ // Restrict this transformation to only working on small allocations
+ // (2048 bytes currently), as we don't want to introduce a 16M global or
+ // something.
+ if (AllocSize >= 2048)
return false;
// We can't optimize this global unless all uses of it are *known* to be
@@ -1093,25 +1058,8 @@ static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
if (!valueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV))
return false;
- // If we have a global that is only initialized with a fixed size malloc,
- // transform the program to use global memory instead of malloc'd memory.
- // This eliminates dynamic allocation, avoids an indirection accessing the
- // data, and exposes the resultant global to further GlobalOpt.
- // We cannot optimize the malloc if we cannot determine malloc array size.
- Value *NElems = getMallocArraySize(CI, DL, TLI, true);
- if (!NElems)
- return false;
-
- if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
- // Restrict this transformation to only working on small allocations
- // (2048 bytes currently), as we don't want to introduce a 16M global or
- // something.
- if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) {
- OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI);
- return true;
- }
-
- return false;
+ OptimizeGlobalAddressOfAllocation(GV, CI, AllocSize, InitVal, DL, TLI);
+ return true;
}
// Try to optimize globals based on the knowledge that only one value (besides
@@ -1140,12 +1088,12 @@ optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
// Optimize away any trapping uses of the loaded value.
if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, GetTLI))
return true;
- } else if (CallInst *CI = extractMallocCall(StoredOnceVal, GetTLI)) {
- auto *TLI = &GetTLI(*CI->getFunction());
- Type *MallocType = getMallocAllocatedType(CI, TLI);
- if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
- Ordering, DL, TLI))
- return true;
+ } else if (isAllocationFn(StoredOnceVal, GetTLI)) {
+ if (auto *CI = dyn_cast<CallInst>(StoredOnceVal)) {
+ auto *TLI = &GetTLI(*CI->getFunction());
+ if (tryToOptimizeStoreOfAllocationToGlobal(GV, CI, Ordering, DL, TLI))
+ return true;
+ }
}
}
@@ -1171,9 +1119,12 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
// Walk the use list of the global seeing if all the uses are load or store.
// If there is anything else, bail out.
- for (User *U : GV->users())
+ for (User *U : GV->users()) {
if (!isa<LoadInst>(U) && !isa<StoreInst>(U))
return false;
+ if (getLoadStoreType(U) != GVElType)
+ return false;
+ }
LLVM_DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n");
@@ -1590,11 +1541,25 @@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
// This is restricted to address spaces that allow globals to have
// initializers. NVPTX, for example, does not support initializers for
// shared memory (AS 3).
- if (SOVConstant && SOVConstant->getType() == GV->getValueType() &&
- isa<UndefValue>(GV->getInitializer()) &&
+ if (SOVConstant && isa<UndefValue>(GV->getInitializer()) &&
+ DL.getTypeAllocSize(SOVConstant->getType()) ==
+ DL.getTypeAllocSize(GV->getValueType()) &&
CanHaveNonUndefGlobalInitializer) {
- // Change the initial value here.
- GV->setInitializer(SOVConstant);
+ if (SOVConstant->getType() == GV->getValueType()) {
+ // Change the initializer in place.
+ GV->setInitializer(SOVConstant);
+ } else {
+ // Create a new global with adjusted type.
+ auto *NGV = new GlobalVariable(
+ *GV->getParent(), SOVConstant->getType(), GV->isConstant(),
+ GV->getLinkage(), SOVConstant, "", GV, GV->getThreadLocalMode(),
+ GV->getAddressSpace());
+ NGV->takeName(GV);
+ NGV->copyAttributesFrom(GV);
+ GV->replaceAllUsesWith(ConstantExpr::getBitCast(NGV, GV->getType()));
+ GV->eraseFromParent();
+ GV = NGV;
+ }
// Clean up any obviously simplifiable users now.
CleanupConstantGlobalUsers(GV, DL);
@@ -2066,194 +2031,6 @@ OptimizeGlobalVars(Module &M,
return Changed;
}
-/// Evaluate a piece of a constantexpr store into a global initializer. This
-/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the
-/// GEP operands of Addr [0, OpNo) have been stepped into.
-static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
- ConstantExpr *Addr, unsigned OpNo) {
- // Base case of the recursion.
- if (OpNo == Addr->getNumOperands()) {
- assert(Val->getType() == Init->getType() && "Type mismatch!");
- return Val;
- }
-
- SmallVector<Constant*, 32> Elts;
- if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
- // Break up the constant into its elements.
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- Elts.push_back(Init->getAggregateElement(i));
-
- // Replace the element that we are supposed to.
- ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
- unsigned Idx = CU->getZExtValue();
- assert(Idx < STy->getNumElements() && "Struct index out of range!");
- Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1);
-
- // Return the modified struct.
- return ConstantStruct::get(STy, Elts);
- }
-
- ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
- uint64_t NumElts;
- if (ArrayType *ATy = dyn_cast<ArrayType>(Init->getType()))
- NumElts = ATy->getNumElements();
- else
- NumElts = cast<FixedVectorType>(Init->getType())->getNumElements();
-
- // Break up the array into elements.
- for (uint64_t i = 0, e = NumElts; i != e; ++i)
- Elts.push_back(Init->getAggregateElement(i));
-
- assert(CI->getZExtValue() < NumElts);
- Elts[CI->getZExtValue()] =
- EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
-
- if (Init->getType()->isArrayTy())
- return ConstantArray::get(cast<ArrayType>(Init->getType()), Elts);
- return ConstantVector::get(Elts);
-}
-
-/// We have decided that Addr (which satisfies the predicate
-/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen.
-static void CommitValueTo(Constant *Val, Constant *Addr) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
- assert(GV->hasInitializer());
- GV->setInitializer(Val);
- return;
- }
-
- ConstantExpr *CE = cast<ConstantExpr>(Addr);
- GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
- GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
-}
-
-/// Given a map of address -> value, where addresses are expected to be some form
-/// of either a global or a constant GEP, set the initializer for the address to
-/// be the value. This performs mostly the same function as CommitValueTo()
-/// and EvaluateStoreInto() but is optimized to be more efficient for the common
-/// case where the set of addresses are GEPs sharing the same underlying global,
-/// processing the GEPs in batches rather than individually.
-///
-/// To give an example, consider the following C++ code adapted from the clang
-/// regression tests:
-/// struct S {
-/// int n = 10;
-/// int m = 2 * n;
-/// S(int a) : n(a) {}
-/// };
-///
-/// template<typename T>
-/// struct U {
-/// T *r = &q;
-/// T q = 42;
-/// U *p = this;
-/// };
-///
-/// U<S> e;
-///
-/// The global static constructor for 'e' will need to initialize 'r' and 'p' of
-/// the outer struct, while also initializing the inner 'q' structs 'n' and 'm'
-/// members. This batch algorithm will simply use general CommitValueTo() method
-/// to handle the complex nested S struct initialization of 'q', before
-/// processing the outermost members in a single batch. Using CommitValueTo() to
-/// handle member in the outer struct is inefficient when the struct/array is
-/// very large as we end up creating and destroy constant arrays for each
-/// initialization.
-/// For the above case, we expect the following IR to be generated:
-///
-/// %struct.U = type { %struct.S*, %struct.S, %struct.U* }
-/// %struct.S = type { i32, i32 }
-/// @e = global %struct.U { %struct.S* gep inbounds (%struct.U, %struct.U* @e,
-/// i64 0, i32 1),
-/// %struct.S { i32 42, i32 84 }, %struct.U* @e }
-/// The %struct.S { i32 42, i32 84 } inner initializer is treated as a complex
-/// constant expression, while the other two elements of @e are "simple".
-static void BatchCommitValueTo(const DenseMap<Constant*, Constant*> &Mem) {
- SmallVector<std::pair<GlobalVariable*, Constant*>, 32> GVs;
- SmallVector<std::pair<ConstantExpr*, Constant*>, 32> ComplexCEs;
- SmallVector<std::pair<ConstantExpr*, Constant*>, 32> SimpleCEs;
- SimpleCEs.reserve(Mem.size());
-
- for (const auto &I : Mem) {
- if (auto *GV = dyn_cast<GlobalVariable>(I.first)) {
- GVs.push_back(std::make_pair(GV, I.second));
- } else {
- ConstantExpr *GEP = cast<ConstantExpr>(I.first);
- // We don't handle the deeply recursive case using the batch method.
- if (GEP->getNumOperands() > 3)
- ComplexCEs.push_back(std::make_pair(GEP, I.second));
- else
- SimpleCEs.push_back(std::make_pair(GEP, I.second));
- }
- }
-
- // The algorithm below doesn't handle cases like nested structs, so use the
- // slower fully general method if we have to.
- for (auto ComplexCE : ComplexCEs)
- CommitValueTo(ComplexCE.second, ComplexCE.first);
-
- for (auto GVPair : GVs) {
- assert(GVPair.first->hasInitializer());
- GVPair.first->setInitializer(GVPair.second);
- }
-
- if (SimpleCEs.empty())
- return;
-
- // We cache a single global's initializer elements in the case where the
- // subsequent address/val pair uses the same one. This avoids throwing away and
- // rebuilding the constant struct/vector/array just because one element is
- // modified at a time.
- SmallVector<Constant *, 32> Elts;
- Elts.reserve(SimpleCEs.size());
- GlobalVariable *CurrentGV = nullptr;
-
- auto commitAndSetupCache = [&](GlobalVariable *GV, bool Update) {
- Constant *Init = GV->getInitializer();
- Type *Ty = Init->getType();
- if (Update) {
- if (CurrentGV) {
- assert(CurrentGV && "Expected a GV to commit to!");
- Type *CurrentInitTy = CurrentGV->getInitializer()->getType();
- // We have a valid cache that needs to be committed.
- if (StructType *STy = dyn_cast<StructType>(CurrentInitTy))
- CurrentGV->setInitializer(ConstantStruct::get(STy, Elts));
- else if (ArrayType *ArrTy = dyn_cast<ArrayType>(CurrentInitTy))
- CurrentGV->setInitializer(ConstantArray::get(ArrTy, Elts));
- else
- CurrentGV->setInitializer(ConstantVector::get(Elts));
- }
- if (CurrentGV == GV)
- return;
- // Need to clear and set up cache for new initializer.
- CurrentGV = GV;
- Elts.clear();
- unsigned NumElts;
- if (auto *STy = dyn_cast<StructType>(Ty))
- NumElts = STy->getNumElements();
- else if (auto *ATy = dyn_cast<ArrayType>(Ty))
- NumElts = ATy->getNumElements();
- else
- NumElts = cast<FixedVectorType>(Ty)->getNumElements();
- for (unsigned i = 0, e = NumElts; i != e; ++i)
- Elts.push_back(Init->getAggregateElement(i));
- }
- };
-
- for (auto CEPair : SimpleCEs) {
- ConstantExpr *GEP = CEPair.first;
- Constant *Val = CEPair.second;
-
- GlobalVariable *GV = cast<GlobalVariable>(GEP->getOperand(0));
- commitAndSetupCache(GV, GV != CurrentGV);
- ConstantInt *CI = cast<ConstantInt>(GEP->getOperand(2));
- Elts[CI->getZExtValue()] = Val;
- }
- // The last initializer in the list needs to be committed, others
- // will be committed on a new initializer being processed.
- commitAndSetupCache(CurrentGV, true);
-}
-
/// Evaluate static constructors in the function, if we can. Return true if we
/// can, false otherwise.
static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
@@ -2268,10 +2045,12 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
++NumCtorsEvaluated;
// We succeeded at evaluation: commit the result.
+ auto NewInitializers = Eval.getMutatedInitializers();
LLVM_DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
- << F->getName() << "' to "
- << Eval.getMutatedMemory().size() << " stores.\n");
- BatchCommitValueTo(Eval.getMutatedMemory());
+ << F->getName() << "' to " << NewInitializers.size()
+ << " stores.\n");
+ for (const auto &Pair : NewInitializers)
+ Pair.first->setInitializer(Pair.second);
for (GlobalVariable *GV : Eval.getInvariants())
GV->setConstant(true);
}
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index b8a314c54f18..e064fbbef595 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -36,8 +36,14 @@ using namespace IRSimilarity;
// A command flag to be used for debugging to exclude branches from similarity
// matching and outlining.
+namespace llvm {
extern cl::opt<bool> DisableBranches;
+// A command flag to be used for debugging to exclude indirect calls from
+// similarity matching and outlining.
+extern cl::opt<bool> DisableIndirectCalls;
+} // namespace llvm
+
// Set to true if the user wants the ir outliner to run on linkonceodr linkage
// functions. This is false by default because the linker can dedupe linkonceodr
// functions. Since the outliner is confined to a single module (modulo LTO),
@@ -104,6 +110,16 @@ struct OutlinableGroup {
/// of the region.
unsigned BranchesToOutside = 0;
+ /// Tracker counting backwards from the highest unsigned value possible to
+ /// avoid conflicting with the GVNs of assigned values. We start at -3 since
+ /// -2 and -1 are assigned by the DenseMap.
+ unsigned PHINodeGVNTracker = -3;
+
+ DenseMap<unsigned,
+ std::pair<std::pair<unsigned, unsigned>, SmallVector<unsigned, 2>>>
+ PHINodeGVNToGVNs;
+ DenseMap<hash_code, unsigned> GVNsToPHINodeGVN;
+
/// The number of instructions that will be outlined by extracting \ref
/// Regions.
InstructionCost Benefit = 0;
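Note (not part of the patch): per the comment on PHINodeGVNTracker above, DenseMap with unsigned keys reserves the top two values of the range (-1 and -2, written as unsigned wrap-around) as its empty and tombstone sentinels, so counting down from -3 hands out synthetic PHINode numbers that cannot collide with those sentinels or with the small, increasing GVNs of real values. A standalone sketch of the numbering scheme using standard containers, illustrative only:

#include <unordered_map>
#include <vector>

// Synthetic numbers for PHI-based outputs grow downwards from the top of the
// unsigned range; ordinary GVNs grow upwards from zero, so the two ranges do
// not collide in practice.
struct PHINumbering {
  unsigned NextPHINumber = static_cast<unsigned>(-3); // skip the -1/-2 sentinels
  std::unordered_map<unsigned, std::vector<unsigned>> PHINumberToIncomingGVNs;

  unsigned assign(std::vector<unsigned> IncomingGVNs) {
    unsigned N = NextPHINumber--;
    PHINumberToIncomingGVNs.emplace(N, std::move(IncomingGVNs));
    return N;
  }
};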
@@ -169,6 +185,44 @@ Value *OutlinableRegion::findCorrespondingValueIn(const OutlinableRegion &Other,
return FoundValueOpt.getValueOr(nullptr);
}
+/// Rewrite the BranchInsts in the incoming blocks to \p PHIBlock that are found
+/// in \p Included to branch to BasicBlock \p Replace if they currently branch
+/// to the BasicBlock \p Find. This is used to fix up the incoming basic blocks
+/// when PHINodes are included in outlined regions.
+///
+/// \param PHIBlock - The BasicBlock containing the PHINodes that need to be
+/// checked.
+/// \param Find - The successor block to be replaced.
+/// \param Replace - The new successor block to branch to.
+/// \param Included - The set of blocks about to be outlined.
+static void replaceTargetsFromPHINode(BasicBlock *PHIBlock, BasicBlock *Find,
+ BasicBlock *Replace,
+ DenseSet<BasicBlock *> &Included) {
+ for (PHINode &PN : PHIBlock->phis()) {
+ for (unsigned Idx = 0, PNEnd = PN.getNumIncomingValues(); Idx != PNEnd;
+ ++Idx) {
+ // Check if the incoming block is included in the set of blocks being
+ // outlined.
+ BasicBlock *Incoming = PN.getIncomingBlock(Idx);
+ if (!Included.contains(Incoming))
+ continue;
+
+ BranchInst *BI = dyn_cast<BranchInst>(Incoming->getTerminator());
+ assert(BI && "Not a branch instruction?");
+ // Look over the branching instructions into this block to see if we
+ // used to branch to Find in this outlined block.
+ for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ != End;
+ Succ++) {
+ // If we have found the block to replace, we do so here.
+ if (BI->getSuccessor(Succ) != Find)
+ continue;
+ BI->setSuccessor(Succ, Replace);
+ }
+ }
+ }
+}
+
+
void OutlinableRegion::splitCandidate() {
assert(!CandidateSplit && "Candidate already split!");
@@ -199,6 +253,39 @@ void OutlinableRegion::splitCandidate() {
StartBB = StartInst->getParent();
PrevBB = StartBB;
+ DenseSet<BasicBlock *> BBSet;
+ Candidate->getBasicBlocks(BBSet);
+
+  // We iterate over the instructions in the region. If we find a PHINode, we
+  // check whether it has more than one predecessor outside of the region; if
+  // it does, we ignore this region since we are unable to handle the severing
+  // of the phi node right now.
+ BasicBlock::iterator It = StartInst->getIterator();
+ while (PHINode *PN = dyn_cast<PHINode>(&*It)) {
+ unsigned NumPredsOutsideRegion = 0;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (!BBSet.contains(PN->getIncomingBlock(i)))
+ ++NumPredsOutsideRegion;
+
+ if (NumPredsOutsideRegion > 1)
+ return;
+
+ It++;
+ }
+
+  // If the region starts with a PHINode, but it is not the initial instruction
+  // of the BasicBlock, we ignore this region for now.
+ if (isa<PHINode>(StartInst) && StartInst != &*StartBB->begin())
+ return;
+
+  // If the region ends with a PHINode, but does not contain all of the phi
+  // node instructions of its basic block, we ignore it for now.
+ if (isa<PHINode>(BackInst)) {
+ EndBB = BackInst->getParent();
+ if (BackInst != &*std::prev(EndBB->getFirstInsertionPt()))
+ return;
+ }
+
// The basic block gets split like so:
// block: block:
// inst1 inst1
@@ -225,12 +312,20 @@ void OutlinableRegion::splitCandidate() {
FollowBB = EndBB->splitBasicBlock(EndInst, OriginalName + "_after_outline");
EndBB->replaceSuccessorsPhiUsesWith(EndBB, FollowBB);
FollowBB->replaceSuccessorsPhiUsesWith(PrevBB, FollowBB);
- return;
+ } else {
+ EndBB = BackInst->getParent();
+ EndsInBranch = true;
+ FollowBB = nullptr;
}
- EndBB = BackInst->getParent();
- EndsInBranch = true;
- FollowBB = nullptr;
+  // Recompute the basic block set.
+ BBSet.clear();
+ Candidate->getBasicBlocks(BBSet);
+  // For the phi nodes in the new starting basic block of the region, we
+  // reassign the targets of the incoming blocks' branch instructions.
+ replaceTargetsFromPHINode(StartBB, PrevBB, StartBB, BBSet);
+ if (FollowBB)
+ replaceTargetsFromPHINode(FollowBB, EndBB, FollowBB, BBSet);
}
void OutlinableRegion::reattachCandidate() {
@@ -252,15 +347,21 @@ void OutlinableRegion::reattachCandidate() {
// inst4
assert(StartBB != nullptr && "StartBB for Candidate is not defined!");
- // StartBB should only have one predecessor since we put an unconditional
- // branch at the end of PrevBB when we split the BasicBlock.
- PrevBB = StartBB->getSinglePredecessor();
- assert(PrevBB != nullptr &&
- "No Predecessor for the region start basic block!");
-
assert(PrevBB->getTerminator() && "Terminator removed from PrevBB!");
PrevBB->getTerminator()->eraseFromParent();
+  // If we are reattaching after outlining, we iterate over the phi nodes in
+  // the initial block, and reassign the branch instructions of the incoming
+  // blocks to the block we are remerging into.
+ if (!ExtractedFunction) {
+ DenseSet<BasicBlock *> BBSet;
+ Candidate->getBasicBlocks(BBSet);
+
+ replaceTargetsFromPHINode(StartBB, StartBB, PrevBB, BBSet);
+ if (!EndsInBranch)
+ replaceTargetsFromPHINode(FollowBB, FollowBB, EndBB, BBSet);
+ }
+
moveBBContents(*StartBB, *PrevBB);
BasicBlock *PlacementBB = PrevBB;
@@ -354,6 +455,24 @@ InstructionCost OutlinableRegion::getBenefit(TargetTransformInfo &TTI) {
return Benefit;
}
+/// Check the \p OutputMappings structure for value \p Input. If it exists, it
+/// has been used as an output for outlining and has been renamed, so we return
+/// the new value; otherwise, we return the same value.
+///
+/// \param OutputMappings [in] - The mapping of values to their renamed value
+/// after being used as an output for an outlined region.
+/// \param Input [in] - The value to find the remapped value of, if it exists.
+/// \return The remapped value if it has been renamed, and the same value if it
+/// has not.
+static Value *findOutputMapping(const DenseMap<Value *, Value *> OutputMappings,
+ Value *Input) {
+ DenseMap<Value *, Value *>::const_iterator OutputMapping =
+ OutputMappings.find(Input);
+ if (OutputMapping != OutputMappings.end())
+ return OutputMapping->second;
+ return Input;
+}
+
/// Find whether \p Region matches the global value numbering to Constant
/// mapping found so far.
///
@@ -830,6 +949,209 @@ findExtractedInputToOverallInputMapping(OutlinableRegion &Region,
Region.NumExtractedInputs = OriginalIndex;
}
+/// Check if \p V has any uses outside of the region other than \p PN.
+///
+/// \param V [in] - The value to check.
+/// \param PHILoc [in] - The location in the PHINode of \p V.
+/// \param PN [in] - The PHINode using \p V.
+/// \param Exits [in] - The potential blocks we exit to from the outlined
+/// region.
+/// \param BlocksInRegion [in] - The basic blocks contained in the region.
+/// \returns true if \p V has any uses outside its region other than \p PN.
+static bool outputHasNonPHI(Value *V, unsigned PHILoc, PHINode &PN,
+ SmallPtrSet<BasicBlock *, 1> &Exits,
+ DenseSet<BasicBlock *> &BlocksInRegion) {
+ // We check to see if the value is used by the PHINode from some other
+ // predecessor not included in the region. If it is, we make sure
+ // to keep it as an output.
+ SmallVector<unsigned, 2> IncomingNumbers(PN.getNumIncomingValues());
+ std::iota(IncomingNumbers.begin(), IncomingNumbers.end(), 0);
+ if (any_of(IncomingNumbers, [PHILoc, &PN, V, &BlocksInRegion](unsigned Idx) {
+ return (Idx != PHILoc && V == PN.getIncomingValue(Idx) &&
+ !BlocksInRegion.contains(PN.getIncomingBlock(Idx)));
+ }))
+ return true;
+
+ // Check if the value is used by any other instructions outside the region.
+ return any_of(V->users(), [&Exits, &BlocksInRegion](User *U) {
+ Instruction *I = dyn_cast<Instruction>(U);
+ if (!I)
+ return false;
+
+ // If the use of the item is inside the region, we skip it. Uses
+ // inside the region give us useful information about how the item could be
+ // used as an output.
+ BasicBlock *Parent = I->getParent();
+ if (BlocksInRegion.contains(Parent))
+ return false;
+
+ // If it's not a PHINode then we definitely know the use matters. This
+    // output value will not be completely combined with another item in a
+    // PHINode, as it is directly referenced by another non-phi instruction.
+ if (!isa<PHINode>(I))
+ return true;
+
+ // If we have a PHINode outside one of the exit locations, then it
+ // can be considered an outside use as well. If there is a PHINode
+ // contained in the Exit where this values use matters, it will be
+ // caught when we analyze that PHINode.
+ if (!Exits.contains(Parent))
+ return true;
+
+ return false;
+ });
+}
+
+/// Test whether \p CurrentExitFromRegion contains any PHINodes that should be
+/// considered outputs. A PHINode is an output when more than one incoming
+/// value has been marked by the CodeExtractor as an output.
+///
+/// \param CurrentExitFromRegion [in] - The block to analyze.
+/// \param PotentialExitsFromRegion [in] - The potential exit blocks from the
+/// region.
+/// \param RegionBlocks [in] - The basic blocks in the region.
+/// \param Outputs [in, out] - The existing outputs for the region, we may add
+/// PHINodes to this as we find that they replace output values.
+/// \param OutputsReplacedByPHINode [out] - A set containing outputs that are
+/// totally replaced by a PHINode.
+/// \param OutputsWithNonPhiUses [out] - A set containing outputs that are used
+/// in PHINodes, but have other uses, and should still be considered outputs.
+static void analyzeExitPHIsForOutputUses(
+ BasicBlock *CurrentExitFromRegion,
+ SmallPtrSet<BasicBlock *, 1> &PotentialExitsFromRegion,
+ DenseSet<BasicBlock *> &RegionBlocks, SetVector<Value *> &Outputs,
+ DenseSet<Value *> &OutputsReplacedByPHINode,
+ DenseSet<Value *> &OutputsWithNonPhiUses) {
+ for (PHINode &PN : CurrentExitFromRegion->phis()) {
+ // Find all incoming values from the outlining region.
+ SmallVector<unsigned, 2> IncomingVals;
+ for (unsigned I = 0, E = PN.getNumIncomingValues(); I < E; ++I)
+ if (RegionBlocks.contains(PN.getIncomingBlock(I)))
+ IncomingVals.push_back(I);
+
+ // Do not process PHI if there are no predecessors from region.
+ unsigned NumIncomingVals = IncomingVals.size();
+ if (NumIncomingVals == 0)
+ continue;
+
+ // If there is one predecessor, we mark it as a value that needs to be kept
+ // as an output.
+ if (NumIncomingVals == 1) {
+ Value *V = PN.getIncomingValue(*IncomingVals.begin());
+ OutputsWithNonPhiUses.insert(V);
+ OutputsReplacedByPHINode.erase(V);
+ continue;
+ }
+
+ // This PHINode will be used as an output value, so we add it to our list.
+ Outputs.insert(&PN);
+
+    // Not all of the incoming values should be ignored, as other inputs and
+    // outputs may have uses in the outlined region. If a value has other uses
+    // outside of this single PHINode, we should not skip over it.
+ for (unsigned Idx : IncomingVals) {
+ Value *V = PN.getIncomingValue(Idx);
+ if (outputHasNonPHI(V, Idx, PN, PotentialExitsFromRegion, RegionBlocks)) {
+ OutputsWithNonPhiUses.insert(V);
+ OutputsReplacedByPHINode.erase(V);
+ continue;
+ }
+ if (!OutputsWithNonPhiUses.contains(V))
+ OutputsReplacedByPHINode.insert(V);
+ }
+ }
+}
+
+// Represents the type for the unsigned number denoting the output number for
+// phi node, along with the canonical number for the exit block.
+using ArgLocWithBBCanon = std::pair<unsigned, unsigned>;
+// The list of canonical numbers for the incoming values to a PHINode.
+using CanonList = SmallVector<unsigned, 2>;
+// The pair type representing the set of canonical values being combined in the
+// PHINode, along with the location data for the PHINode.
+using PHINodeData = std::pair<ArgLocWithBBCanon, CanonList>;
+
+/// Encode \p PND as an integer for easy lookup based on the argument location,
+/// the parent BasicBlock canonical numbering, and the canonical numbering of
+/// the values stored in the PHINode.
+///
+/// \param PND - The data to hash.
+/// \returns The hash code of \p PND.
+static hash_code encodePHINodeData(PHINodeData &PND) {
+ return llvm::hash_combine(
+ llvm::hash_value(PND.first.first), llvm::hash_value(PND.first.second),
+ llvm::hash_combine_range(PND.second.begin(), PND.second.end()));
+}
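Note (not part of the patch): the encoding above folds three pieces of information into one hash code: the aggregate argument slot, the canonical number of the exit block, and the ordered canonical numbers of the PHINode's incoming values. A hedged sketch of the same idea with standard-library hashing, where the boost-style combine step merely stands in for llvm::hash_combine:

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

using ArgLocWithBlockCanon = std::pair<unsigned, unsigned>;
using PHIData = std::pair<ArgLocWithBlockCanon, std::vector<unsigned>>;

static void combineInto(std::size_t &Seed, unsigned V) {
  Seed ^= std::hash<unsigned>{}(V) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
}

// Hash the argument location, the exit-block canonical number, and every
// incoming-value canonical number, in that order.
static std::size_t encodePHIData(const PHIData &PND) {
  std::size_t Seed = 0;
  combineInto(Seed, PND.first.first);
  combineInto(Seed, PND.first.second);
  for (unsigned CanonNum : PND.second)
    combineInto(Seed, CanonNum);
  return Seed;
}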
+
+/// Create a special GVN for PHINodes that will be used outside of
+/// the region. We create a hash code based on the Canonical number of the
+/// parent BasicBlock, the canonical numbering of the values stored in the
+/// PHINode and the aggregate argument location. This is used to find whether
+/// this PHINode type has been given a canonical numbering already. If not, we
+/// assign it a value and store it for later use. The value is returned to
+/// identify different output schemes for the set of regions.
+///
+/// \param Region - The region that \p PN is an output for.
+/// \param PN - The PHINode we are analyzing.
+/// \param AggArgIdx - The argument \p PN will be stored into.
+/// \returns An optional holding the assigned canonical number, or None if
+/// there is some attribute of the PHINode blocking it from being used.
+static Optional<unsigned> getGVNForPHINode(OutlinableRegion &Region,
+ PHINode *PN, unsigned AggArgIdx) {
+ OutlinableGroup &Group = *Region.Parent;
+ IRSimilarityCandidate &Cand = *Region.Candidate;
+ BasicBlock *PHIBB = PN->getParent();
+ CanonList PHIGVNs;
+ for (Value *Incoming : PN->incoming_values()) {
+ // If we cannot find a GVN, this means that the input to the PHINode is
+    // not included in the region we are trying to analyze, meaning that if
+ // it was outlined, we would be adding an extra input. We ignore this
+ // case for now, and so ignore the region.
+ Optional<unsigned> OGVN = Cand.getGVN(Incoming);
+ if (!OGVN.hasValue()) {
+ Region.IgnoreRegion = true;
+ return None;
+ }
+
+ // Collect the canonical numbers of the values in the PHINode.
+ unsigned GVN = OGVN.getValue();
+ OGVN = Cand.getCanonicalNum(GVN);
+ assert(OGVN.hasValue() && "No GVN found for incoming value?");
+ PHIGVNs.push_back(*OGVN);
+ }
+
+ // Now that we have the GVNs for the incoming values, we are going to combine
+  // them with the GVN of the incoming block, and the output location of the
+ // PHINode to generate a hash value representing this instance of the PHINode.
+ DenseMap<hash_code, unsigned>::iterator GVNToPHIIt;
+ DenseMap<unsigned, PHINodeData>::iterator PHIToGVNIt;
+ Optional<unsigned> BBGVN = Cand.getGVN(PHIBB);
+ assert(BBGVN.hasValue() && "Could not find GVN for the incoming block!");
+
+ BBGVN = Cand.getCanonicalNum(BBGVN.getValue());
+ assert(BBGVN.hasValue() &&
+ "Could not find canonical number for the incoming block!");
+ // Create a pair of the exit block canonical value, and the aggregate
+ // argument location, connected to the canonical numbers stored in the
+ // PHINode.
+ PHINodeData TemporaryPair =
+ std::make_pair(std::make_pair(BBGVN.getValue(), AggArgIdx), PHIGVNs);
+ hash_code PHINodeDataHash = encodePHINodeData(TemporaryPair);
+
+ // Look for and create a new entry in our connection between canonical
+ // numbers for PHINodes, and the set of objects we just created.
+ GVNToPHIIt = Group.GVNsToPHINodeGVN.find(PHINodeDataHash);
+ if (GVNToPHIIt == Group.GVNsToPHINodeGVN.end()) {
+ bool Inserted = false;
+ std::tie(PHIToGVNIt, Inserted) = Group.PHINodeGVNToGVNs.insert(
+ std::make_pair(Group.PHINodeGVNTracker, TemporaryPair));
+ std::tie(GVNToPHIIt, Inserted) = Group.GVNsToPHINodeGVN.insert(
+ std::make_pair(PHINodeDataHash, Group.PHINodeGVNTracker--));
+ }
+
+ return GVNToPHIIt->second;
+}
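Note (not part of the patch): the tail of getGVNForPHINode is a lookup-or-assign step: if the hash has been seen before, the PHINode GVN handed out at that time is reused; otherwise both maps are updated and the tracker is decremented. A compressed sketch of that idiom with standard containers, names illustrative:

#include <cstddef>
#include <unordered_map>

struct PHIGVNTable {
  unsigned NextPHIGVN = static_cast<unsigned>(-3);
  std::unordered_map<std::size_t, unsigned> HashToPHIGVN;

  // Return the canonical PHI number for this hash, assigning a fresh one on
  // first sight. Equal hashes (same exit block, argument slot and incoming
  // canonical numbers) therefore map to the same number across regions.
  unsigned getOrAssign(std::size_t Hash) {
    auto [It, Inserted] = HashToPHIGVN.emplace(Hash, NextPHIGVN);
    if (Inserted)
      --NextPHIGVN;
    return It->second;
  }
};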
+
/// Create a mapping of the output arguments for the \p Region to the output
/// arguments of the overall outlined function.
///
@@ -842,35 +1164,25 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
IRSimilarityCandidate &C = *Region.Candidate;
SmallVector<BasicBlock *> BE;
- DenseSet<BasicBlock *> BBSet;
- C.getBasicBlocks(BBSet, BE);
+ DenseSet<BasicBlock *> BlocksInRegion;
+ C.getBasicBlocks(BlocksInRegion, BE);
// Find the exits to the region.
SmallPtrSet<BasicBlock *, 1> Exits;
for (BasicBlock *Block : BE)
for (BasicBlock *Succ : successors(Block))
- if (!BBSet.contains(Succ))
+ if (!BlocksInRegion.contains(Succ))
Exits.insert(Succ);
// After determining which blocks exit to PHINodes, we add these PHINodes to
// the set of outputs to be processed. We also check the incoming values of
// the PHINodes for whether they should no longer be considered outputs.
- for (BasicBlock *ExitBB : Exits) {
- for (PHINode &PN : ExitBB->phis()) {
- // Find all incoming values from the outlining region.
- SmallVector<unsigned, 2> IncomingVals;
- for (unsigned Idx = 0; Idx < PN.getNumIncomingValues(); ++Idx)
- if (BBSet.contains(PN.getIncomingBlock(Idx)))
- IncomingVals.push_back(Idx);
-
- // Do not process PHI if there is one (or fewer) predecessor from region.
- if (IncomingVals.size() <= 1)
- continue;
-
- Region.IgnoreRegion = true;
- return;
- }
- }
+ DenseSet<Value *> OutputsReplacedByPHINode;
+ DenseSet<Value *> OutputsWithNonPhiUses;
+ for (BasicBlock *ExitBB : Exits)
+ analyzeExitPHIsForOutputUses(ExitBB, Exits, BlocksInRegion, Outputs,
+ OutputsReplacedByPHINode,
+ OutputsWithNonPhiUses);
// This counts the argument number in the extracted function.
unsigned OriginalIndex = Region.NumExtractedInputs;
@@ -893,9 +1205,13 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
// do not have to be in same order, but are functionally the same, we will
// have to use a different scheme, as one-to-one correspondence is not
// guaranteed.
- unsigned GlobalValue = C.getGVN(Output).getValue();
unsigned ArgumentSize = Group.ArgumentTypes.size();
+ // If the output is combined in a PHINode, we make sure to skip over it.
+ if (OutputsReplacedByPHINode.contains(Output))
+ continue;
+
+ unsigned AggArgIdx = 0;
for (unsigned Jdx = TypeIndex; Jdx < ArgumentSize; Jdx++) {
if (Group.ArgumentTypes[Jdx] != PointerType::getUnqual(Output->getType()))
continue;
@@ -907,7 +1223,7 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
AggArgsUsed.insert(Jdx);
Region.ExtractedArgToAgg.insert(std::make_pair(OriginalIndex, Jdx));
Region.AggArgToExtracted.insert(std::make_pair(Jdx, OriginalIndex));
- Region.GVNStores.push_back(GlobalValue);
+ AggArgIdx = Jdx;
break;
}
@@ -916,18 +1232,54 @@ findExtractedOutputToOverallOutputMapping(OutlinableRegion &Region,
// function to handle this output and create a mapping to it.
if (!TypeFound) {
Group.ArgumentTypes.push_back(PointerType::getUnqual(Output->getType()));
- AggArgsUsed.insert(Group.ArgumentTypes.size() - 1);
+ // Mark the new pointer type as the last value in the aggregate argument
+ // list.
+ unsigned ArgTypeIdx = Group.ArgumentTypes.size() - 1;
+ AggArgsUsed.insert(ArgTypeIdx);
Region.ExtractedArgToAgg.insert(
- std::make_pair(OriginalIndex, Group.ArgumentTypes.size() - 1));
+ std::make_pair(OriginalIndex, ArgTypeIdx));
Region.AggArgToExtracted.insert(
- std::make_pair(Group.ArgumentTypes.size() - 1, OriginalIndex));
- Region.GVNStores.push_back(GlobalValue);
+ std::make_pair(ArgTypeIdx, OriginalIndex));
+ AggArgIdx = ArgTypeIdx;
+ }
+
+ // TODO: Adapt to the extra input from the PHINode.
+ PHINode *PN = dyn_cast<PHINode>(Output);
+
+ Optional<unsigned> GVN;
+ if (PN && !BlocksInRegion.contains(PN->getParent())) {
+ // Values outside the region can be combined into PHINode when we
+ // have multiple exits. We collect both of these into a list to identify
+ // which values are being used in the PHINode. Each list identifies a
+      // different PHINode, and a different output. We store the PHINode as its
+ // own canonical value. These canonical values are also dependent on the
+ // output argument it is saved to.
+
+ // If two PHINodes have the same canonical values, but different aggregate
+ // argument locations, then they will have distinct Canonical Values.
+ GVN = getGVNForPHINode(Region, PN, AggArgIdx);
+ if (!GVN.hasValue())
+ return;
+ } else {
+ // If we do not have a PHINode we use the global value numbering for the
+ // output value, to find the canonical number to add to the set of stored
+ // values.
+ GVN = C.getGVN(Output);
+ GVN = C.getCanonicalNum(*GVN);
}
- stable_sort(Region.GVNStores);
+ // Each region has a potentially unique set of outputs. We save which
+ // values are output in a list of canonical values so we can differentiate
+ // among the different store schemes.
+ Region.GVNStores.push_back(*GVN);
+
OriginalIndex++;
TypeIndex++;
}
+
+ // We sort the stored values to make sure that we are not affected by analysis
+ // order when determining what combination of items were stored.
+ stable_sort(Region.GVNStores);
}
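Note (not part of the patch): the stable_sort above makes the recorded output scheme independent of the order in which outputs were discovered, so two regions that store the same set of canonical values always compare equal. A trivial standalone illustration:

#include <algorithm>
#include <vector>

int main() {
  // Two regions storing the same canonical values, discovered in a different
  // order, describe the same output scheme once sorted.
  std::vector<unsigned> RegionA = {7, 3, 11};
  std::vector<unsigned> RegionB = {3, 11, 7};
  std::stable_sort(RegionA.begin(), RegionA.end());
  std::stable_sort(RegionB.begin(), RegionB.end());
  return RegionA == RegionB ? 0 : 1; // identical schemes compare equal
}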
void IROutliner::findAddInputsOutputs(Module &M, OutlinableRegion &Region,
@@ -1063,6 +1415,214 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
return Call;
}
+/// Find or create a BasicBlock in the outlined function containing PhiBlocks
+/// for \p RetVal.
+///
+/// \param Group - The OutlinableGroup containing the information about the
+/// overall outlined function.
+/// \param RetVal - The return value or exit option that we are currently
+/// evaluating.
+/// \returns The found or newly created BasicBlock to contain the needed
+/// PHINodes to be used as outputs.
+static BasicBlock *findOrCreatePHIBlock(OutlinableGroup &Group, Value *RetVal) {
+ DenseMap<Value *, BasicBlock *>::iterator PhiBlockForRetVal,
+ ReturnBlockForRetVal;
+ PhiBlockForRetVal = Group.PHIBlocks.find(RetVal);
+ ReturnBlockForRetVal = Group.EndBBs.find(RetVal);
+ assert(ReturnBlockForRetVal != Group.EndBBs.end() &&
+ "Could not find output value!");
+ BasicBlock *ReturnBB = ReturnBlockForRetVal->second;
+
+ // Find if a PHIBlock exists for this return value already. If it is
+ // the first time we are analyzing this, we will not, so we record it.
+ PhiBlockForRetVal = Group.PHIBlocks.find(RetVal);
+ if (PhiBlockForRetVal != Group.PHIBlocks.end())
+ return PhiBlockForRetVal->second;
+
+ // If we did not find a block, we create one, and insert it into the
+ // overall function and record it.
+ bool Inserted = false;
+ BasicBlock *PHIBlock = BasicBlock::Create(ReturnBB->getContext(), "phi_block",
+ ReturnBB->getParent());
+ std::tie(PhiBlockForRetVal, Inserted) =
+ Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock));
+
+ // We find the predecessors of the return block in the newly created outlined
+ // function in order to point them to the new PHIBlock rather than the already
+ // existing return block.
+ SmallVector<BranchInst *, 2> BranchesToChange;
+ for (BasicBlock *Pred : predecessors(ReturnBB))
+ BranchesToChange.push_back(cast<BranchInst>(Pred->getTerminator()));
+
+ // Now we mark the branch instructions found, and change the references of the
+ // return block to the newly created PHIBlock.
+ for (BranchInst *BI : BranchesToChange)
+ for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ < End; Succ++) {
+ if (BI->getSuccessor(Succ) != ReturnBB)
+ continue;
+ BI->setSuccessor(Succ, PHIBlock);
+ }
+
+ BranchInst::Create(ReturnBB, PHIBlock);
+
+ return PhiBlockForRetVal->second;
+}
+
+/// For the function call now representing the \p Region, find the passed value
+/// to that call that represents Argument \p A at the call location if the
+/// call has already been replaced with a call to the overall, aggregate
+/// function.
+///
+/// \param A - The Argument to get the passed value for.
+/// \param Region - The extracted Region corresponding to the outlined function.
+/// \returns The Value representing \p A at the call site.
+static Value *
+getPassedArgumentInAlreadyOutlinedFunction(const Argument *A,
+ const OutlinableRegion &Region) {
+ // If we don't need to adjust the argument number at all (since the call
+ // has already been replaced by a call to the overall outlined function)
+ // we can just get the specified argument.
+ return Region.Call->getArgOperand(A->getArgNo());
+}
+
+/// For the function call now representing the \p Region, find the passed value
+/// to that call that represents Argument \p A at the call location if the
+/// call has only been replaced by the call to the aggregate function.
+///
+/// \param A - The Argument to get the passed value for.
+/// \param Region - The extracted Region corresponding to the outlined function.
+/// \returns The Value representing \p A at the call site.
+static Value *
+getPassedArgumentAndAdjustArgumentLocation(const Argument *A,
+ const OutlinableRegion &Region) {
+ unsigned ArgNum = A->getArgNo();
+
+ // If it is a constant, we can look at our mapping from when we created
+ // the outputs to figure out what the constant value is.
+ if (Region.AggArgToConstant.count(ArgNum))
+ return Region.AggArgToConstant.find(ArgNum)->second;
+
+ // If it is not a constant, and we are not looking at the overall function, we
+ // need to adjust which argument we are looking at.
+ ArgNum = Region.AggArgToExtracted.find(ArgNum)->second;
+ return Region.Call->getArgOperand(ArgNum);
+}
+
+/// Find the canonical numbering for the incoming Values into the PHINode \p PN.
+///
+/// \param PN [in] - The PHINode that we are finding the canonical numbers for.
+/// \param Region [in] - The OutlinableRegion containing \p PN.
+/// \param OutputMappings [in] - The mapping of output values from outlined
+/// region to their original values.
+/// \param CanonNums [out] - The canonical numbering for the incoming values to
+/// \p PN.
+/// \param ReplacedWithOutlinedCall - A flag to use the extracted function call
+/// of \p Region rather than the overall function's call.
+static void
+findCanonNumsForPHI(PHINode *PN, OutlinableRegion &Region,
+ const DenseMap<Value *, Value *> &OutputMappings,
+ DenseSet<unsigned> &CanonNums,
+ bool ReplacedWithOutlinedCall = true) {
+ // Iterate over the incoming values.
+ for (unsigned Idx = 0, EIdx = PN->getNumIncomingValues(); Idx < EIdx; Idx++) {
+ Value *IVal = PN->getIncomingValue(Idx);
+ // If we have an argument as incoming value, we need to grab the passed
+ // value from the call itself.
+ if (Argument *A = dyn_cast<Argument>(IVal)) {
+ if (ReplacedWithOutlinedCall)
+ IVal = getPassedArgumentInAlreadyOutlinedFunction(A, Region);
+ else
+ IVal = getPassedArgumentAndAdjustArgumentLocation(A, Region);
+ }
+
+ // Get the original value if it has been replaced by an output value.
+ IVal = findOutputMapping(OutputMappings, IVal);
+
+ // Find and add the canonical number for the incoming value.
+ Optional<unsigned> GVN = Region.Candidate->getGVN(IVal);
+ assert(GVN.hasValue() && "No GVN for incoming value");
+ Optional<unsigned> CanonNum = Region.Candidate->getCanonicalNum(*GVN);
+ assert(CanonNum.hasValue() && "No Canonical Number for GVN");
+ CanonNums.insert(*CanonNum);
+ }
+}
+
+/// Find, or add PHINode \p PN to the combined PHINode Block \p OverallPHIBlock
+/// in order to condense the number of instructions added to the outlined
+/// function.
+///
+/// \param PN [in] - The PHINode that we are finding the canonical numbers for.
+/// \param Region [in] - The OutlinableRegion containing \p PN.
+/// \param OverallPhiBlock [in] - The overall PHIBlock we are trying to find
+/// \p PN in.
+/// \param OutputMappings [in] - The mapping of output values from outlined
+/// region to their original values.
+/// \return the newly found or created PHINode in \p OverallPhiBlock.
+static PHINode*
+findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region,
+ BasicBlock *OverallPhiBlock,
+ const DenseMap<Value *, Value *> &OutputMappings) {
+ OutlinableGroup &Group = *Region.Parent;
+
+ DenseSet<unsigned> PNCanonNums;
+  // We have to use the extracted function since we have not merged this region
+  // into the overall function yet. We make sure to reassign the argument numbering
+ // since it is possible that the argument ordering is different between the
+ // functions.
+ findCanonNumsForPHI(&PN, Region, OutputMappings, PNCanonNums,
+ /* ReplacedWithOutlinedCall = */ false);
+
+ OutlinableRegion *FirstRegion = Group.Regions[0];
+ DenseSet<unsigned> CurrentCanonNums;
+  // Find the canonical numbering for each PHINode; if it matches, we replace
+  // the uses of the PHINode we are searching for with the found PHINode.
+ for (PHINode &CurrPN : OverallPhiBlock->phis()) {
+ CurrentCanonNums.clear();
+ findCanonNumsForPHI(&CurrPN, *FirstRegion, OutputMappings, CurrentCanonNums,
+ /* ReplacedWithOutlinedCall = */ true);
+
+ if (all_of(PNCanonNums, [&CurrentCanonNums](unsigned CanonNum) {
+ return CurrentCanonNums.contains(CanonNum);
+ }))
+ return &CurrPN;
+ }
+
+ // If we've made it here, it means we weren't able to replace the PHINode, so
+ // we must insert it ourselves.
+ PHINode *NewPN = cast<PHINode>(PN.clone());
+ NewPN->insertBefore(&*OverallPhiBlock->begin());
+ for (unsigned Idx = 0, Edx = NewPN->getNumIncomingValues(); Idx < Edx;
+ Idx++) {
+ Value *IncomingVal = NewPN->getIncomingValue(Idx);
+ BasicBlock *IncomingBlock = NewPN->getIncomingBlock(Idx);
+
+ // Find corresponding basic block in the overall function for the incoming
+ // block.
+ Instruction *FirstNonPHI = IncomingBlock->getFirstNonPHI();
+ assert(FirstNonPHI && "Incoming block is empty?");
+ Value *CorrespondingVal =
+ Region.findCorrespondingValueIn(*FirstRegion, FirstNonPHI);
+ assert(CorrespondingVal && "Value is nullptr?");
+ BasicBlock *BlockToUse = cast<Instruction>(CorrespondingVal)->getParent();
+ NewPN->setIncomingBlock(Idx, BlockToUse);
+
+ // If we have an argument we make sure we replace using the argument from
+ // the correct function.
+ if (Argument *A = dyn_cast<Argument>(IncomingVal)) {
+ Value *Val = Group.OutlinedFunction->getArg(A->getArgNo());
+ NewPN->setIncomingValue(Idx, Val);
+ continue;
+ }
+
+ // Find the corresponding value in the overall function.
+ IncomingVal = findOutputMapping(OutputMappings, IncomingVal);
+ Value *Val = Region.findCorrespondingValueIn(*FirstRegion, IncomingVal);
+ assert(Val && "Value is nullptr?");
+ NewPN->setIncomingValue(Idx, Val);
+ }
+ return NewPN;
+}
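Note (not part of the patch): findOrCreatePHIInBlock reuses an existing PHINode in the overall block whenever every canonical number feeding the candidate PHINode is already covered by that existing node; only when no such node exists is a clone inserted and rewired. A small standalone sketch of the matching rule:

#include <algorithm>
#include <unordered_set>

// True if every canonical number feeding the candidate PHI is already covered
// by an existing PHI's canonical numbers, i.e. the existing node can be reused.
static bool canReuseExistingPHI(const std::unordered_set<unsigned> &Candidate,
                                const std::unordered_set<unsigned> &Existing) {
  return std::all_of(Candidate.begin(), Candidate.end(), [&](unsigned CanonNum) {
    return Existing.count(CanonNum) > 0;
  });
}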
+
// Within an extracted function, replace the argument uses of the extracted
// region with the arguments of the function for an OutlinableGroup.
//
@@ -1075,6 +1635,7 @@ CallInst *replaceCalledFunction(Module &M, OutlinableRegion &Region) {
static void
replaceArgumentUses(OutlinableRegion &Region,
DenseMap<Value *, BasicBlock *> &OutputBBs,
+ const DenseMap<Value *, Value *> &OutputMappings,
bool FirstFunction = false) {
OutlinableGroup &Group = *Region.Parent;
assert(Region.ExtractedFunction && "Region has no extracted function?");
@@ -1144,12 +1705,47 @@ replaceArgumentUses(OutlinableRegion &Region,
LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to "
<< *OutputBB << "\n");
- if (FirstFunction)
+ // If this is storing a PHINode, we must make sure it is included in the
+ // overall function.
+ if (!isa<PHINode>(ValueOperand) ||
+ Region.Candidate->getGVN(ValueOperand).hasValue()) {
+ if (FirstFunction)
+ continue;
+ Value *CorrVal =
+ Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
+ assert(CorrVal && "Value is nullptr?");
+ NewI->setOperand(0, CorrVal);
+ continue;
+ }
+ PHINode *PN = cast<PHINode>(SI->getValueOperand());
+ // If it has a value, it was not split by the code extractor, which
+ // is what we are looking for.
+ if (Region.Candidate->getGVN(PN).hasValue())
continue;
- Value *CorrVal =
- Region.findCorrespondingValueIn(*Group.Regions[0], ValueOperand);
- assert(CorrVal && "Value is nullptr?");
- NewI->setOperand(0, CorrVal);
+
+ // We record the parent block for the PHINode in the Region so that
+ // we can exclude it from checks later on.
+ Region.PHIBlocks.insert(std::make_pair(RetVal, PN->getParent()));
+
+      // If this is the first function, we do not need to worry about merging
+ // this with any other block in the overall outlined function, so we can
+ // just continue.
+ if (FirstFunction) {
+ BasicBlock *PHIBlock = PN->getParent();
+ Group.PHIBlocks.insert(std::make_pair(RetVal, PHIBlock));
+ continue;
+ }
+
+ // We look for the aggregate block that contains the PHINodes leading into
+ // this exit path. If we can't find one, we create one.
+ BasicBlock *OverallPhiBlock = findOrCreatePHIBlock(Group, RetVal);
+
+ // For our PHINode, we find the combined canonical numbering, and
+ // attempt to find a matching PHINode in the overall PHIBlock. If we
+ // cannot, we copy the PHINode and move it into this new block.
+ PHINode *NewPN =
+ findOrCreatePHIInBlock(*PN, Region, OverallPhiBlock, OutputMappings);
+ NewI->setOperand(0, NewPN);
}
// If we added an edge for basic blocks without a predecessor, we remove it
@@ -1390,7 +1986,12 @@ void createSwitchStatement(
Module &M, OutlinableGroup &OG, DenseMap<Value *, BasicBlock *> &EndBBs,
std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs) {
// We only need the switch statement if there is more than one store
- // combination.
+ // combination, or there is more than one set of output blocks. The first
+ // will occur when we store different sets of values for two different
+ // regions. The second will occur when we have two outputs that are combined
+ // in a PHINode outside of the region in one outlined instance, and are used
+  // separately in another. This will create the same set of OutputGVNs, but
+ // will generate two different output schemes.
if (OG.OutputGVNCombinations.size() > 1) {
Function *AggFunc = OG.OutlinedFunction;
// Create a final block for each different return block.
@@ -1433,8 +2034,14 @@ void createSwitchStatement(
return;
}
+ assert(OutputStoreBBs.size() < 2 && "Different store sets not handled!");
+
// If there needs to be stores, move them from the output blocks to their
- // corresponding ending block.
+ // corresponding ending block. We do not check that the OutputGVNCombinations
+  // is equal to 1 here since that could just be the case where there are 0
+ // outputs. Instead, we check whether there is more than one set of output
+ // blocks since this is the only case where we would have to move the
+ // stores, and erase the extraneous blocks.
if (OutputStoreBBs.size() == 1) {
LLVM_DEBUG(dbgs() << "Move store instructions to the end block in "
<< *OG.OutlinedFunction << "\n");
@@ -1466,10 +2073,13 @@ void createSwitchStatement(
/// set of stores needed for the different functions.
/// \param [in,out] FuncsToRemove - Extracted functions to erase from module
/// once outlining is complete.
+/// \param [in] OutputMappings - The mapping of output values from the
+/// outlined regions to their original values in the module.
static void fillOverallFunction(
Module &M, OutlinableGroup &CurrentGroup,
std::vector<DenseMap<Value *, BasicBlock *>> &OutputStoreBBs,
- std::vector<Function *> &FuncsToRemove) {
+ std::vector<Function *> &FuncsToRemove,
+ const DenseMap<Value *, Value *> &OutputMappings) {
OutlinableRegion *CurrentOS = CurrentGroup.Regions[0];
// Move first extracted function's instructions into new function.
@@ -1489,7 +2099,7 @@ static void fillOverallFunction(
CurrentGroup.OutlinedFunction, "output_block_0");
CurrentOS->OutputBlockNum = 0;
- replaceArgumentUses(*CurrentOS, NewBBs, true);
+ replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings, true);
replaceConstants(*CurrentOS);
// We first identify if any output blocks are empty, if they are we remove
@@ -1523,7 +2133,8 @@ void IROutliner::deduplicateExtractedSections(
OutlinableRegion *CurrentOS;
- fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove);
+ fillOverallFunction(M, CurrentGroup, OutputStoreBBs, FuncsToRemove,
+ OutputMappings);
std::vector<Value *> SortedKeys;
for (unsigned Idx = 1; Idx < CurrentGroup.Regions.size(); Idx++) {
@@ -1537,8 +2148,7 @@ void IROutliner::deduplicateExtractedSections(
createAndInsertBasicBlocks(
CurrentGroup.EndBBs, NewBBs, CurrentGroup.OutlinedFunction,
"output_block_" + Twine(static_cast<unsigned>(Idx)));
-
- replaceArgumentUses(*CurrentOS, NewBBs);
+ replaceArgumentUses(*CurrentOS, NewBBs, OutputMappings);
alignOutputBlockWithAggFunc(CurrentGroup, *CurrentOS, NewBBs,
CurrentGroup.EndBBs, OutputMappings,
OutputStoreBBs);
@@ -1637,7 +2247,7 @@ void IROutliner::pruneIncompatibleRegions(
if (FirstCandidate.getLength() == 2) {
if (isa<CallInst>(FirstCandidate.front()->Inst) &&
isa<BranchInst>(FirstCandidate.back()->Inst))
- return;
+ return;
}
unsigned CurrentEndIdx = 0;
@@ -1706,6 +2316,34 @@ IROutliner::findBenefitFromAllRegions(OutlinableGroup &CurrentGroup) {
return RegionBenefit;
}
+/// For the \p OutputCanon number passed in, find the value represented by this
+/// canonical number. If it is from a PHINode, we pick the first incoming
+/// value and return that Value instead.
+///
+/// \param Region - The OutlinableRegion to get the Value from.
+/// \param OutputCanon - The canonical number to find the Value from.
+/// \returns The Value represented by a canonical number \p OutputCanon in \p
+/// Region.
+static Value *findOutputValueInRegion(OutlinableRegion &Region,
+ unsigned OutputCanon) {
+ OutlinableGroup &CurrentGroup = *Region.Parent;
+ // If the value is greater than the value in the tracker, we have a
+ // PHINode and will instead use one of the incoming values to find the
+ // type.
+ if (OutputCanon > CurrentGroup.PHINodeGVNTracker) {
+ auto It = CurrentGroup.PHINodeGVNToGVNs.find(OutputCanon);
+ assert(It != CurrentGroup.PHINodeGVNToGVNs.end() &&
+ "Could not find GVN set for PHINode number!");
+ assert(It->second.second.size() > 0 && "PHINode does not have any values!");
+ OutputCanon = *It->second.second.begin();
+ }
+ Optional<unsigned> OGVN = Region.Candidate->fromCanonicalNum(OutputCanon);
+ assert(OGVN.hasValue() && "Could not find GVN for Canonical Number?");
+ Optional<Value *> OV = Region.Candidate->fromGVN(*OGVN);
+ assert(OV.hasValue() && "Could not find value for GVN?");
+ return *OV;
+}
+
InstructionCost
IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) {
InstructionCost OverallCost = 0;
@@ -1713,10 +2351,8 @@ IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) {
TargetTransformInfo &TTI = getTTI(*Region->StartBB->getParent());
// Each output incurs a load after the call, so we add that to the cost.
- for (unsigned OutputGVN : Region->GVNStores) {
- Optional<Value *> OV = Region->Candidate->fromGVN(OutputGVN);
- assert(OV.hasValue() && "Could not find value for GVN?");
- Value *V = OV.getValue();
+ for (unsigned OutputCanon : Region->GVNStores) {
+ Value *V = findOutputValueInRegion(*Region, OutputCanon);
InstructionCost LoadCost =
TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0,
TargetTransformInfo::TCK_CodeSize);
@@ -1745,6 +2381,7 @@ static InstructionCost findCostForOutputBlocks(Module &M,
InstructionCost OutputCost = 0;
unsigned NumOutputBranches = 0;
+ OutlinableRegion &FirstRegion = *CurrentGroup.Regions[0];
IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate;
DenseSet<BasicBlock *> CandidateBlocks;
Candidate.getBasicBlocks(CandidateBlocks);
@@ -1770,10 +2407,8 @@ static InstructionCost findCostForOutputBlocks(Module &M,
for (const ArrayRef<unsigned> &OutputUse :
CurrentGroup.OutputGVNCombinations) {
- for (unsigned GVN : OutputUse) {
- Optional<Value *> OV = Candidate.fromGVN(GVN);
- assert(OV.hasValue() && "Could not find value for GVN?");
- Value *V = OV.getValue();
+ for (unsigned OutputCanon : OutputUse) {
+ Value *V = findOutputValueInRegion(FirstRegion, OutputCanon);
InstructionCost StoreCost =
TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0,
TargetTransformInfo::TCK_CodeSize);
@@ -1974,6 +2609,7 @@ bool IROutliner::extractSection(OutlinableRegion &Region) {
unsigned IROutliner::doOutline(Module &M) {
// Find the possible similarity sections.
InstructionClassifier.EnableBranches = !DisableBranches;
+ InstructionClassifier.EnableIndirectCalls = !DisableIndirectCalls;
IRSimilarityIdentifier &Identifier = getIRSI(M);
SimilarityGroupList &SimilarityCandidates = *Identifier.getSimilarity();
@@ -2033,8 +2669,8 @@ unsigned IROutliner::doOutline(Module &M) {
continue;
SmallVector<BasicBlock *> BE;
- DenseSet<BasicBlock *> BBSet;
- OS->Candidate->getBasicBlocks(BBSet, BE);
+ DenseSet<BasicBlock *> BlocksInRegion;
+ OS->Candidate->getBasicBlocks(BlocksInRegion, BE);
OS->CE = new (ExtractorAllocator.Allocate())
CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
false, "outlined");
@@ -2144,8 +2780,8 @@ unsigned IROutliner::doOutline(Module &M) {
OutlinedRegions.clear();
for (OutlinableRegion *OS : CurrentGroup.Regions) {
SmallVector<BasicBlock *> BE;
- DenseSet<BasicBlock *> BBSet;
- OS->Candidate->getBasicBlocks(BBSet, BE);
+ DenseSet<BasicBlock *> BlocksInRegion;
+ OS->Candidate->getBasicBlocks(BlocksInRegion, BE);
OS->CE = new (ExtractorAllocator.Allocate())
CodeExtractor(BE, nullptr, false, nullptr, nullptr, nullptr, false,
false, "outlined");
diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
index 4e3689f09536..49babc24cb82 100644
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -92,6 +92,11 @@ static cl::opt<bool>
DisableInlinedAllocaMerging("disable-inlined-alloca-merging",
cl::init(false), cl::Hidden);
+/// A flag used for testing, so we can print the content of the advisor when
+/// running it as part of the default (e.g. -O3) pipeline.
+static cl::opt<bool> KeepAdvisorForPrinting("keep-inline-advisor-for-printing",
+ cl::init(false), cl::Hidden);
+
extern cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats;
static cl::opt<std::string> CGSCCInlineReplayFile(
@@ -660,7 +665,7 @@ bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
}
if (!DeadFunctionsInComdats.empty()) {
// Filter out the functions whose comdats remain alive.
- filterDeadComdatFunctions(CG.getModule(), DeadFunctionsInComdats);
+ filterDeadComdatFunctions(DeadFunctionsInComdats);
// Remove the rest.
for (Function *F : DeadFunctionsInComdats)
RemoveCGN(CG[F]);
@@ -741,7 +746,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
InlineAdvisor &Advisor = getAdvisor(MAMProxy, FAM, M);
Advisor.onPassEntry();
- auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); });
+ auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(&InitialC); });
// We use a single common worklist for calls across the entire SCC. We
// process these in-order and append new calls introduced during inlining to
@@ -823,6 +828,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// defer deleting these to make it easier to handle the call graph updates.
SmallVector<Function *, 4> DeadFunctions;
+ // Track potentially dead non-local functions with comdats to see if they can
+ // be deleted as a batch after inlining.
+ SmallVector<Function *, 4> DeadFunctionsInComdats;
+
// Loop forward over all of the calls.
while (!Calls->empty()) {
// We expect the calls to typically be batched with sequences of calls that
@@ -935,16 +944,15 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// Merge the attributes based on the inlining.
AttributeFuncs::mergeAttributesForInlining(F, Callee);
- // For local functions, check whether this makes the callee trivially
- // dead. In that case, we can drop the body of the function eagerly
- // which may reduce the number of callers of other functions to one,
- // changing inline cost thresholds.
+ // For local functions or discardable functions without comdats, check
+ // whether this makes the callee trivially dead. In that case, we can drop
+ // the body of the function eagerly which may reduce the number of callers
+ // of other functions to one, changing inline cost thresholds. Non-local
+ // discardable functions with comdats are checked later on.
bool CalleeWasDeleted = false;
- if (Callee.hasLocalLinkage()) {
- // To check this we also need to nuke any dead constant uses (perhaps
- // made dead by this operation on other functions).
- Callee.removeDeadConstantUsers();
- if (Callee.use_empty() && !CG.isLibFunction(Callee)) {
+ if (Callee.isDiscardableIfUnused() && Callee.hasZeroLiveUses() &&
+ !CG.isLibFunction(Callee)) {
+ if (Callee.hasLocalLinkage() || !Callee.hasComdat()) {
Calls->erase_if([&](const std::pair<CallBase *, int> &Call) {
return Call.first->getCaller() == &Callee;
});
@@ -957,6 +965,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
"Cannot put cause a function to become dead twice!");
DeadFunctions.push_back(&Callee);
CalleeWasDeleted = true;
+ } else {
+ DeadFunctionsInComdats.push_back(&Callee);
}
}
if (CalleeWasDeleted)
@@ -1019,6 +1029,15 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
FAM.invalidate(F, PreservedAnalyses::none());
}
+ // We must ensure that we only delete functions with comdats if every function
+ // in the comdat is going to be deleted.
+ if (!DeadFunctionsInComdats.empty()) {
+ filterDeadComdatFunctions(DeadFunctionsInComdats);
+ for (auto *Callee : DeadFunctionsInComdats)
+ Callee->dropAllReferences();
+ DeadFunctions.append(DeadFunctionsInComdats);
+ }
+
// Now that we've finished inlining all of the calls across this SCC, delete
// all of the trivially dead functions, updating the call graph and the CGSCC
// pass manager in the process.
@@ -1045,14 +1064,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
UR.UpdatedC = nullptr;
// And delete the actual function from the module.
- // The Advisor may use Function pointers to efficiently index various
- // internal maps, e.g. for memoization. Function cleanup passes like
- // argument promotion create new functions. It is possible for a new
- // function to be allocated at the address of a deleted function. We could
- // index using names, but that's inefficient. Alternatively, we let the
- // Advisor free the functions when it sees fit.
- DeadF->getBasicBlockList().clear();
- M.getFunctionList().remove(DeadF);
+ M.getFunctionList().erase(DeadF);
++NumDeleted;
}
@@ -1073,8 +1085,7 @@ ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params,
bool MandatoryFirst,
InliningAdvisorMode Mode,
unsigned MaxDevirtIterations)
- : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations),
- PM(), MPM() {
+ : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations) {
// Run the inliner first. The theory is that we are walking bottom-up and so
// the callees have already been fully optimized, and we want to inline them
// into the callers so that our optimizations can reflect that.
@@ -1118,7 +1129,8 @@ PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M,
// Discard the InlineAdvisor, a subsequent inlining session should construct
// its own.
auto PA = PreservedAnalyses::all();
- PA.abandon<InlineAdvisorAnalysis>();
+ if (!KeepAdvisorForPrinting)
+ PA.abandon<InlineAdvisorAnalysis>();
return PA;
}
diff --git a/llvm/lib/Transforms/IPO/ModuleInliner.cpp b/llvm/lib/Transforms/IPO/ModuleInliner.cpp
index ebf080e87c3b..d515303e4911 100644
--- a/llvm/lib/Transforms/IPO/ModuleInliner.cpp
+++ b/llvm/lib/Transforms/IPO/ModuleInliner.cpp
@@ -335,14 +335,7 @@ PreservedAnalyses ModuleInlinerPass::run(Module &M,
FAM.clear(*DeadF, DeadF->getName());
// And delete the actual function from the module.
- // The Advisor may use Function pointers to efficiently index various
- // internal maps, e.g. for memoization. Function cleanup passes like
- // argument promotion create new functions. It is possible for a new
- // function to be allocated at the address of a deleted function. We could
- // index using names, but that's inefficient. Alternatively, we let the
- // Advisor free the functions when it sees fit.
- DeadF->getBasicBlockList().clear();
- M.getFunctionList().remove(DeadF);
+ M.getFunctionList().erase(DeadF);
++NumDeleted;
}
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index f289e3ecc979..68f33410c602 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/CallGraph.h"
@@ -153,14 +154,6 @@ static constexpr auto TAG = "[" DEBUG_TYPE "]";
namespace {
-enum class AddressSpace : unsigned {
- Generic = 0,
- Global = 1,
- Shared = 3,
- Constant = 4,
- Local = 5,
-};
-
struct AAHeapToShared;
struct AAICVTracker;
@@ -170,7 +163,7 @@ struct AAICVTracker;
struct OMPInformationCache : public InformationCache {
OMPInformationCache(Module &M, AnalysisGetter &AG,
BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
- SmallPtrSetImpl<Kernel> &Kernels)
+ KernelSet &Kernels)
: InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
Kernels(Kernels) {
@@ -424,6 +417,12 @@ struct OMPInformationCache : public InformationCache {
recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
}
+ // Helper function to inherit the calling convention of the function callee.
+ void setCallingConvention(FunctionCallee Callee, CallInst *CI) {
+ if (Function *Fn = dyn_cast<Function>(Callee.getCallee()))
+ CI->setCallingConv(Fn->getCallingConv());
+ }
+
/// Helper to initialize all runtime function information for those defined
/// in OpenMPKinds.def.
void initializeRuntimeFunctions() {
@@ -485,7 +484,7 @@ struct OMPInformationCache : public InformationCache {
}
/// Collection of known kernels (\see Kernel) in the module.
- SmallPtrSetImpl<Kernel> &Kernels;
+ KernelSet &Kernels;
/// Collection of known OpenMP runtime functions.
DenseSet<const Function *> RTLFunctions;
@@ -1013,7 +1012,8 @@ private:
// into a single parallel region is contained in a single basic block
// without any other instructions. We use the OpenMPIRBuilder to outline
// that block and call the resulting function via __kmpc_fork_call.
- auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
+ auto Merge = [&](const SmallVectorImpl<CallInst *> &MergableCIs,
+ BasicBlock *BB) {
// TODO: Change the interface to allow single CIs expanded, e.g, to
// include an outer loop.
assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
@@ -1075,8 +1075,7 @@ private:
BranchInst::Create(AfterBB, AfterIP.getBlock());
// Perform the actual outlining.
- OMPInfoCache.OMPBuilder.finalize(OriginalFn,
- /* AllowExtractorSinking */ true);
+ OMPInfoCache.OMPBuilder.finalize(OriginalFn);
Function *OutlinedFn = MergableCIs.front()->getCaller();
@@ -1538,6 +1537,7 @@ private:
CallInst *IssueCallsite =
CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
+ OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
RuntimeCall.eraseFromParent();
// Add "wait" runtime call declaration:
@@ -1550,7 +1550,9 @@ private:
OffloadArray::DeviceIDArgNum), // device_id.
Handle // handle to wait on.
};
- CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);
+ CallInst *WaitCallsite = CallInst::Create(
+ WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);
+ OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
return true;
}
@@ -1597,8 +1599,10 @@ private:
&F.getEntryBlock(), F.getEntryBlock().begin()));
// Create a fallback location if none was found.
// TODO: Use the debug locations of the calls instead.
- Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
- Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
+ uint32_t SrcLocStrSize;
+ Constant *Loc =
+ OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
+ Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
}
return Ident;
}
@@ -2171,7 +2175,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
};
auto CallCheck = [&](Instruction &I) {
- Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
+ Optional<Value *> ReplVal = getValueForCall(A, I, ICV);
if (ReplVal.hasValue() &&
ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
HasChanged = ChangeStatus::CHANGED;
@@ -2197,12 +2201,12 @@ struct AAICVTrackerFunction : public AAICVTracker {
return HasChanged;
}
- /// Hepler to check if \p I is a call and get the value for it if it is
+ /// Helper to check if \p I is a call and get the value for it if it is
/// unique.
- Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
+ Optional<Value *> getValueForCall(Attributor &A, const Instruction &I,
InternalControlVar &ICV) const {
- const auto *CB = dyn_cast<CallBase>(I);
+ const auto *CB = dyn_cast<CallBase>(&I);
if (!CB || CB->hasFnAttr("no_openmp") ||
CB->hasFnAttr("no_openmp_routines"))
return None;
@@ -2218,8 +2222,8 @@ struct AAICVTrackerFunction : public AAICVTracker {
if (CalledFunction == GetterRFI.Declaration)
return None;
if (CalledFunction == SetterRFI.Declaration) {
- if (ICVReplacementValuesMap[ICV].count(I))
- return ICVReplacementValuesMap[ICV].lookup(I);
+ if (ICVReplacementValuesMap[ICV].count(&I))
+ return ICVReplacementValuesMap[ICV].lookup(&I);
return nullptr;
}
@@ -2231,8 +2235,11 @@ struct AAICVTrackerFunction : public AAICVTracker {
const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
*this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);
- if (ICVTrackingAA.isAssumedTracked())
- return ICVTrackingAA.getUniqueReplacementValue(ICV);
+ if (ICVTrackingAA.isAssumedTracked()) {
+ Optional<Value *> URV = ICVTrackingAA.getUniqueReplacementValue(ICV);
+ if (!URV || (*URV && AA::isValidAtPosition(**URV, I, OMPInfoCache)))
+ return URV;
+ }
// If we don't know, assume it changes.
return nullptr;
@@ -2284,7 +2291,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
break;
}
- Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
+ Optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV);
if (!NewReplVal.hasValue())
continue;
@@ -2548,7 +2555,7 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
}
/// Set of basic blocks that are executed by a single thread.
- DenseSet<const BasicBlock *> SingleThreadedBBs;
+ SmallSetVector<const BasicBlock *, 16> SingleThreadedBBs;
/// Total number of basic blocks in this function.
long unsigned NumBBs;
@@ -2572,7 +2579,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
if (!A.checkForAllCallSites(PredForCallSite, *this,
/* RequiresAllCallSites */ true,
AllCallSitesKnown))
- SingleThreadedBBs.erase(&F->getEntryBlock());
+ SingleThreadedBBs.remove(&F->getEntryBlock());
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
@@ -2637,7 +2644,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
for (auto *BB : RPOT) {
if (!MergePredecessorStates(BB))
- SingleThreadedBBs.erase(BB);
+ SingleThreadedBBs.remove(BB);
}
return (NumSingleThreadedBBs == SingleThreadedBBs.size())
@@ -2759,7 +2766,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
if (FreeCalls.size() != 1)
continue;
- ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0));
+ auto *AllocSize = cast<ConstantInt>(CB->getArgOperand(0));
LLVM_DEBUG(dbgs() << TAG << "Replace globalization call " << *CB
<< " with " << AllocSize->getZExtValue()
@@ -2772,7 +2779,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
auto *SharedMem = new GlobalVariable(
*M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
- UndefValue::get(Int8ArrTy), CB->getName(), nullptr,
+ UndefValue::get(Int8ArrTy), CB->getName() + "_shared", nullptr,
GlobalValue::NotThreadLocal,
static_cast<unsigned>(AddressSpace::Shared));
auto *NewBuffer =
@@ -2786,7 +2793,10 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
};
A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark);
- SharedMem->setAlignment(MaybeAlign(32));
+ MaybeAlign Alignment = CB->getRetAlign();
+ assert(Alignment &&
+ "HeapToShared on allocation without alignment attribute");
+ SharedMem->setAlignment(MaybeAlign(Alignment));
A.changeValueAfterManifest(*CB, *NewBuffer);
A.deleteAfterManifest(*CB);
@@ -2813,7 +2823,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
if (CallBase *CB = dyn_cast<CallBase>(U))
if (!isa<ConstantInt>(CB->getArgOperand(0)) ||
!ED.isExecutedByInitialThreadOnly(*CB))
- MallocCalls.erase(CB);
+ MallocCalls.remove(CB);
}
findPotentialRemovedFreeCalls(A);
@@ -2825,7 +2835,7 @@ struct AAHeapToSharedFunction : public AAHeapToShared {
}
/// Collection of all malloc calls in a function.
- SmallPtrSet<CallBase *, 4> MallocCalls;
+ SmallSetVector<CallBase *, 4> MallocCalls;
/// Collection of potentially removed free calls in a function.
SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
};
@@ -2962,7 +2972,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
UsedAssumedInformation = !isAtFixpoint();
auto *FalseVal =
- ConstantInt::getBool(IRP.getAnchorValue().getContext(), 0);
+ ConstantInt::getBool(IRP.getAnchorValue().getContext(), false);
return FalseVal;
};
@@ -3225,8 +3235,11 @@ struct AAKernelInfoFunction : AAKernelInfo {
OpenMPIRBuilder::LocationDescription Loc(
InsertPointTy(ParentBB, ParentBB->end()), DL);
OMPInfoCache.OMPBuilder.updateToLocation(Loc);
- auto *SrcLocStr = OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc);
- Value *Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr);
+ uint32_t SrcLocStrSize;
+ auto *SrcLocStr =
+ OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident =
+ OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);
// Add check for Tid in RegionCheckTidBB
@@ -3237,8 +3250,10 @@ struct AAKernelInfoFunction : AAKernelInfo {
FunctionCallee HardwareTidFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
- Value *Tid =
+ CallInst *Tid =
OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
+ Tid->setDebugLoc(DL);
+ OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
OMPInfoCache.OMPBuilder.Builder
.CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
@@ -3251,14 +3266,18 @@ struct AAKernelInfoFunction : AAKernelInfo {
M, OMPRTL___kmpc_barrier_simple_spmd);
OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
RegionBarrierBB, RegionBarrierBB->getFirstInsertionPt()));
- OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid})
- ->setDebugLoc(DL);
+ CallInst *Barrier =
+ OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});
+ Barrier->setDebugLoc(DL);
+ OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
// Second barrier ensures workers have read broadcast values.
- if (HasBroadcastValues)
- CallInst::Create(BarrierFn, {Ident, Tid}, "",
- RegionBarrierBB->getTerminator())
- ->setDebugLoc(DL);
+ if (HasBroadcastValues) {
+ CallInst *Barrier = CallInst::Create(BarrierFn, {Ident, Tid}, "",
+ RegionBarrierBB->getTerminator());
+ Barrier->setDebugLoc(DL);
+ OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
+ }
};
auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
@@ -3352,17 +3371,17 @@ struct AAKernelInfoFunction : AAKernelInfo {
OMP_TGT_EXEC_MODE_SPMD));
A.changeUseAfterManifest(
KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo),
- *ConstantInt::getBool(Ctx, 0));
+ *ConstantInt::getBool(Ctx, false));
A.changeUseAfterManifest(
KernelDeinitCB->getArgOperandUse(DeinitModeArgNo),
*ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx),
OMP_TGT_EXEC_MODE_SPMD));
A.changeUseAfterManifest(
KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo),
- *ConstantInt::getBool(Ctx, 0));
+ *ConstantInt::getBool(Ctx, false));
A.changeUseAfterManifest(
KernelDeinitCB->getArgOperandUse(DeinitRequiresFullRuntimeArgNo),
- *ConstantInt::getBool(Ctx, 0));
+ *ConstantInt::getBool(Ctx, false));
++NumOpenMPTargetRegionKernelsSPMD;
@@ -3403,7 +3422,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
// If not SPMD mode, indicate we use a custom state machine now.
auto &Ctx = getAnchorValue().getContext();
- auto *FalseVal = ConstantInt::getBool(Ctx, 0);
+ auto *FalseVal = ConstantInt::getBool(Ctx, false);
A.changeUseAfterManifest(
KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);
@@ -3528,10 +3547,12 @@ struct AAKernelInfoFunction : AAKernelInfo {
FunctionCallee WarpSizeFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_get_warp_size);
- Instruction *BlockHwSize =
+ CallInst *BlockHwSize =
CallInst::Create(BlockHwSizeFn, "block.hw_size", InitBB);
+ OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
BlockHwSize->setDebugLoc(DLoc);
- Instruction *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB);
+ CallInst *WarpSize = CallInst::Create(WarpSizeFn, "warp.size", InitBB);
+ OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
WarpSize->setDebugLoc(DLoc);
Instruction *BlockSize =
BinaryOperator::CreateSub(BlockHwSize, WarpSize, "block.size", InitBB);
@@ -3571,8 +3592,10 @@ struct AAKernelInfoFunction : AAKernelInfo {
FunctionCallee BarrierFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_barrier_simple_generic);
- CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB)
- ->setDebugLoc(DLoc);
+ CallInst *Barrier =
+ CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB);
+ OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
+ Barrier->setDebugLoc(DLoc);
if (WorkFnAI->getType()->getPointerAddressSpace() !=
(unsigned int)AddressSpace::Generic) {
@@ -3588,8 +3611,9 @@ struct AAKernelInfoFunction : AAKernelInfo {
FunctionCallee KernelParallelFn =
OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
M, OMPRTL___kmpc_kernel_parallel);
- Instruction *IsActiveWorker = CallInst::Create(
+ CallInst *IsActiveWorker = CallInst::Create(
KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
+ OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
IsActiveWorker->setDebugLoc(DLoc);
Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",
StateMachineBeginBB);
@@ -3669,10 +3693,13 @@ struct AAKernelInfoFunction : AAKernelInfo {
StateMachineIfCascadeCurrentBB)
->setDebugLoc(DLoc);
- CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
- M, OMPRTL___kmpc_kernel_end_parallel),
- {}, "", StateMachineEndParallelBB)
- ->setDebugLoc(DLoc);
+ FunctionCallee EndParallelFn =
+ OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
+ M, OMPRTL___kmpc_kernel_end_parallel);
+ CallInst *EndParallel =
+ CallInst::Create(EndParallelFn, {}, "", StateMachineEndParallelBB);
+ OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
+ EndParallel->setDebugLoc(DLoc);
BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)
->setDebugLoc(DLoc);
@@ -4508,6 +4535,8 @@ void OpenMPOpt::registerAAs(bool IsModulePass) {
bool UsedAssumedInformation = false;
A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
UsedAssumedInformation);
+ } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
}
}
}
diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 2d717475ce7f..5f2223e4047e 100644
--- a/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -169,8 +169,7 @@ struct FunctionOutliningInfo {
};
struct FunctionOutliningMultiRegionInfo {
- FunctionOutliningMultiRegionInfo()
- : ORI() {}
+ FunctionOutliningMultiRegionInfo() {}
// Container for outline regions
struct OutlineRegionInfo {
@@ -971,6 +970,9 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap(
};
for (User *User : Users) {
+ // Don't bother with BlockAddress used by CallBr for asm goto.
+ if (isa<BlockAddress>(User))
+ continue;
CallBase *CB = getSupportedCallBase(User);
Function *Caller = CB->getCaller();
if (CurrentCaller != Caller) {
@@ -1414,6 +1416,10 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
bool AnyInline = false;
for (User *User : Users) {
+ // Don't bother with BlockAddress used by CallBr for asm goto.
+ if (isa<BlockAddress>(User))
+ continue;
+
CallBase *CB = getSupportedCallBase(User);
if (isLimitReached())
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index eb1b8a29cfc5..0598f751febe 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -519,13 +519,6 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
unsigned NextTmpIdx = 0;
FAddend TmpResult[3];
- // Points to the constant addend of the resulting simplified expression.
- // If the resulting expr has constant-addend, this constant-addend is
- // desirable to reside at the top of the resulting expression tree. Placing
- // constant close to supper-expr(s) will potentially reveal some optimization
- // opportunities in super-expr(s).
- const FAddend *ConstAdd = nullptr;
-
// Simplified addends are placed <SimpVect>.
AddendVect SimpVect;
@@ -541,6 +534,14 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
}
Value *Val = ThisAddend->getSymVal();
+
+ // If the resulting expression has a constant addend, it is desirable for
+ // that constant to reside at the top of the resulting expression tree.
+ // Placing the constant close to the super-expression(s) will potentially
+ // reveal optimization opportunities in the super-expression(s). We
+ // intentionally do not implement this logic here and instead rely on the
+ // later call to SimplifyAssociativeOrCommutative.
+
unsigned StartIdx = SimpVect.size();
SimpVect.push_back(ThisAddend);
@@ -569,14 +570,8 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
// Pop all addends being folded and push the resulting folded addend.
SimpVect.resize(StartIdx);
- if (Val) {
- if (!R.isZero()) {
- SimpVect.push_back(&R);
- }
- } else {
- // Don't push constant addend at this time. It will be the last element
- // of <SimpVect>.
- ConstAdd = &R;
+ if (!R.isZero()) {
+ SimpVect.push_back(&R);
}
}
}
@@ -584,9 +579,6 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) &&
"out-of-bound access");
- if (ConstAdd)
- SimpVect.push_back(ConstAdd);
-
Value *Result;
if (!SimpVect.empty())
Result = createNaryFAdd(SimpVect, InstrQuota);
@@ -1296,6 +1288,9 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
// (A*B)+(A*C) -> A*(B+C) etc
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
@@ -1498,15 +1493,18 @@ static Instruction *factorizeFAddFSub(BinaryOperator &I,
return Lerp;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (!Op0->hasOneUse() || !Op1->hasOneUse())
+ return nullptr;
+
Value *X, *Y, *Z;
bool IsFMul;
- if ((match(Op0, m_OneUse(m_FMul(m_Value(X), m_Value(Z)))) &&
- match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))) ||
- (match(Op0, m_OneUse(m_FMul(m_Value(Z), m_Value(X)))) &&
- match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))))
+ if ((match(Op0, m_FMul(m_Value(X), m_Value(Z))) &&
+ match(Op1, m_c_FMul(m_Value(Y), m_Specific(Z)))) ||
+ (match(Op0, m_FMul(m_Value(Z), m_Value(X))) &&
+ match(Op1, m_c_FMul(m_Value(Y), m_Specific(Z)))))
IsFMul = true;
- else if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Z)))) &&
- match(Op1, m_OneUse(m_FDiv(m_Value(Y), m_Specific(Z)))))
+ else if (match(Op0, m_FDiv(m_Value(X), m_Value(Z))) &&
+ match(Op1, m_FDiv(m_Value(Y), m_Specific(Z))))
IsFMul = false;
else
return nullptr;
@@ -1541,6 +1539,9 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I))
return FoldedFAdd;
@@ -1654,6 +1655,14 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
{X->getType()}, {NewStartC, X}, &I));
}
+ // (X * MulC) + X --> X * (MulC + 1.0)
+ Constant *MulC;
+ if (match(&I, m_c_FAdd(m_FMul(m_Value(X), m_ImmConstant(MulC)),
+ m_Deferred(X)))) {
+ MulC = ConstantExpr::getFAdd(MulC, ConstantFP::get(I.getType(), 1.0));
+ return BinaryOperator::CreateFMulFMF(X, MulC, &I);
+ }
+
if (Value *V = FAddCombine(Builder).simplify(&I))
return replaceInstUsesWith(I, V);
}
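As an illustrative aside (not part of the patch): the new (X * MulC) + X --> X * (MulC + 1.0) rewrite is justified by reassociation, which is why the transform above builds the replacement with CreateFMulFMF and so propagates the instruction's fast-math flags. A minimal standalone C++ check, using values for which the two forms happen to be bit-identical:

    #include <cassert>

    int main() {
      double X = 1.5, MulC = 3.0;
      // (X * MulC) + X == X * (MulC + 1.0); exact for these inputs, and in
      // general only valid under reassociation (fast-math).
      assert(X * MulC + X == X * (MulC + 1.0));
      return 0;
    }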
@@ -1748,6 +1757,9 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
// If this is a 'B = x-(-A)', change to B = x+A.
@@ -2310,6 +2322,9 @@ Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
// Subtraction from -0.0 is the canonical form of fneg.
// fsub -0.0, X ==> fneg X
// fsub nsz 0.0, X ==> fneg nsz X
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index de1034c910d5..6bbb0251f2bc 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1727,25 +1727,37 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I,
(Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Value *A, *B, *C, *X, *Y;
+ Value *A, *B, *C, *X, *Y, *Dummy;
+
+ // Match the following expressions:
+ // (~(A | B) & C)
+ // (~(A & B) | C)
+ // Captures X = ~(A | B) or ~(A & B)
+ const auto matchNotOrAnd =
+ [Opcode, FlippedOpcode](Value *Op, auto m_A, auto m_B, auto m_C,
+ Value *&X, bool CountUses = false) -> bool {
+ if (CountUses && !Op->hasOneUse())
+ return false;
+
+ if (match(Op, m_c_BinOp(FlippedOpcode,
+ m_CombineAnd(m_Value(X),
+ m_Not(m_c_BinOp(Opcode, m_A, m_B))),
+ m_C)))
+ return !CountUses || X->hasOneUse();
+
+ return false;
+ };
// (~(A | B) & C) | ... --> ...
// (~(A & B) | C) & ... --> ...
// TODO: One-use checks are conservative. We just need to check that the total
// number of multiply-used values does not exceed the reduction
// in operations.
- if (match(Op0,
- m_c_BinOp(FlippedOpcode,
- m_CombineAnd(m_Value(X), m_Not(m_BinOp(Opcode, m_Value(A),
- m_Value(B)))),
- m_Value(C)))) {
+ if (matchNotOrAnd(Op0, m_Value(A), m_Value(B), m_Value(C), X)) {
// (~(A | B) & C) | (~(A | C) & B) --> (B ^ C) & ~A
// (~(A & B) | C) & (~(A & C) | B) --> ~((B ^ C) & A)
- if (match(Op1,
- m_OneUse(m_c_BinOp(FlippedOpcode,
- m_OneUse(m_Not(m_c_BinOp(Opcode, m_Specific(A),
- m_Specific(C)))),
- m_Specific(B))))) {
+ if (matchNotOrAnd(Op1, m_Specific(A), m_Specific(C), m_Specific(B), Dummy,
+ true)) {
Value *Xor = Builder.CreateXor(B, C);
return (Opcode == Instruction::Or)
? BinaryOperator::CreateAnd(Xor, Builder.CreateNot(A))
@@ -1754,11 +1766,8 @@ static Instruction *foldComplexAndOrPatterns(BinaryOperator &I,
// (~(A | B) & C) | (~(B | C) & A) --> (A ^ C) & ~B
// (~(A & B) | C) & (~(B & C) | A) --> ~((A ^ C) & B)
- if (match(Op1,
- m_OneUse(m_c_BinOp(FlippedOpcode,
- m_OneUse(m_Not(m_c_BinOp(Opcode, m_Specific(B),
- m_Specific(C)))),
- m_Specific(A))))) {
+ if (matchNotOrAnd(Op1, m_Specific(B), m_Specific(C), m_Specific(A), Dummy,
+ true)) {
Value *Xor = Builder.CreateXor(A, C);
return (Opcode == Instruction::Or)
? BinaryOperator::CreateAnd(Xor, Builder.CreateNot(B))
@@ -1863,6 +1872,9 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(I))
@@ -2072,21 +2084,37 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
if (Op0->hasOneUse() || isFreeToInvert(C, C->hasOneUse()))
return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C));
- // (A | B) & ((~A) ^ B) -> (A & B)
- // (A | B) & (B ^ (~A)) -> (A & B)
- // (B | A) & ((~A) ^ B) -> (A & B)
- // (B | A) & (B ^ (~A)) -> (A & B)
+ // (A | B) & (~A ^ B) -> A & B
+ // (A | B) & (B ^ ~A) -> A & B
+ // (B | A) & (~A ^ B) -> A & B
+ // (B | A) & (B ^ ~A) -> A & B
if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateAnd(A, B);
- // ((~A) ^ B) & (A | B) -> (A & B)
- // ((~A) ^ B) & (B | A) -> (A & B)
- // (B ^ (~A)) & (A | B) -> (A & B)
- // (B ^ (~A)) & (B | A) -> (A & B)
+ // (~A ^ B) & (A | B) -> A & B
+ // (~A ^ B) & (B | A) -> A & B
+ // (B ^ ~A) & (A | B) -> A & B
+ // (B ^ ~A) & (B | A) -> A & B
if (match(Op0, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
match(Op1, m_c_Or(m_Specific(A), m_Specific(B))))
return BinaryOperator::CreateAnd(A, B);
+
+ // (~A | B) & (A ^ B) -> ~A & B
+ // (~A | B) & (B ^ A) -> ~A & B
+ // (B | ~A) & (A ^ B) -> ~A & B
+ // (B | ~A) & (B ^ A) -> ~A & B
+ if (match(Op0, m_c_Or(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_c_Xor(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
+
+ // (A ^ B) & (~A | B) -> ~A & B
+ // (B ^ A) & (~A | B) -> ~A & B
+ // (A ^ B) & (B | ~A) -> ~A & B
+ // (B ^ A) & (B | ~A) -> ~A & B
+ if (match(Op1, m_c_Or(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op0, m_c_Xor(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
}
{
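The comments in the two hunks above encode several Boolean identities. As a standalone sanity check (illustrative only, not part of the patch), they can be verified exhaustively over single-bit values, where XOR with 1 plays the role of NOT:

    #include <cassert>

    int main() {
      for (unsigned A = 0; A <= 1; ++A)
        for (unsigned B = 0; B <= 1; ++B)
          for (unsigned C = 0; C <= 1; ++C) {
            unsigned NotA = A ^ 1;
            // (~(A | B) & C) | (~(A | C) & B) --> (B ^ C) & ~A
            assert(((((A | B) ^ 1) & C) | (((A | C) ^ 1) & B)) == ((B ^ C) & NotA));
            // (~(A & B) | C) & (~(A & C) | B) --> ~((B ^ C) & A)
            assert(((((A & B) ^ 1) | C) & (((A & C) ^ 1) | B)) == (((B ^ C) & A) ^ 1));
            // (A | B) & (~A ^ B) --> A & B
            assert(((A | B) & (NotA ^ B)) == (A & B));
            // (~A | B) & (A ^ B) --> ~A & B
            assert(((NotA | B) & (A ^ B)) == (NotA & B));
          }
      return 0;
    }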
@@ -2640,6 +2668,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
// See if we can simplify any instructions used by the instruction whose sole
// purpose is to compute bits we don't care about.
if (SimplifyDemandedInstructionBits(I))
@@ -3528,6 +3559,9 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
if (Instruction *NewXor = foldXorToXor(I, Builder))
return NewXor;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 14427bd1f2f4..1fb46af46bee 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -352,9 +352,27 @@ Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) {
// * Dereferenceable address & few lanes -> scalarize speculative load/selects
// * Adjacent vector addresses -> masked.load
// * Narrow width by halves excluding zero/undef lanes
-// * Vector splat address w/known mask -> scalar load
// * Vector incrementing address -> vector masked load
Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) {
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(2));
+ if (!ConstMask)
+ return nullptr;
+
+ // Vector splat address w/known mask -> scalar load
+ // Fold the gather into a single scalar load of the splatted address,
+ // since every lane would be reloading the same value.
+ if (ConstMask->isAllOnesValue())
+ if (auto *SplatPtr = getSplatValue(II.getArgOperand(0))) {
+ auto *VecTy = cast<VectorType>(II.getType());
+ const Align Alignment =
+ cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
+ LoadInst *L = Builder.CreateAlignedLoad(VecTy->getElementType(), SplatPtr,
+ Alignment, "load.scalar");
+ Value *Shuf =
+ Builder.CreateVectorSplat(VecTy->getElementCount(), L, "broadcast");
+ return replaceInstUsesWith(II, cast<Instruction>(Shuf));
+ }
+
return nullptr;
}
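A scalar analogy for the gather fold above (illustrative sketch only; the function name is made up and not part of the patch): with an all-true mask and a splatted pointer, every lane reads the same address, so one load plus a broadcast is equivalent.

    #include <array>

    // gather(splat(Ptr), all-true mask) behaves like a single scalar load
    // whose result is broadcast to every lane.
    std::array<int, 4> gatherFromSplatPtr(const int *Ptr) {
      int Scalar = *Ptr;                       // "load.scalar"
      return {Scalar, Scalar, Scalar, Scalar}; // "broadcast"
    }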
@@ -362,7 +380,6 @@ Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) {
// * Single constant active lane -> store
// * Adjacent vector addresses -> masked.store
// * Narrow store width by halves excluding zero/undef lanes
-// * Vector splat address w/known mask -> scalar store
// * Vector incrementing address -> vector masked store
Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
@@ -373,6 +390,34 @@ Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
if (ConstMask->isNullValue())
return eraseInstFromFunction(II);
+ // Vector splat address -> scalar store
+ if (auto *SplatPtr = getSplatValue(II.getArgOperand(1))) {
+ // scatter(splat(value), splat(ptr), non-zero-mask) -> store value, ptr
+ if (auto *SplatValue = getSplatValue(II.getArgOperand(0))) {
+ Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
+ StoreInst *S =
+ new StoreInst(SplatValue, SplatPtr, /*IsVolatile=*/false, Alignment);
+ S->copyMetadata(II);
+ return S;
+ }
+ // scatter(vector, splat(ptr), splat(true)) -> store extract(vector,
+ // lastlane), ptr
+ if (ConstMask->isAllOnesValue()) {
+ Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
+ VectorType *WideLoadTy = cast<VectorType>(II.getArgOperand(1)->getType());
+ ElementCount VF = WideLoadTy->getElementCount();
+ Constant *EC =
+ ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue());
+ Value *RunTimeVF = VF.isScalable() ? Builder.CreateVScale(EC) : EC;
+ Value *LastLane = Builder.CreateSub(RunTimeVF, Builder.getInt32(1));
+ Value *Extract =
+ Builder.CreateExtractElement(II.getArgOperand(0), LastLane);
+ StoreInst *S =
+ new StoreInst(Extract, SplatPtr, /*IsVolatile=*/false, Alignment);
+ S->copyMetadata(II);
+ return S;
+ }
+ }
if (isa<ScalableVectorType>(ConstMask->getType()))
return nullptr;
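The extract-the-last-lane rule above relies on scatter lane ordering: when every lane of an all-true scatter writes the same (splatted) address, the highest-indexed lane's value is the one left in memory. A scalar sketch of that reasoning (illustrative only, not part of the patch):

    #include <cassert>

    int main() {
      int Slot = 0;
      const int Vec[4] = {10, 20, 30, 40};
      // scatter(Vec, splat(&Slot), all-true mask): lanes store in order, so
      // only the last lane's value survives in memory.
      for (int Lane = 0; Lane < 4; ++Lane)
        Slot = Vec[Lane];
      assert(Slot == Vec[3]); // i.e. store extract(Vec, lastlane), &Slot
      return 0;
    }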
@@ -449,7 +494,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
// ctlz/cttz i1 Op0 --> not Op0
if (match(Op1, m_Zero()))
return BinaryOperator::CreateNot(Op0);
- // If zero is undef, then the input can be assumed to be "true", so the
+ // If zero is poison, then the input can be assumed to be "true", so the
// instruction simplifies to "false".
assert(match(Op1, m_One()) && "Expected ctlz/cttz operand to be 0 or 1");
return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(II.getType()));
@@ -474,7 +519,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
}
// Zext doesn't change the number of trailing zeros, so narrow:
- // cttz(zext(x)) -> zext(cttz(x)) if the 'ZeroIsUndef' parameter is 'true'.
+ // cttz(zext(x)) -> zext(cttz(x)) if the 'ZeroIsPoison' parameter is 'true'.
if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) && match(Op1, m_One())) {
auto *Cttz = IC.Builder.CreateBinaryIntrinsic(Intrinsic::cttz, X,
IC.Builder.getTrue());
@@ -511,7 +556,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
}
// If the input to cttz/ctlz is known to be non-zero,
- // then change the 'ZeroIsUndef' parameter to 'true'
+ // then change the 'ZeroIsPoison' parameter to 'true'
// because we know the zero behavior can't affect the result.
if (!Known.One.isZero() ||
isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
@@ -1188,6 +1233,21 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Value *IIOperand = II->getArgOperand(0);
Value *X = nullptr;
+ KnownBits Known = computeKnownBits(IIOperand, 0, II);
+ uint64_t LZ = alignDown(Known.countMinLeadingZeros(), 8);
+ uint64_t TZ = alignDown(Known.countMinTrailingZeros(), 8);
+
+ // bswap(x) -> shift(x) if x has exactly one "active byte"
+ if (Known.getBitWidth() - LZ - TZ == 8) {
+ assert(LZ != TZ && "active byte cannot be in the middle");
+ if (LZ > TZ) // -> shl(x) if the "active byte" is in the low part of x
+ return BinaryOperator::CreateNUWShl(
+ IIOperand, ConstantInt::get(IIOperand->getType(), LZ - TZ));
+ // -> lshr(x) if the "active byte" is in the high part of x
+ return BinaryOperator::CreateExactLShr(
+ IIOperand, ConstantInt::get(IIOperand->getType(), TZ - LZ));
+ }
+
// bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
unsigned C = X->getType()->getScalarSizeInBits() -
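A concrete instance of the single-active-byte bswap fold above (standalone sketch, not part of the patch, using the GCC/Clang __builtin_bswap32 builtin): if known bits prove that only bits 8..15 of an i32 can be set, then LZ = 16, TZ = 8, the active width is 8, and bswap(x) is exactly x << (LZ - TZ):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t Byte = 0; Byte <= 0xff; ++Byte) {
        uint32_t X = Byte << 8;                  // only bits 8..15 may be set
        assert(__builtin_bswap32(X) == X << 8);  // shl by LZ - TZ = 16 - 8
      }
      return 0;
    }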
@@ -2460,7 +2520,7 @@ static bool isSafeToEliminateVarargsCast(const CallBase &Call,
if (!Call.isByValArgument(ix))
return false;
- Type *SrcElemTy = SrcTy->getElementType();
+ Type *SrcElemTy = SrcTy->getNonOpaquePointerElementType();
Type *DstElemTy = Call.getParamByValType(ix);
if (!SrcElemTy->isSized() || !DstElemTy->isSized())
return false;
@@ -2571,57 +2631,36 @@ static IntrinsicInst *findInitTrampoline(Value *Callee) {
}
void InstCombinerImpl::annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
- unsigned NumArgs = Call.arg_size();
- ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0));
- ConstantInt *Op1C =
- (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1));
- // Bail out if the allocation size is zero (or an invalid alignment of zero
- // with aligned_alloc).
- if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue()))
- return;
-
- if (isMallocLikeFn(&Call, TLI) && Op0C) {
- if (isOpNewLikeFn(&Call, TLI))
+ // Note: We only handle cases which can't be driven from generic attributes
+ // here. So, for example, nonnull and noalias (which are common properties
+ // of some allocation functions) are expected to be handled via annotation
+ // of the respective allocator declaration with generic attributes.
+
+ uint64_t Size;
+ ObjectSizeOpts Opts;
+ if (getObjectSize(&Call, Size, DL, TLI, Opts) && Size > 0) {
+ // TODO: We really should just emit deref_or_null here and then
+ // let the generic inference code combine that with nonnull.
+ if (Call.hasRetAttr(Attribute::NonNull))
Call.addRetAttr(Attribute::getWithDereferenceableBytes(
- Call.getContext(), Op0C->getZExtValue()));
+ Call.getContext(), Size));
else
Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op0C->getZExtValue()));
- } else if (isAlignedAllocLikeFn(&Call, TLI)) {
- if (Op1C)
- Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op1C->getZExtValue()));
- // Add alignment attribute if alignment is a power of two constant.
- if (Op0C && Op0C->getValue().ult(llvm::Value::MaximumAlignment) &&
- isKnownNonZero(Call.getOperand(1), DL, 0, &AC, &Call, &DT)) {
- uint64_t AlignmentVal = Op0C->getZExtValue();
- if (llvm::isPowerOf2_64(AlignmentVal)) {
- Call.removeRetAttr(Attribute::Alignment);
- Call.addRetAttr(Attribute::getWithAlignment(Call.getContext(),
- Align(AlignmentVal)));
- }
- }
- } else if (isReallocLikeFn(&Call, TLI) && Op1C) {
- Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op1C->getZExtValue()));
- } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) {
- bool Overflow;
- const APInt &N = Op0C->getValue();
- APInt Size = N.umul_ov(Op1C->getValue(), Overflow);
- if (!Overflow)
- Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Size.getZExtValue()));
- } else if (isStrdupLikeFn(&Call, TLI)) {
- uint64_t Len = GetStringLength(Call.getOperand(0));
- if (Len) {
- // strdup
- if (NumArgs == 1)
- Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Len));
- // strndup
- else if (NumArgs == 2 && Op1C)
- Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1)));
+ Call.getContext(), Size));
+ }
+
+ // Add alignment attribute if alignment is a power of two constant.
+ Value *Alignment = getAllocAlignment(&Call, TLI);
+ if (!Alignment)
+ return;
+
+ ConstantInt *AlignOpC = dyn_cast<ConstantInt>(Alignment);
+ if (AlignOpC && AlignOpC->getValue().ult(llvm::Value::MaximumAlignment)) {
+ uint64_t AlignmentVal = AlignOpC->getZExtValue();
+ if (llvm::isPowerOf2_64(AlignmentVal)) {
+ Call.removeRetAttr(Attribute::Alignment);
+ Call.addRetAttr(Attribute::getWithAlignment(Call.getContext(),
+ Align(AlignmentVal)));
}
}
}
@@ -2744,9 +2783,9 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
PointerType *NewTy = cast<PointerType>(CI->getOperand(0)->getType());
if (!NewTy->isOpaque() && Call.isByValArgument(ix)) {
Call.removeParamAttr(ix, Attribute::ByVal);
- Call.addParamAttr(
- ix, Attribute::getWithByValType(
- Call.getContext(), NewTy->getElementType()));
+ Call.addParamAttr(ix, Attribute::getWithByValType(
+ Call.getContext(),
+ NewTy->getNonOpaquePointerElementType()));
}
Changed = true;
}
@@ -2782,7 +2821,8 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy));
}
- if (isAllocLikeFn(&Call, &TLI))
+ if (isAllocationFn(&Call, &TLI) &&
+ isAllocRemovable(&cast<CallBase>(Call), &TLI))
return visitAllocSite(Call);
// Handle intrinsics which can be used in both call and invoke context.
@@ -2934,7 +2974,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
}
if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
- AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+ AttrBuilder RAttrs(FT->getContext(), CallerPAL.getRetAttrs());
if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
return false; // Attribute not compatible with transformed value.
}
@@ -2980,7 +3020,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
return false; // Cannot transform this parameter value.
- if (AttrBuilder(CallerPAL.getParamAttrs(i))
+ if (AttrBuilder(FT->getContext(), CallerPAL.getParamAttrs(i))
.overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
return false; // Attribute not compatible with transformed value.
@@ -2994,12 +3034,12 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
// sized type and the sized type has to have the same size as the old type.
if (ParamTy != ActTy && CallerPAL.hasParamAttr(i, Attribute::ByVal)) {
PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
- if (!ParamPTy || !ParamPTy->getElementType()->isSized())
+ if (!ParamPTy || !ParamPTy->getPointerElementType()->isSized())
return false;
Type *CurElTy = Call.getParamByValType(i);
if (DL.getTypeAllocSize(CurElTy) !=
- DL.getTypeAllocSize(ParamPTy->getElementType()))
+ DL.getTypeAllocSize(ParamPTy->getPointerElementType()))
return false;
}
}
@@ -3012,17 +3052,14 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
// If the callee is just a declaration, don't change the varargsness of the
// call. We don't want to introduce a varargs call where one doesn't
// already exist.
- PointerType *APTy = cast<PointerType>(Call.getCalledOperand()->getType());
- if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
+ if (FT->isVarArg() != Call.getFunctionType()->isVarArg())
return false;
// If both the callee and the cast type are varargs, we still have to make
// sure the number of fixed parameters are the same or we have the same
// ABI issues as if we introduce a varargs call.
- if (FT->isVarArg() &&
- cast<FunctionType>(APTy->getElementType())->isVarArg() &&
- FT->getNumParams() !=
- cast<FunctionType>(APTy->getElementType())->getNumParams())
+ if (FT->isVarArg() && Call.getFunctionType()->isVarArg() &&
+ FT->getNumParams() != Call.getFunctionType()->getNumParams())
return false;
}
@@ -3045,7 +3082,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
ArgAttrs.reserve(NumActualArgs);
// Get any return attributes.
- AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+ AttrBuilder RAttrs(FT->getContext(), CallerPAL.getRetAttrs());
// If the return value is not being used, the type may not be compatible
// with the existing attributes. Wipe out any problematic attributes.
@@ -3063,7 +3100,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
// Add any parameter attributes.
if (CallerPAL.hasParamAttr(i, Attribute::ByVal)) {
- AttrBuilder AB(CallerPAL.getParamAttrs(i));
+ AttrBuilder AB(FT->getContext(), CallerPAL.getParamAttrs(i));
AB.addByValAttr(NewArg->getType()->getPointerElementType());
ArgAttrs.push_back(AttributeSet::get(Ctx, AB));
} else
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 8df4a4529f47..f11ba8772f3c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -85,13 +85,16 @@ static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
AllocaInst &AI) {
PointerType *PTy = cast<PointerType>(CI.getType());
+ // Opaque pointers don't have an element type we could replace with.
+ if (PTy->isOpaque())
+ return nullptr;
IRBuilderBase::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(&AI);
// Get the type really allocated and the type casted to.
Type *AllocElTy = AI.getAllocatedType();
- Type *CastElTy = PTy->getElementType();
+ Type *CastElTy = PTy->getNonOpaquePointerElementType();
if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
// This optimisation does not work for cases where the cast type
@@ -2649,8 +2652,8 @@ static Instruction *convertBitCastToGEP(BitCastInst &CI, IRBuilderBase &Builder,
if (SrcPTy->isOpaque() || DstPTy->isOpaque())
return nullptr;
- Type *DstElTy = DstPTy->getElementType();
- Type *SrcElTy = SrcPTy->getElementType();
+ Type *DstElTy = DstPTy->getNonOpaquePointerElementType();
+ Type *SrcElTy = SrcPTy->getNonOpaquePointerElementType();
// When the type pointed to is not sized the cast cannot be
// turned into a gep.
@@ -2669,8 +2672,8 @@ static Instruction *convertBitCastToGEP(BitCastInst &CI, IRBuilderBase &Builder,
// If we found a path from the src to dest, create the getelementptr now.
if (SrcElTy == DstElTy) {
SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0));
- GetElementPtrInst *GEP =
- GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ SrcPTy->getNonOpaquePointerElementType(), Src, Idxs);
// If the source pointer is dereferenceable, then assume it points to an
// allocated object and apply "inbounds" to the GEP.
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index ed53b88aed61..fd58a44504b3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -503,7 +503,7 @@ static Value *evaluateGEPOffsetExpression(User *GEP, InstCombinerImpl &IC,
/// Returns true if we can rewrite Start as a GEP with pointer Base
/// and some integer offset. The nodes that need to be re-written
/// for this transformation will be added to Explored.
-static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
+static bool canRewriteGEPAsOffset(Type *ElemTy, Value *Start, Value *Base,
const DataLayout &DL,
SetVector<Value *> &Explored) {
SmallVector<Value *, 16> WorkList(1, Start);
@@ -551,7 +551,7 @@ static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
// the original pointer type. We could handle more cases in the
// future.
if (GEP->getNumIndices() != 1 || !GEP->isInBounds() ||
- GEP->getType() != Start->getType())
+ GEP->getSourceElementType() != ElemTy)
return false;
if (!Explored.contains(GEP->getOperand(0)))
@@ -627,7 +627,7 @@ static void setInsertionPoint(IRBuilder<> &Builder, Value *V,
/// Returns a re-written value of Start as an indexed GEP using Base as a
/// pointer.
-static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
+static Value *rewriteGEPAsOffset(Type *ElemTy, Value *Start, Value *Base,
const DataLayout &DL,
SetVector<Value *> &Explored) {
// Perform all the substitutions. This is a bit tricky because we can
@@ -714,6 +714,8 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
}
}
+ PointerType *PtrTy =
+ ElemTy->getPointerTo(Start->getType()->getPointerAddressSpace());
for (Value *Val : Explored) {
if (Val == Base)
continue;
@@ -722,22 +724,14 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
// a GEP or a GEP + ptrtoint.
setInsertionPoint(Builder, Val, false);
- // If required, create an inttoptr instruction for Base.
- Value *NewBase = Base;
- if (!Base->getType()->isPointerTy())
- NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(),
- Start->getName() + "to.ptr");
-
- Value *GEP = Builder.CreateInBoundsGEP(
- Start->getType()->getPointerElementType(), NewBase,
- makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr");
-
- if (!Val->getType()->isPointerTy()) {
- Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(),
- Val->getName() + ".conv");
- GEP = Cast;
- }
- Val->replaceAllUsesWith(GEP);
+ // Cast base to the expected type.
+ Value *NewVal = Builder.CreateBitOrPointerCast(
+ Base, PtrTy, Start->getName() + "to.ptr");
+ NewVal = Builder.CreateInBoundsGEP(
+ ElemTy, NewVal, makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr");
+ NewVal = Builder.CreateBitOrPointerCast(
+ NewVal, Val->getType(), Val->getName() + ".conv");
+ Val->replaceAllUsesWith(NewVal);
}
return NewInsts[Start];
@@ -747,7 +741,7 @@ static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
/// the input Value as a constant indexed GEP. Returns a pair containing
/// the GEPs Pointer and Index.
static std::pair<Value *, Value *>
-getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
+getAsConstantIndexedAddress(Type *ElemTy, Value *V, const DataLayout &DL) {
Type *IndexType = IntegerType::get(V->getContext(),
DL.getIndexTypeSizeInBits(V->getType()));
@@ -759,7 +753,7 @@ getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
if (!GEP->isInBounds())
break;
if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 &&
- GEP->getType() == V->getType()) {
+ GEP->getSourceElementType() == ElemTy) {
V = GEP->getOperand(0);
Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1));
Index = ConstantExpr::getAdd(
@@ -798,17 +792,14 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
if (!GEPLHS->hasAllConstantIndices())
return nullptr;
- // Make sure the pointers have the same type.
- if (GEPLHS->getType() != RHS->getType())
- return nullptr;
-
+ Type *ElemTy = GEPLHS->getSourceElementType();
Value *PtrBase, *Index;
- std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL);
+ std::tie(PtrBase, Index) = getAsConstantIndexedAddress(ElemTy, GEPLHS, DL);
// The set of nodes that will take part in this transformation.
SetVector<Value *> Nodes;
- if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes))
+ if (!canRewriteGEPAsOffset(ElemTy, RHS, PtrBase, DL, Nodes))
return nullptr;
// We know we can re-write this as
@@ -817,7 +808,7 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
// can't have overflow on either side. We can therefore re-write
// this as:
// OFFSET1 cmp OFFSET2
- Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes);
+ Value *NewRHS = rewriteGEPAsOffset(ElemTy, RHS, PtrBase, DL, Nodes);
// RewriteGEPAsOffset has replaced RHS and all of its uses with a re-written
// GEP having PtrBase as the pointer base, and has returned in NewRHS the
@@ -894,9 +885,10 @@ Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
// If the base pointers are different, but the indices are the same, just
// compare the base pointer.
if (PtrBase != GEPRHS->getOperand(0)) {
- bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
- IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
- GEPRHS->getOperand(0)->getType();
+ bool IndicesTheSame =
+ GEPLHS->getNumOperands() == GEPRHS->getNumOperands() &&
+ GEPLHS->getType() == GEPRHS->getType() &&
+ GEPLHS->getSourceElementType() == GEPRHS->getSourceElementType();
if (IndicesTheSame)
for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
@@ -1271,8 +1263,8 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
// This is only really a signed overflow check if the inputs have been
// sign-extended; check for that condition. For example, if CI2 is 2^31 and
// the operands of the add are 64 bits wide, we need at least 33 sign bits.
- if (IC.ComputeMinSignedBits(A, 0, &I) > NewWidth ||
- IC.ComputeMinSignedBits(B, 0, &I) > NewWidth)
+ if (IC.ComputeMaxSignificantBits(A, 0, &I) > NewWidth ||
+ IC.ComputeMaxSignificantBits(B, 0, &I) > NewWidth)
return nullptr;
// In order to replace the original add with a narrower
@@ -2221,7 +2213,7 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
// icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
Value *X = Shr->getOperand(0);
CmpInst::Predicate Pred = Cmp.getPredicate();
- if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() && C.isZero())
+ if (Cmp.isEquality() && Shr->isExact() && C.isZero())
return new ICmpInst(Pred, X, Cmp.getOperand(1));
const APInt *ShiftVal;
@@ -2247,9 +2239,10 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
// those conditions rather than checking them. This is difficult because of
// undef/poison (PR34838).
if (IsAShr) {
- if (Pred == CmpInst::ICMP_SLT || (Pred == CmpInst::ICMP_SGT && IsExact)) {
- // icmp slt (ashr X, ShAmtC), C --> icmp slt X, (C << ShAmtC)
- // icmp sgt (ashr exact X, ShAmtC), C --> icmp sgt X, (C << ShAmtC)
+ if (IsExact || Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_ULT) {
+ // When C can be shifted left by ShAmtC without losing bits:
+ // icmp PRED (ashr exact X, ShAmtC), C --> icmp PRED X, (C << ShAmtC)
+ // icmp slt/ult (ashr X, ShAmtC), C --> icmp slt/ult X, (C << ShAmtC)
APInt ShiftedC = C.shl(ShAmtVal);
if (ShiftedC.ashr(ShAmtVal) == C)
return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
@@ -2261,6 +2254,12 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
(ShiftedC + 1).ashr(ShAmtVal) == (C + 1))
return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
}
+ if (Pred == CmpInst::ICMP_UGT) {
+ // icmp ugt (ashr X, ShAmtC), C --> icmp ugt X, ((C + 1) << ShAmtC) - 1
+ APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
+ if ((ShiftedC + 1).ashr(ShAmtVal) == (C + 1))
+ return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
+ }
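The unsigned-greater-than case added above can be spot-checked exhaustively at i8 width. The following is a minimal standalone sketch, not part of the patch (the i8 width, helper name, and loop bounds are illustrative); it applies the same "(ShiftedC + 1) round-trips through ashr" guard that the new code uses:

#include <cassert>
#include <cstdint>

// Arithmetic shift right on an 8-bit value (mirrors ashr i8).
static uint8_t ashr8(uint8_t V, unsigned Sh) { return (uint8_t)((int8_t)V >> Sh); }

int main() {
  for (unsigned Sh = 1; Sh < 8; ++Sh) {
    for (unsigned C = 0; C < 256; ++C) {
      uint8_t ShiftedC = (uint8_t)(((C + 1) << Sh) - 1); // ((C + 1) << ShAmtC) - 1
      if (ashr8((uint8_t)(ShiftedC + 1), Sh) != (uint8_t)(C + 1))
        continue; // guard from the patch: the shift must be lossless
      for (unsigned X = 0; X < 256; ++X) {
        bool Before = ashr8((uint8_t)X, Sh) > (uint8_t)C; // icmp ugt (ashr X, Sh), C
        bool After = (uint8_t)X > ShiftedC;               // icmp ugt X, ShiftedC
        assert(Before == After);
      }
    }
  }
  return 0;
}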
// If the compare constant has significant bits above the lowest sign-bit,
// then convert an unsigned cmp to a test of the sign-bit:
@@ -3957,6 +3956,33 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
(Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE))
return new ICmpInst(Pred, X, Builder.CreateNot(Op0));
+ {
+ // Similar to above: an unsigned overflow comparison may use offset + mask:
+ // ((Op1 + C) & C) u< Op1 --> Op1 != 0
+ // ((Op1 + C) & C) u>= Op1 --> Op1 == 0
+ // Op0 u> ((Op0 + C) & C) --> Op0 != 0
+ // Op0 u<= ((Op0 + C) & C) --> Op0 == 0
+ BinaryOperator *BO;
+ const APInt *C;
+ if ((Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE) &&
+ match(Op0, m_And(m_BinOp(BO), m_LowBitMask(C))) &&
+ match(BO, m_Add(m_Specific(Op1), m_SpecificIntAllowUndef(*C)))) {
+ CmpInst::Predicate NewPred =
+ Pred == ICmpInst::ICMP_ULT ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
+ Constant *Zero = ConstantInt::getNullValue(Op1->getType());
+ return new ICmpInst(NewPred, Op1, Zero);
+ }
+
+ if ((Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE) &&
+ match(Op1, m_And(m_BinOp(BO), m_LowBitMask(C))) &&
+ match(BO, m_Add(m_Specific(Op0), m_SpecificIntAllowUndef(*C)))) {
+ CmpInst::Predicate NewPred =
+ Pred == ICmpInst::ICMP_UGT ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
+ Constant *Zero = ConstantInt::getNullValue(Op1->getType());
+ return new ICmpInst(NewPred, Op0, Zero);
+ }
+ }
+
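The low-bit-mask folds added here are easy to verify by brute force at i8 width; the u>=/u<= forms are simply the negations. A standalone sketch under those assumptions (names are illustrative, not from the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t C = 1; C < 256; C = (C << 1) | 1) { // low-bit masks 1, 3, ..., 255
    for (uint32_t Op1 = 0; Op1 < 256; ++Op1) {
      uint8_t Masked = (uint8_t)((Op1 + C) & C); // (Op1 + C) & C, wrapping at i8
      bool Before = Masked < (uint8_t)Op1;       // icmp ult Masked, Op1
      bool After = (uint8_t)Op1 != 0;            // icmp ne Op1, 0
      assert(Before == After);
    }
  }
  return 0;
}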
bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
if (BO0 && isa<OverflowingBinaryOperator>(BO0))
NoOp0WrapProblem =
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 39b55b028110..7743b4c41555 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -148,6 +148,8 @@ public:
Instruction *SliceUpIllegalIntegerPHI(PHINode &PN);
Instruction *visitPHINode(PHINode &PN);
Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
+ Instruction *visitGEPOfGEP(GetElementPtrInst &GEP, GEPOperator *Src);
+ Instruction *visitGEPOfBitcast(BitCastInst *BCI, GetElementPtrInst &GEP);
Instruction *visitAllocaInst(AllocaInst &AI);
Instruction *visitAllocSite(Instruction &FI);
Instruction *visitFree(CallInst &FI);
@@ -195,8 +197,6 @@ private:
bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
bool shouldChangeType(Type *From, Type *To) const;
Value *dyn_castNegVal(Value *V) const;
- Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
- SmallVectorImpl<Value *> &NewIndices);
/// Classify whether a cast is worth optimizing.
///
@@ -607,6 +607,16 @@ public:
/// only possible if all operands to the PHI are constants).
Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN);
+ /// For a binary operator with 2 phi operands, try to hoist the binary
+ /// operation before the phi. This can result in fewer instructions in
+ /// patterns where at least one set of phi operands simplifies.
+ /// Example:
+ /// BB3: binop (phi [X, BB1], [C1, BB2]), (phi [Y, BB1], [C2, BB2])
+ /// -->
+ /// BB1: BO = binop X, Y
+ /// BB3: phi [BO, BB1], [(binop C1, C2), BB2]
+ Instruction *foldBinopWithPhiOperands(BinaryOperator &BO);
+
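As a rough source-level analogy for the fold declared above (this is plain C++, not the IR; the constants and function names are made up for illustration), hoisting the binop into the non-constant predecessor lets the constant arm fold to a single constant:

#include <cassert>

// Shape before the fold: a binop whose operands are two 2-input phis, with one
// predecessor feeding immediate constants into both.
static int before(bool TookBB1, int X, int Y) {
  int P0 = TookBB1 ? X : 5; // phi [X, BB1], [C1, BB2]
  int P1 = TookBB1 ? Y : 7; // phi [Y, BB1], [C2, BB2]
  return P0 + P1;           // binop of the two phis
}

// Shape after the fold: the binop is evaluated in the non-constant predecessor
// and the constant arm folds to a single constant.
static int after(bool TookBB1, int X, int Y) {
  return TookBB1 ? X + Y : 12; // phi [X + Y, BB1], [binop(C1, C2), BB2]
}

int main() {
  for (int X : {-3, 0, 41})
    for (int Y : {-1, 2, 9})
      for (bool TookBB1 : {false, true})
        assert(before(TookBB1, X, Y) == after(TookBB1, X, Y));
  return 0;
}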
/// Given an instruction with a select as one operand and a constant as the
/// other operand, try to fold the binary operator into the select arguments.
/// This also works for Cast instructions, which obviously do not have a
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 0dbfdba353c4..756792918dba 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -301,16 +301,17 @@ void PointerReplacer::replace(Instruction *I) {
assert(V && "Operand not replaced");
SmallVector<Value *, 8> Indices;
Indices.append(GEP->idx_begin(), GEP->idx_end());
- auto *NewI = GetElementPtrInst::Create(
- V->getType()->getPointerElementType(), V, Indices);
+ auto *NewI =
+ GetElementPtrInst::Create(GEP->getSourceElementType(), V, Indices);
IC.InsertNewInstWith(NewI, *GEP);
NewI->takeName(GEP);
WorkMap[GEP] = NewI;
} else if (auto *BC = dyn_cast<BitCastInst>(I)) {
auto *V = getReplacement(BC->getOperand(0));
assert(V && "Operand not replaced");
- auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
- V->getType()->getPointerAddressSpace());
+ auto *NewT = PointerType::getWithSamePointeeType(
+ cast<PointerType>(BC->getType()),
+ V->getType()->getPointerAddressSpace());
auto *NewI = new BitCastInst(V, NewT);
IC.InsertNewInstWith(NewI, *BC);
NewI->takeName(BC);
@@ -345,8 +346,7 @@ void PointerReplacer::replacePointer(Instruction &I, Value *V) {
#ifndef NDEBUG
auto *PT = cast<PointerType>(I.getType());
auto *NT = cast<PointerType>(V->getType());
- assert(PT != NT && PT->getElementType() == NT->getElementType() &&
- "Invalid usage");
+ assert(PT != NT && PT->hasSameElementTypeAs(NT) && "Invalid usage");
#endif
WorkMap[&I] = V;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index aca7ec8d7325..1aa10b550fc4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -155,6 +155,9 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
if (Value *V = SimplifyUsingDistributiveLaws(I))
return replaceInstUsesWith(I, V);
@@ -348,13 +351,21 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
return CastInst::Create(Instruction::SExt, And, I.getType());
}
- // (bool X) * Y --> X ? Y : 0
- // Y * (bool X) --> X ? Y : 0
+ // (zext bool X) * Y --> X ? Y : 0
+ // Y * (zext bool X) --> X ? Y : 0
if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0));
if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0));
+ // (sext bool X) * C --> X ? -C : 0
+ Constant *ImmC;
+ if (match(Op0, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1) &&
+ match(Op1, m_ImmConstant(ImmC))) {
+ Constant *NegC = ConstantExpr::getNeg(ImmC);
+ return SelectInst::Create(X, NegC, ConstantInt::getNullValue(I.getType()));
+ }
+
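The new sext-bool case can be checked directly: sext of a bool is 0 or -1, so the product is either 0 or the wrapping negation of the constant. A small standalone check, assuming i32 and modelling the wrapping multiply with unsigned arithmetic (not code from the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t C : {0, 1, -1, 7, 42, -1000, INT32_MIN, INT32_MAX}) {
    for (bool X : {false, true}) {
      int32_t Sext = X ? -1 : 0;                             // sext i1 %X to i32
      int32_t Mul = (int32_t)((uint32_t)Sext * (uint32_t)C); // wrapping i32 mul
      int32_t Sel = X ? (int32_t)(0u - (uint32_t)C) : 0;     // select %X, -C, 0
      assert(Mul == Sel);
    }
  }
  return 0;
}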
// (lshr X, 31) * Y --> (ashr X, 31) & Y
// Y * (lshr X, 31) --> (ashr X, 31) & Y
// TODO: We are not checking one-use because the elimination of the multiply
@@ -442,6 +453,9 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
return FoldedMul;
@@ -742,6 +756,9 @@ static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
/// division instructions.
/// Common integer divide transforms
Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) {
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
bool IsSigned = I.getOpcode() == Instruction::SDiv;
Type *Ty = I.getType();
@@ -1359,6 +1376,9 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
if (Instruction *R = foldFDivConstantDivisor(I))
return R;
@@ -1460,6 +1480,9 @@ Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
/// remainder instructions.
/// Common integer remainder transforms
Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) {
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
// The RHS is known non-zero.
@@ -1638,5 +1661,8 @@ Instruction *InstCombinerImpl::visitFRem(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
return nullptr;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index a6d6b5199105..65e60498ff95 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -943,7 +943,7 @@ static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal,
}
/// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single
-/// call to cttz/ctlz with flag 'is_zero_undef' cleared.
+/// call to cttz/ctlz with flag 'is_zero_poison' cleared.
///
/// For example, we can fold the following code sequence:
/// \code
@@ -987,7 +987,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
// sizeof in bits of 'Count'.
unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits();
if (match(ValueOnZero, m_SpecificInt(SizeOfInBits))) {
- // Explicitly clear the 'undef_on_zero' flag. It's always valid to go from
+ // Explicitly clear the 'is_zero_poison' flag. It's always valid to go from
// true to false on this flag, so we can replace it for all users.
II->setArgOperand(1, ConstantInt::getFalse(II->getContext()));
return SelectArg;
@@ -995,7 +995,7 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
// The ValueOnZero is not the bitwidth. But if the cttz/ctlz (and optional
// zext/trunc) have one use (ending at the select), the cttz/ctlz result will
- // not be used if the input is zero. Relax to 'undef_on_zero' for that case.
+ // not be used if the input is zero. Relax to 'zero is poison' for that case.
if (II->hasOneUse() && SelectArg->hasOneUse() &&
!match(II->getArgOperand(1), m_One()))
II->setArgOperand(1, ConstantInt::getTrue(II->getContext()));
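For intuition, the select pattern being removed computes "trailing-zero count, or the bit width at zero", which is exactly the flag-cleared intrinsic's semantics. A small spot-check, assuming a C++20 compiler, a 32-bit unsigned, and the GCC/Clang __builtin_ctz builtin (which, like the poison-at-zero intrinsic, is undefined at 0):

#include <bit>
#include <cassert>

// select(X == 0, 32, cttz(X, /*is_zero_poison=*/true)), written in C++.
static unsigned selectPattern(unsigned X) {
  return X == 0 ? 32u : (unsigned)__builtin_ctz(X);
}

int main() {
  // std::countr_zero is the flag-cleared form: it is defined to return the bit
  // width (32 here) for a zero input, so the select is redundant.
  for (unsigned X : {0u, 1u, 2u, 40u, 96u, 0x80000000u, ~0u})
    assert(selectPattern(X) == (unsigned)std::countr_zero(X));
  return 0;
}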
@@ -2325,8 +2325,9 @@ Instruction *InstCombinerImpl::matchSAddSubSat(Instruction &MinMax1) {
// The two operands of the add/sub must be nsw-truncatable to the NewTy. This
// is usually achieved via a sext from a smaller type.
- if (ComputeMinSignedBits(AddSub->getOperand(0), 0, AddSub) > NewBitWidth ||
- ComputeMinSignedBits(AddSub->getOperand(1), 0, AddSub) > NewBitWidth)
+ if (ComputeMaxSignificantBits(AddSub->getOperand(0), 0, AddSub) >
+ NewBitWidth ||
+ ComputeMaxSignificantBits(AddSub->getOperand(1), 0, AddSub) > NewBitWidth)
return nullptr;
// Finally create and return the sat intrinsic, truncated to the new type
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 06421d553915..17f0c5c4cff0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -369,6 +369,9 @@ static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I,
}
Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) {
+ if (Instruction *Phi = foldBinopWithPhiOperands(I))
+ return Phi;
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
assert(Op0->getType() == Op1->getType());
@@ -1032,12 +1035,13 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
NewLShr->setIsExact(I.isExact());
return NewLShr;
}
- // (X << C1) >>u C --> (X >>u (C - C1)) & (-1 >> C)
- Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact());
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
- return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
- }
- if (C1->ugt(ShAmtC)) {
+ if (Op0->hasOneUse()) {
+ // (X << C1) >>u C --> (X >>u (C - C1)) & (-1 >> C)
+ Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact());
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
+ return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
+ }
+ } else if (C1->ugt(ShAmtC)) {
unsigned ShlAmtC = C1->getZExtValue();
Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmtC - ShAmtC);
if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
@@ -1046,15 +1050,33 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
NewShl->setHasNoUnsignedWrap(true);
return NewShl;
}
- // (X << C1) >>u C --> X << (C1 - C) & (-1 >> C)
- Value *NewShl = Builder.CreateShl(X, ShiftDiff);
+ if (Op0->hasOneUse()) {
+ // (X << C1) >>u C --> X << (C1 - C) & (-1 >> C)
+ Value *NewShl = Builder.CreateShl(X, ShiftDiff);
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
+ return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
+ }
+ } else {
+ assert(*C1 == ShAmtC);
+ // (X << C) >>u C --> X & (-1 >>u C)
APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
- return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
}
- assert(*C1 == ShAmtC);
- // (X << C) >>u C --> X & (-1 >>u C)
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmtC));
- return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
+ }
+
+ // ((X << C) + Y) >>u C --> (X + (Y >>u C)) & (-1 >>u C)
+ // TODO: Consolidate with the more general transform that starts from shl
+ // (the shifts are in the opposite order).
+ Value *Y;
+ if (match(Op0,
+ m_OneUse(m_c_Add(m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))),
+ m_Value(Y))))) {
+ Value *NewLshr = Builder.CreateLShr(Y, Op1);
+ Value *NewAdd = Builder.CreateAdd(NewLshr, X);
+ unsigned Op1Val = C->getLimitedValue(BitWidth);
+ APInt Bits = APInt::getLowBitsSet(BitWidth, BitWidth - Op1Val);
+ Constant *Mask = ConstantInt::get(Ty, Bits);
+ return BinaryOperator::CreateAnd(NewAdd, Mask);
}
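The new add-of-shl fold can be verified exhaustively at i8 width; wrapping in the narrow type is what makes the final low-bit mask necessary. A standalone sketch under that assumption (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned C = 1; C < 8; ++C) {
    uint8_t Mask = (uint8_t)(0xFFu >> C);                          // -1 >>u C
    for (unsigned X = 0; X < 256; ++X) {
      for (unsigned Y = 0; Y < 256; ++Y) {
        uint8_t Before = (uint8_t)((uint8_t)((X << C) + Y) >> C);  // ((X << C) + Y) >>u C
        uint8_t After = (uint8_t)((X + (uint8_t)(Y >> C)) & Mask); // (X + (Y >>u C)) & Mask
        assert(Before == After);
      }
    }
  }
  return 0;
}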
if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) &&
@@ -1094,7 +1116,6 @@ Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
}
}
- Value *Y;
if (ShAmtC == BitWidth - 1) {
// lshr i32 or(X,-X), 31 --> zext (X != 0)
if (match(Op0, m_OneUse(m_c_Or(m_Neg(m_Value(X)), m_Deferred(X)))))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 4dc712f32536..71a5ae24eead 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -800,22 +800,21 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// Round NTZ down to the next byte. If we have 11 trailing zeros, then
// we need all the bits down to bit 8. Likewise, round NLZ. If we
// have 14 leading zeros, round to 8.
- NLZ &= ~7;
- NTZ &= ~7;
+ NLZ = alignDown(NLZ, 8);
+ NTZ = alignDown(NTZ, 8);
// If we need exactly one byte, we can do this transformation.
- if (BitWidth-NLZ-NTZ == 8) {
- unsigned ResultBit = NTZ;
- unsigned InputBit = BitWidth-NTZ-8;
-
+ if (BitWidth - NLZ - NTZ == 8) {
// Replace this with either a left or right shift to get the byte into
// the right place.
Instruction *NewVal;
- if (InputBit > ResultBit)
- NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0),
- ConstantInt::get(I->getType(), InputBit-ResultBit));
+ if (NLZ > NTZ)
+ NewVal = BinaryOperator::CreateLShr(
+ II->getArgOperand(0),
+ ConstantInt::get(I->getType(), NLZ - NTZ));
else
- NewVal = BinaryOperator::CreateShl(II->getArgOperand(0),
- ConstantInt::get(I->getType(), ResultBit-InputBit));
+ NewVal = BinaryOperator::CreateShl(
+ II->getArgOperand(0),
+ ConstantInt::get(I->getType(), NTZ - NLZ));
NewVal->takeName(I);
return InsertNewInstWith(NewVal, *I);
}
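The rewrite works because NLZ + NTZ + 8 == BitWidth here, so the old InputBit - ResultBit difference is simply NLZ - NTZ. The underlying bswap identity (byte K of bswap(X) is byte 3 - K of X on i32) is easy to spot-check; this standalone sketch assumes i32 and the GCC/Clang __builtin_bswap32 builtin:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0x00000000u, 0x12345678u, 0xDEADBEEFu, 0x80000001u, 0xFFFFFFFFu}) {
    for (unsigned K = 0; K < 4; ++K) {
      uint32_t FromSwap = (__builtin_bswap32(X) >> (8 * K)) & 0xFFu;
      uint32_t Direct = (X >> (8 * (3 - K))) & 0xFFu;
      assert(FromSwap == Direct);
    }
  }
  return 0;
}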
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index c6a4602e59e3..736cf9c825d5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -495,8 +495,7 @@ Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
}
GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
- cast<PointerType>(NewPtr->getType())->getElementType(), NewPtr,
- NewOps);
+ GEP->getSourceElementType(), NewPtr, NewOps);
NewGEP->setIsInBounds(GEP->isInBounds());
return NewGEP;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index eb5eadba194d..029be5257694 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1027,13 +1027,11 @@ static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
if (!ConstIsRHS)
std::swap(Op0, Op1);
- auto *BO = cast<BinaryOperator>(&I);
- Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1,
- SO->getName() + ".op");
- auto *FPInst = dyn_cast<Instruction>(RI);
- if (FPInst && isa<FPMathOperator>(FPInst))
- FPInst->copyFastMathFlags(BO);
- return RI;
+ Value *NewBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), Op0,
+ Op1, SO->getName() + ".op");
+ if (auto *NewBOI = dyn_cast<Instruction>(NewBO))
+ NewBOI->copyIRFlags(&I);
+ return NewBO;
}
Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op,
@@ -1289,6 +1287,70 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
return replaceInstUsesWith(I, NewPN);
}
+Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) {
+ // TODO: This should be similar to the incoming values check in foldOpIntoPhi:
+ // we are guarding against replicating the binop in >1 predecessor.
+ // This could miss matching a phi with 2 constant incoming values.
+ auto *Phi0 = dyn_cast<PHINode>(BO.getOperand(0));
+ auto *Phi1 = dyn_cast<PHINode>(BO.getOperand(1));
+ if (!Phi0 || !Phi1 || !Phi0->hasOneUse() || !Phi1->hasOneUse() ||
+ Phi0->getNumOperands() != 2 || Phi1->getNumOperands() != 2)
+ return nullptr;
+
+ // TODO: Remove the restriction for binop being in the same block as the phis.
+ if (BO.getParent() != Phi0->getParent() ||
+ BO.getParent() != Phi1->getParent())
+ return nullptr;
+
+ // Match a pair of incoming constants for one of the predecessor blocks.
+ BasicBlock *ConstBB, *OtherBB;
+ Constant *C0, *C1;
+ if (match(Phi0->getIncomingValue(0), m_ImmConstant(C0))) {
+ ConstBB = Phi0->getIncomingBlock(0);
+ OtherBB = Phi0->getIncomingBlock(1);
+ } else if (match(Phi0->getIncomingValue(1), m_ImmConstant(C0))) {
+ ConstBB = Phi0->getIncomingBlock(1);
+ OtherBB = Phi0->getIncomingBlock(0);
+ } else {
+ return nullptr;
+ }
+ if (!match(Phi1->getIncomingValueForBlock(ConstBB), m_ImmConstant(C1)))
+ return nullptr;
+
+ // The block that we are hoisting to must reach here unconditionally.
+ // Otherwise, we could be speculatively executing an expensive or
+ // non-speculative op.
+ auto *PredBlockBranch = dyn_cast<BranchInst>(OtherBB->getTerminator());
+ if (!PredBlockBranch || PredBlockBranch->isConditional() ||
+ !DT.isReachableFromEntry(OtherBB))
+ return nullptr;
+
+ // TODO: This check could be tightened to only apply to binops (div/rem) that
+ // are not safe to speculatively execute. But that could allow hoisting
+ // potentially expensive instructions (fdiv for example).
+ for (auto BBIter = BO.getParent()->begin(); &*BBIter != &BO; ++BBIter)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*BBIter))
+ return nullptr;
+
+ // Make a new binop in the predecessor block with the non-constant incoming
+ // values.
+ Builder.SetInsertPoint(PredBlockBranch);
+ Value *NewBO = Builder.CreateBinOp(BO.getOpcode(),
+ Phi0->getIncomingValueForBlock(OtherBB),
+ Phi1->getIncomingValueForBlock(OtherBB));
+ if (auto *NotFoldedNewBO = dyn_cast<BinaryOperator>(NewBO))
+ NotFoldedNewBO->copyIRFlags(&BO);
+
+ // Fold constants for the predecessor block with constant incoming values.
+ Constant *NewC = ConstantExpr::get(BO.getOpcode(), C0, C1);
+
+ // Replace the binop with a phi of the new values. The old phis are dead.
+ PHINode *NewPhi = PHINode::Create(BO.getType(), 2);
+ NewPhi->addIncoming(NewBO, OtherBB);
+ NewPhi->addIncoming(NewC, ConstBB);
+ return NewPhi;
+}
+
Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
if (!isa<Constant>(I.getOperand(1)))
return nullptr;
@@ -1307,10 +1369,11 @@ Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
/// is a sequence of GEP indices into the pointed type that will land us at the
/// specified offset. If so, fill them into NewIndices and return the resultant
/// element type, otherwise return null.
-Type *
-InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t IntOffset,
- SmallVectorImpl<Value *> &NewIndices) {
- Type *Ty = PtrTy->getElementType();
+static Type *findElementAtOffset(PointerType *PtrTy, int64_t IntOffset,
+ SmallVectorImpl<Value *> &NewIndices,
+ const DataLayout &DL) {
+ // Only used by visitGEPOfBitcast(), which is skipped for opaque pointers.
+ Type *Ty = PtrTy->getNonOpaquePointerElementType();
if (!Ty->isSized())
return nullptr;
@@ -1320,7 +1383,7 @@ InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t IntOffset,
return nullptr;
for (const APInt &Index : Indices)
- NewIndices.push_back(Builder.getInt(Index));
+ NewIndices.push_back(ConstantInt::get(PtrTy->getContext(), Index));
return Ty;
}
@@ -1884,12 +1947,254 @@ static Instruction *foldSelectGEP(GetElementPtrInst &GEP,
return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel);
}
+Instruction *InstCombinerImpl::visitGEPOfGEP(GetElementPtrInst &GEP,
+ GEPOperator *Src) {
+ // Combine Indices - If the source pointer to this getelementptr instruction
+ // is a getelementptr instruction with matching element type, combine the
+ // indices of the two getelementptr instructions into a single instruction.
+ if (Src->getResultElementType() != GEP.getSourceElementType())
+ return nullptr;
+
+ if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
+ return nullptr;
+
+ if (Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
+ Src->hasOneUse()) {
+ Value *GO1 = GEP.getOperand(1);
+ Value *SO1 = Src->getOperand(1);
+
+ if (LI) {
+ // Try to reassociate loop invariant GEP chains to enable LICM.
+ if (Loop *L = LI->getLoopFor(GEP.getParent())) {
+ // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
+ // invariant: this breaks the dependence between GEPs and allows LICM
+ // to hoist the invariant part out of the loop.
+ if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
+ // We have to be careful here.
+ // We have something like:
+ // %src = getelementptr <ty>, <ty>* %base, <ty> %idx
+ // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2
+ // If we just swap idx & idx2 then we could inadvertently
+ // change %src from a vector to a scalar, or vice versa.
+ // Cases:
+ // 1) %base a scalar & idx a scalar & idx2 a vector
+ // => Swapping idx & idx2 turns %src into a vector type.
+ // 2) %base a scalar & idx a vector & idx2 a scalar
+ // => Swapping idx & idx2 turns %src into a scalar type
+ // 3) %base, %idx, and %idx2 are scalars
+ // => %src & %gep are scalars
+ // => swapping idx & idx2 is safe
+ // 4) %base a vector
+ // => %src is a vector
+ // => swapping idx & idx2 is safe.
+ auto *SO0 = Src->getOperand(0);
+ auto *SO0Ty = SO0->getType();
+ if (!isa<VectorType>(GEP.getType()) || // case 3
+ isa<VectorType>(SO0Ty)) { // case 4
+ Src->setOperand(1, GO1);
+ GEP.setOperand(1, SO1);
+ return &GEP;
+ } else {
+ // Case 1 or 2
+ // -- have to recreate %src & %gep
+ // put NewSrc at same location as %src
+ Builder.SetInsertPoint(cast<Instruction>(Src));
+ Value *NewSrc = Builder.CreateGEP(
+ GEP.getSourceElementType(), SO0, GO1, Src->getName());
+ // Propagate 'inbounds' if the new source was not constant-folded.
+ if (auto *NewSrcGEPI = dyn_cast<GetElementPtrInst>(NewSrc))
+ NewSrcGEPI->setIsInBounds(Src->isInBounds());
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ GEP.getSourceElementType(), NewSrc, {SO1});
+ NewGEP->setIsInBounds(GEP.isInBounds());
+ return NewGEP;
+ }
+ }
+ }
+ }
+ }
+
+ // Note that if our source is a gep chain itself then we wait for that
+ // chain to be resolved before we perform this transformation. This
+ // avoids us creating a TON of code in some cases.
+ if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
+ if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
+ return nullptr; // Wait until our source is folded to completion.
+
+ SmallVector<Value*, 8> Indices;
+
+ // Find out whether the last index in the source GEP is a sequential idx.
+ bool EndsWithSequential = false;
+ for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
+ I != E; ++I)
+ EndsWithSequential = I.isSequential();
+
+ // Can we combine the two pointer arithmetics offsets?
+ if (EndsWithSequential) {
+ // Replace: gep (gep %P, long B), long A, ...
+ // With: T = long A+B; gep %P, T, ...
+ Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
+ Value *GO1 = GEP.getOperand(1);
+
+ // If they aren't the same type, then the input hasn't been processed
+ // by the loop above yet (which canonicalizes sequential index types to
+ // intptr_t). Just avoid transforming this until the input has been
+ // normalized.
+ if (SO1->getType() != GO1->getType())
+ return nullptr;
+
+ Value *Sum =
+ SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
+ // Only do the combine when we are sure the cost after the
+ // merge is never more than that before the merge.
+ if (Sum == nullptr)
+ return nullptr;
+
+ // Update the GEP in place if possible.
+ if (Src->getNumOperands() == 2) {
+ GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)));
+ replaceOperand(GEP, 0, Src->getOperand(0));
+ replaceOperand(GEP, 1, Sum);
+ return &GEP;
+ }
+ Indices.append(Src->op_begin()+1, Src->op_end()-1);
+ Indices.push_back(Sum);
+ Indices.append(GEP.op_begin()+2, GEP.op_end());
+ } else if (isa<Constant>(*GEP.idx_begin()) &&
+ cast<Constant>(*GEP.idx_begin())->isNullValue() &&
+ Src->getNumOperands() != 1) {
+ // Otherwise we can do the fold if the first index of the GEP is a zero
+ Indices.append(Src->op_begin()+1, Src->op_end());
+ Indices.append(GEP.idx_begin()+1, GEP.idx_end());
+ }
+
+ if (!Indices.empty())
+ return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))
+ ? GetElementPtrInst::CreateInBounds(
+ Src->getSourceElementType(), Src->getOperand(0), Indices,
+ GEP.getName())
+ : GetElementPtrInst::Create(Src->getSourceElementType(),
+ Src->getOperand(0), Indices,
+ GEP.getName());
+
+ return nullptr;
+}
+
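The sequential-index case merged above is ordinary pointer-arithmetic reassociation: gep (gep %P, B), A becomes gep %P, A + B. A source-level analogy with illustrative names only (not code from the patch); the offsets stay in bounds so the C++ is well defined:

#include <cassert>

static int *chained(int *P, long B, long A) {
  int *Src = P + B; // inner GEP: gep %P, B
  return Src + A;   // outer GEP: gep %Src, A
}

static int *merged(int *P, long B, long A) {
  return P + (A + B); // gep %P, A + B
}

int main() {
  int Buf[16] = {};
  assert(chained(Buf, 3, 5) == merged(Buf, 3, 5));
  assert(chained(Buf + 8, 4, -2) == merged(Buf + 8, 4, -2));
  return 0;
}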
+// Note that we may have also stripped an address space cast in between.
+Instruction *InstCombinerImpl::visitGEPOfBitcast(BitCastInst *BCI,
+ GetElementPtrInst &GEP) {
+ // With opaque pointers, there is no pointer element type we can use to
+ // adjust the GEP type.
+ PointerType *SrcType = cast<PointerType>(BCI->getSrcTy());
+ if (SrcType->isOpaque())
+ return nullptr;
+
+ Type *GEPEltType = GEP.getSourceElementType();
+ Type *SrcEltType = SrcType->getNonOpaquePointerElementType();
+ Value *SrcOp = BCI->getOperand(0);
+
+ // GEP directly using the source operand if this GEP is accessing an element
+ // of a bitcasted pointer to vector or array of the same dimensions:
+ // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z
+ // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
+ auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy,
+ const DataLayout &DL) {
+ auto *VecVTy = cast<FixedVectorType>(VecTy);
+ return ArrTy->getArrayElementType() == VecVTy->getElementType() &&
+ ArrTy->getArrayNumElements() == VecVTy->getNumElements() &&
+ DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy);
+ };
+ if (GEP.getNumOperands() == 3 &&
+ ((GEPEltType->isArrayTy() && isa<FixedVectorType>(SrcEltType) &&
+ areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) ||
+ (isa<FixedVectorType>(GEPEltType) && SrcEltType->isArrayTy() &&
+ areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) {
+
+ // Create a new GEP here, as using `setOperand()` followed by
+ // `setSourceElementType()` won't actually update the type of the
+ // existing GEP Value, causing issues if this Value is accessed when
+ // constructing an AddrSpaceCastInst.
+ SmallVector<Value *, 8> Indices(GEP.indices());
+ Value *NGEP = GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, Indices)
+ : Builder.CreateGEP(SrcEltType, SrcOp, Indices);
+ NGEP->takeName(&GEP);
+
+ // Preserve GEP address space to satisfy users
+ if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(NGEP, GEP.getType());
+
+ return replaceInstUsesWith(GEP, NGEP);
+ }
+
+ // See if we can simplify:
+ // X = bitcast A* to B*
+ // Y = gep X, <...constant indices...>
+ // into a gep of the original struct. This is important for SROA and alias
+ // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
+ unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEP.getType());
+ APInt Offset(OffsetBits, 0);
+
+ // If the bitcast argument is an allocation, the bitcast is for conversion
+ // to the actual type of the allocation. Removing such bitcasts results in having
+ // GEPs with i8* base and pure byte offsets. That means GEP is not aware of
+ // struct or array hierarchy.
+ // By avoiding such GEPs, phi translation and MemoryDependencyAnalysis have
+ // a better chance to succeed.
+ if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset) &&
+ !isAllocationFn(SrcOp, &TLI)) {
+ // If this GEP instruction doesn't move the pointer, just replace the GEP
+ // with a bitcast of the real input to the dest type.
+ if (!Offset) {
+ // If the bitcast is of an allocation, and the allocation will be
+ // converted to match the type of the cast, don't touch this.
+ if (isa<AllocaInst>(SrcOp)) {
+ // See if the bitcast simplifies, if so, don't nuke this GEP yet.
+ if (Instruction *I = visitBitCast(*BCI)) {
+ if (I != BCI) {
+ I->takeName(BCI);
+ BCI->getParent()->getInstList().insert(BCI->getIterator(), I);
+ replaceInstUsesWith(*BCI, I);
+ }
+ return &GEP;
+ }
+ }
+
+ if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(SrcOp, GEP.getType());
+ return new BitCastInst(SrcOp, GEP.getType());
+ }
+
+ // Otherwise, if the offset is non-zero, we need to find out if there is a
+ // field at Offset in 'A's type. If so, we can pull the cast through the
+ // GEP.
+ SmallVector<Value*, 8> NewIndices;
+ if (findElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices, DL)) {
+ Value *NGEP =
+ GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices)
+ : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices);
+
+ if (NGEP->getType() == GEP.getType())
+ return replaceInstUsesWith(GEP, NGEP);
+ NGEP->takeName(&GEP);
+
+ if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(NGEP, GEP.getType());
+ return new BitCastInst(NGEP, GEP.getType());
+ }
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
- SmallVector<Value *, 8> Ops(GEP.operands());
+ Value *PtrOp = GEP.getOperand(0);
+ SmallVector<Value *, 8> Indices(GEP.indices());
Type *GEPType = GEP.getType();
Type *GEPEltType = GEP.getSourceElementType();
bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType);
- if (Value *V = SimplifyGEPInst(GEPEltType, Ops, GEP.isInBounds(),
+ if (Value *V = SimplifyGEPInst(GEPEltType, PtrOp, Indices, GEP.isInBounds(),
SQ.getWithInstruction(&GEP)))
return replaceInstUsesWith(GEP, V);
@@ -1912,8 +2217,6 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// undef elements to decrease demanded bits
}
- Value *PtrOp = GEP.getOperand(0);
-
// Eliminate unneeded casts for indices, and replace indices which displace
// by multiples of a zero size type with zero.
bool MadeChange = false;
@@ -2063,132 +2366,9 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
PtrOp = NewGEP;
}
- // Combine Indices - If the source pointer to this getelementptr instruction
- // is a getelementptr instruction, combine the indices of the two
- // getelementptr instructions into a single instruction.
- if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) {
- if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
- return nullptr;
-
- if (Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
- Src->hasOneUse()) {
- Value *GO1 = GEP.getOperand(1);
- Value *SO1 = Src->getOperand(1);
-
- if (LI) {
- // Try to reassociate loop invariant GEP chains to enable LICM.
- if (Loop *L = LI->getLoopFor(GEP.getParent())) {
- // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
- // invariant: this breaks the dependence between GEPs and allows LICM
- // to hoist the invariant part out of the loop.
- if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
- // We have to be careful here.
- // We have something like:
- // %src = getelementptr <ty>, <ty>* %base, <ty> %idx
- // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2
- // If we just swap idx & idx2 then we could inadvertantly
- // change %src from a vector to a scalar, or vice versa.
- // Cases:
- // 1) %base a scalar & idx a scalar & idx2 a vector
- // => Swapping idx & idx2 turns %src into a vector type.
- // 2) %base a scalar & idx a vector & idx2 a scalar
- // => Swapping idx & idx2 turns %src in a scalar type
- // 3) %base, %idx, and %idx2 are scalars
- // => %src & %gep are scalars
- // => swapping idx & idx2 is safe
- // 4) %base a vector
- // => %src is a vector
- // => swapping idx & idx2 is safe.
- auto *SO0 = Src->getOperand(0);
- auto *SO0Ty = SO0->getType();
- if (!isa<VectorType>(GEPType) || // case 3
- isa<VectorType>(SO0Ty)) { // case 4
- Src->setOperand(1, GO1);
- GEP.setOperand(1, SO1);
- return &GEP;
- } else {
- // Case 1 or 2
- // -- have to recreate %src & %gep
- // put NewSrc at same location as %src
- Builder.SetInsertPoint(cast<Instruction>(PtrOp));
- Value *NewSrc =
- Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName());
- // Propagate 'inbounds' if the new source was not constant-folded.
- if (auto *NewSrcGEPI = dyn_cast<GetElementPtrInst>(NewSrc))
- NewSrcGEPI->setIsInBounds(Src->isInBounds());
- GetElementPtrInst *NewGEP =
- GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1});
- NewGEP->setIsInBounds(GEP.isInBounds());
- return NewGEP;
- }
- }
- }
- }
- }
-
- // Note that if our source is a gep chain itself then we wait for that
- // chain to be resolved before we perform this transformation. This
- // avoids us creating a TON of code in some cases.
- if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
- if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
- return nullptr; // Wait until our source is folded to completion.
-
- SmallVector<Value*, 8> Indices;
-
- // Find out whether the last index in the source GEP is a sequential idx.
- bool EndsWithSequential = false;
- for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
- I != E; ++I)
- EndsWithSequential = I.isSequential();
-
- // Can we combine the two pointer arithmetics offsets?
- if (EndsWithSequential) {
- // Replace: gep (gep %P, long B), long A, ...
- // With: T = long A+B; gep %P, T, ...
- Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
- Value *GO1 = GEP.getOperand(1);
-
- // If they aren't the same type, then the input hasn't been processed
- // by the loop above yet (which canonicalizes sequential index types to
- // intptr_t). Just avoid transforming this until the input has been
- // normalized.
- if (SO1->getType() != GO1->getType())
- return nullptr;
-
- Value *Sum =
- SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
- // Only do the combine when we are sure the cost after the
- // merge is never more than that before the merge.
- if (Sum == nullptr)
- return nullptr;
-
- // Update the GEP in place if possible.
- if (Src->getNumOperands() == 2) {
- GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)));
- replaceOperand(GEP, 0, Src->getOperand(0));
- replaceOperand(GEP, 1, Sum);
- return &GEP;
- }
- Indices.append(Src->op_begin()+1, Src->op_end()-1);
- Indices.push_back(Sum);
- Indices.append(GEP.op_begin()+2, GEP.op_end());
- } else if (isa<Constant>(*GEP.idx_begin()) &&
- cast<Constant>(*GEP.idx_begin())->isNullValue() &&
- Src->getNumOperands() != 1) {
- // Otherwise we can do the fold if the first index of the GEP is a zero
- Indices.append(Src->op_begin()+1, Src->op_end());
- Indices.append(GEP.idx_begin()+1, GEP.idx_end());
- }
-
- if (!Indices.empty())
- return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))
- ? GetElementPtrInst::CreateInBounds(
- Src->getSourceElementType(), Src->getOperand(0), Indices,
- GEP.getName())
- : GetElementPtrInst::Create(Src->getSourceElementType(),
- Src->getOperand(0), Indices,
- GEP.getName());
- }
+ if (auto *Src = dyn_cast<GEPOperator>(PtrOp))
+ if (Instruction *I = visitGEPOfGEP(GEP, Src))
+ return I;
// Skip if GEP source element type is scalable. The type alloc size is unknown
// at compile-time.
@@ -2234,9 +2414,13 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Value *StrippedPtr = PtrOp->stripPointerCasts();
PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType());
- if (StrippedPtr != PtrOp) {
+ // TODO: The basic approach of these folds is not compatible with opaque
+ // pointers, because we can't use bitcasts as a hint for a desirable GEP
+ // type. Instead, we should perform canonicalization directly on the GEP
+ // type. For now, skip these.
+ if (StrippedPtr != PtrOp && !StrippedPtrTy->isOpaque()) {
bool HasZeroPointerIndex = false;
- Type *StrippedPtrEltTy = StrippedPtrTy->getElementType();
+ Type *StrippedPtrEltTy = StrippedPtrTy->getNonOpaquePointerElementType();
if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
HasZeroPointerIndex = C->isZero();
@@ -2420,103 +2604,9 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
ASCStrippedPtrOp = BC;
}
- if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) {
- Value *SrcOp = BCI->getOperand(0);
- PointerType *SrcType = cast<PointerType>(BCI->getSrcTy());
- Type *SrcEltType = SrcType->getElementType();
-
- // GEP directly using the source operand if this GEP is accessing an element
- // of a bitcasted pointer to vector or array of the same dimensions:
- // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z
- // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
- auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy,
- const DataLayout &DL) {
- auto *VecVTy = cast<FixedVectorType>(VecTy);
- return ArrTy->getArrayElementType() == VecVTy->getElementType() &&
- ArrTy->getArrayNumElements() == VecVTy->getNumElements() &&
- DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy);
- };
- if (GEP.getNumOperands() == 3 &&
- ((GEPEltType->isArrayTy() && isa<FixedVectorType>(SrcEltType) &&
- areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) ||
- (isa<FixedVectorType>(GEPEltType) && SrcEltType->isArrayTy() &&
- areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) {
-
- // Create a new GEP here, as using `setOperand()` followed by
- // `setSourceElementType()` won't actually update the type of the
- // existing GEP Value. Causing issues if this Value is accessed when
- // constructing an AddrSpaceCastInst
- Value *NGEP =
- GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]})
- : Builder.CreateGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]});
- NGEP->takeName(&GEP);
-
- // Preserve GEP address space to satisfy users
- if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(NGEP, GEPType);
-
- return replaceInstUsesWith(GEP, NGEP);
- }
-
- // See if we can simplify:
- // X = bitcast A* to B*
- // Y = gep X, <...constant indices...>
- // into a gep of the original struct. This is important for SROA and alias
- // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
- unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType);
- APInt Offset(OffsetBits, 0);
-
- // If the bitcast argument is an allocation, The bitcast is for convertion
- // to actual type of allocation. Removing such bitcasts, results in having
- // GEPs with i8* base and pure byte offsets. That means GEP is not aware of
- // struct or array hierarchy.
- // By avoiding such GEPs, phi translation and MemoryDependencyAnalysis have
- // a better chance to succeed.
- if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset) &&
- !isAllocationFn(SrcOp, &TLI)) {
- // If this GEP instruction doesn't move the pointer, just replace the GEP
- // with a bitcast of the real input to the dest type.
- if (!Offset) {
- // If the bitcast is of an allocation, and the allocation will be
- // converted to match the type of the cast, don't touch this.
- if (isa<AllocaInst>(SrcOp)) {
- // See if the bitcast simplifies, if so, don't nuke this GEP yet.
- if (Instruction *I = visitBitCast(*BCI)) {
- if (I != BCI) {
- I->takeName(BCI);
- BCI->getParent()->getInstList().insert(BCI->getIterator(), I);
- replaceInstUsesWith(*BCI, I);
- }
- return &GEP;
- }
- }
-
- if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(SrcOp, GEPType);
- return new BitCastInst(SrcOp, GEPType);
- }
-
- // Otherwise, if the offset is non-zero, we need to find out if there is a
- // field at Offset in 'A's type. If so, we can pull the cast through the
- // GEP.
- SmallVector<Value*, 8> NewIndices;
- if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) {
- Value *NGEP =
- GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices)
- : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices);
-
- if (NGEP->getType() == GEPType)
- return replaceInstUsesWith(GEP, NGEP);
- NGEP->takeName(&GEP);
-
- if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(NGEP, GEPType);
- return new BitCastInst(NGEP, GEPType);
- }
- }
- }
+ if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp))
+ if (Instruction *I = visitGEPOfBitcast(BCI, GEP))
+ return I;
if (!GEP.isInBounds()) {
unsigned IdxWidth =
@@ -2533,8 +2623,7 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
DL.getTypeAllocSize(AI->getAllocatedType()).getKnownMinSize());
if (BasePtrOffset.ule(AllocSize)) {
return GetElementPtrInst::CreateInBounds(
- GEP.getSourceElementType(), PtrOp, makeArrayRef(Ops).slice(1),
- GEP.getName());
+ GEP.getSourceElementType(), PtrOp, Indices, GEP.getName());
}
}
}
@@ -2553,10 +2642,6 @@ static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo &TLI,
if (auto *LI = dyn_cast<LoadInst>(V))
return isa<GlobalVariable>(LI->getPointerOperand());
// Two distinct allocations will never be equal.
- // We rely on LookThroughBitCast in isAllocLikeFn being false, since looking
- // through bitcasts of V can cause
- // the result statement below to be true, even when AI and V (ex:
- // i8* ->i32* ->i8* of AI) are the same allocations.
return isAllocLikeFn(V, &TLI) && V != AI;
}
@@ -2659,7 +2744,7 @@ static bool isAllocSiteRemovable(Instruction *AI,
continue;
}
- if (isReallocLikeFn(I, &TLI, true)) {
+ if (isReallocLikeFn(I, &TLI)) {
Users.emplace_back(I);
Worklist.push_back(I);
continue;
@@ -2682,6 +2767,8 @@ static bool isAllocSiteRemovable(Instruction *AI,
}
Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
+ assert(isa<AllocaInst>(MI) || isAllocRemovable(&cast<CallBase>(MI), &TLI));
+
// If we have a malloc call which is only used in any amount of comparisons to
// null and free calls, delete the calls and replace the comparisons with true
// or false as appropriate.
@@ -2900,7 +2987,7 @@ Instruction *InstCombinerImpl::visitFree(CallInst &FI) {
// If we had free(realloc(...)) with no intervening uses, then eliminate the
// realloc() entirely.
if (CallInst *CI = dyn_cast<CallInst>(Op)) {
- if (CI->hasOneUse() && isReallocLikeFn(CI, &TLI, true)) {
+ if (CI->hasOneUse() && isReallocLikeFn(CI, &TLI)) {
return eraseInstFromFunction(
*replaceInstUsesWith(*CI, CI->getOperand(0)));
}
@@ -3709,16 +3796,61 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
return nullptr;
}
+/// Check for the case where the call writes to an otherwise dead alloca. This
+/// shows up for unused out-params in idiomatic C/C++ code. Note that this
+/// helper *only* analyzes the write; it doesn't check any other legality aspect.
+static bool SoleWriteToDeadLocal(Instruction *I, TargetLibraryInfo &TLI) {
+ auto *CB = dyn_cast<CallBase>(I);
+ if (!CB)
+ // TODO: handle e.g. store to alloca here - only worth doing if we extend
+ // to allow reload along used path as described below. Otherwise, this
+ // is simply a store to a dead allocation which will be removed.
+ return false;
+ Optional<MemoryLocation> Dest = MemoryLocation::getForDest(CB, TLI);
+ if (!Dest)
+ return false;
+ auto *AI = dyn_cast<AllocaInst>(getUnderlyingObject(Dest->Ptr));
+ if (!AI)
+ // TODO: allow malloc?
+ return false;
+ // TODO: allow memory access dominated by move point? Note that since AI
+ // could have a reference to itself captured by the call, we would need to
+ // account for cycles in doing so.
+ SmallVector<const User *> AllocaUsers;
+ SmallPtrSet<const User *, 4> Visited;
+ auto pushUsers = [&](const Instruction &I) {
+ for (const User *U : I.users()) {
+ if (Visited.insert(U).second)
+ AllocaUsers.push_back(U);
+ }
+ };
+ pushUsers(*AI);
+ while (!AllocaUsers.empty()) {
+ auto *UserI = cast<Instruction>(AllocaUsers.pop_back_val());
+ if (isa<BitCastInst>(UserI) || isa<GetElementPtrInst>(UserI) ||
+ isa<AddrSpaceCastInst>(UserI)) {
+ pushUsers(*UserI);
+ continue;
+ }
+ if (UserI == CB)
+ continue;
+ // TODO: support lifetime.start/end here
+ return false;
+ }
+ return true;
+}
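The shape of code this helper targets is an out-parameter that the caller never reads, written only by the call itself; recognizing it lets the call be sunk to the block where its return value is used. An illustrative stand-in (none of these names come from the patch):

// An out-parameter the caller never reads: the call's only write lands in a
// local that is otherwise dead.
static int getStatusAndCode(int *OutCode) {
  *OutCode = 7;
  return 1;
}

static int useOnlyStatus() {
  int Code;                             // otherwise-dead local (the alloca)
  int Status = getStatusAndCode(&Code); // sole write is to the dead local
  return Status > 0;                    // Code is never read again
}

int main() { return useOnlyStatus() == 1 ? 0 : 1; }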
+
/// Try to move the specified instruction from its current block into the
/// beginning of DestBlock, which can only happen if it's safe to move the
/// instruction past all of the instructions between it and the end of its
/// block.
-static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
+static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock,
+ TargetLibraryInfo &TLI) {
assert(I->getUniqueUndroppableUser() && "Invariants didn't hold!");
BasicBlock *SrcBlock = I->getParent();
// Cannot move control-flow-involving, volatile loads, vaarg, etc.
- if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
+ if (isa<PHINode>(I) || I->isEHPad() || I->mayThrow() || !I->willReturn() ||
I->isTerminator())
return false;
@@ -3738,6 +3870,14 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
if (CI->isConvergent())
return false;
}
+
+ // Unless we can prove that the memory write isn't visible except on the
+ // path we're sinking to, we must bail.
+ if (I->mayWriteToMemory()) {
+ if (!SoleWriteToDeadLocal(I, TLI))
+ return false;
+ }
+
// We can only sink load instructions if there is nothing between the load and
// the end of block that could change the value.
if (I->mayReadFromMemory()) {
@@ -3746,7 +3886,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
// successor block.
if (DestBlock->getUniquePredecessor() != I->getParent())
return false;
- for (BasicBlock::iterator Scan = I->getIterator(),
+ for (BasicBlock::iterator Scan = std::next(I->getIterator()),
E = I->getParent()->end();
Scan != E; ++Scan)
if (Scan->mayWriteToMemory())
@@ -3906,12 +4046,11 @@ bool InstCombinerImpl::run() {
// predecessor, so that we don't have to split the critical edge.
// Another option where we can sink is a block that ends with a
// terminator that does not pass control to other block (such as
- // return or unreachable). In this case:
+ // return or unreachable or resume). In this case:
// - I dominates the User (by SSA form);
// - the User will be executed at most once.
// So sinking I down to User is always profitable or neutral.
- if (UserParent->getUniquePredecessor() == BB ||
- (isa<ReturnInst>(Term) || isa<UnreachableInst>(Term))) {
+ if (UserParent->getUniquePredecessor() == BB || succ_empty(Term)) {
assert(DT.dominates(BB, UserParent) && "Dominance relation broken?");
return UserParent;
}
@@ -3922,7 +4061,7 @@ bool InstCombinerImpl::run() {
if (OptBB) {
auto *UserParent = *OptBB;
// Okay, the CFG is simple enough, try to sink this instruction.
- if (TryToSinkInstruction(I, UserParent)) {
+ if (TryToSinkInstruction(I, UserParent, TLI)) {
LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
MadeIRChange = true;
// We'll add uses of the sunk instruction below, but since
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index bd2dc8d639fc..6e72255e51ae 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1547,10 +1547,9 @@ void AddressSanitizer::getInterestingMemoryOperands(
Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
XCHG->getCompareOperand()->getType(), None);
} else if (auto CI = dyn_cast<CallInst>(I)) {
- auto *F = CI->getCalledFunction();
- if (F && (F->getName().startswith("llvm.masked.load.") ||
- F->getName().startswith("llvm.masked.store."))) {
- bool IsWrite = F->getName().startswith("llvm.masked.store.");
+ if (CI->getIntrinsicID() == Intrinsic::masked_load ||
+ CI->getIntrinsicID() == Intrinsic::masked_store) {
+ bool IsWrite = CI->getIntrinsicID() == Intrinsic::masked_store;
// Masked store has an initial operand for the value.
unsigned OpOffset = IsWrite ? 1 : 0;
if (IsWrite ? !ClInstrumentWrites : !ClInstrumentReads)
@@ -1559,7 +1558,7 @@ void AddressSanitizer::getInterestingMemoryOperands(
auto BasePtr = CI->getOperand(OpOffset);
if (ignoreAccess(LI, BasePtr))
return;
- auto Ty = cast<PointerType>(BasePtr->getType())->getElementType();
+ Type *Ty = IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
MaybeAlign Alignment = Align(1);
// Otherwise no alignment guarantees. We probably got Undef.
if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
@@ -1653,11 +1652,10 @@ static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
const DataLayout &DL, Type *IntptrTy,
Value *Mask, Instruction *I,
Value *Addr, MaybeAlign Alignment,
- unsigned Granularity, uint32_t TypeSize,
+ unsigned Granularity, Type *OpType,
bool IsWrite, Value *SizeArgument,
bool UseCalls, uint32_t Exp) {
- auto *VTy = cast<FixedVectorType>(
- cast<PointerType>(Addr->getType())->getElementType());
+ auto *VTy = cast<FixedVectorType>(OpType);
uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
unsigned Num = VTy->getNumElements();
auto Zero = ConstantInt::get(IntptrTy, 0);
@@ -1735,7 +1733,7 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
unsigned Granularity = 1 << Mapping.Scale;
if (O.MaybeMask) {
instrumentMaskedLoadOrStore(this, DL, IntptrTy, O.MaybeMask, O.getInsn(),
- Addr, O.Alignment, Granularity, O.TypeSize,
+ Addr, O.Alignment, Granularity, O.OpType,
O.IsWrite, nullptr, UseCalls, Exp);
} else {
doInstrumentAddress(this, O.getInsn(), O.getInsn(), Addr, O.Alignment,
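The ASan hunks above stop reading the access type off the pointer operand's pointee type (which breaks under opaque pointers) and instead derive it from the masked intrinsic itself. A minimal sketch of that derivation, assuming CI is already known to be one of the two masked intrinsics:

    #include "llvm/IR/IntrinsicInst.h"
    using namespace llvm;

    // Accessed type of llvm.masked.load / llvm.masked.store without consulting
    // the pointer operand's element type.
    static Type *getMaskedAccessTy(CallInst *CI) {
      bool IsWrite = CI->getIntrinsicID() == Intrinsic::masked_store;
      return IsWrite ? CI->getArgOperand(0)->getType() : CI->getType();
    }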
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 9f26b37bbc79..ff3aa14a2a83 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -208,6 +208,14 @@ static cl::opt<bool> ClEventCallbacks(
cl::desc("Insert calls to __dfsan_*_callback functions on data events."),
cl::Hidden, cl::init(false));
+// Experimental feature that inserts callbacks for conditionals, including:
+// conditional branch, switch, select.
+// This must be true for dfsan_set_conditional_callback() to have effect.
+static cl::opt<bool> ClConditionalCallbacks(
+ "dfsan-conditional-callbacks",
+ cl::desc("Insert calls to callback functions on conditionals."), cl::Hidden,
+ cl::init(false));
+
// Controls whether the pass tracks the control flow of select instructions.
static cl::opt<bool> ClTrackSelectControlFlow(
"dfsan-track-select-control-flow",
@@ -428,6 +436,8 @@ class DataFlowSanitizer {
FunctionType *DFSanSetLabelFnTy;
FunctionType *DFSanNonzeroLabelFnTy;
FunctionType *DFSanVarargWrapperFnTy;
+ FunctionType *DFSanConditionalCallbackFnTy;
+ FunctionType *DFSanConditionalCallbackOriginFnTy;
FunctionType *DFSanCmpCallbackFnTy;
FunctionType *DFSanLoadStoreCallbackFnTy;
FunctionType *DFSanMemTransferCallbackFnTy;
@@ -444,6 +454,8 @@ class DataFlowSanitizer {
FunctionCallee DFSanLoadCallbackFn;
FunctionCallee DFSanStoreCallbackFn;
FunctionCallee DFSanMemTransferCallbackFn;
+ FunctionCallee DFSanConditionalCallbackFn;
+ FunctionCallee DFSanConditionalCallbackOriginFn;
FunctionCallee DFSanCmpCallbackFn;
FunctionCallee DFSanChainOriginFn;
FunctionCallee DFSanChainOriginIfTaintedFn;
@@ -454,7 +466,7 @@ class DataFlowSanitizer {
MDNode *OriginStoreWeights;
DFSanABIList ABIList;
DenseMap<Value *, Function *> UnwrappedFnMap;
- AttrBuilder ReadOnlyNoneAttrs;
+ AttributeMask ReadOnlyNoneAttrs;
/// Memory map parameters used in calculation mapping application addresses
/// to shadow addresses and origin addresses.
@@ -642,6 +654,10 @@ struct DFSanFunction {
Align getShadowAlign(Align InstAlignment);
+ // If ClConditionalCallbacks is enabled, insert a callback after a given
+ // branch instruction using the given conditional expression.
+ void addConditionalCallbacksIfEnabled(Instruction &I, Value *Condition);
+
private:
/// Collapses the shadow with aggregate type into a single primitive shadow
/// value.
@@ -748,6 +764,8 @@ public:
void visitSelectInst(SelectInst &I);
void visitMemSetInst(MemSetInst &I);
void visitMemTransferInst(MemTransferInst &I);
+ void visitBranchInst(BranchInst &BR);
+ void visitSwitchInst(SwitchInst &SW);
private:
void visitCASOrRMW(Align InstAlignment, Instruction &I);
@@ -971,6 +989,22 @@ Value *DFSanFunction::collapseToPrimitiveShadow(Value *Shadow,
return PrimitiveShadow;
}
+void DFSanFunction::addConditionalCallbacksIfEnabled(Instruction &I,
+ Value *Condition) {
+ if (!ClConditionalCallbacks) {
+ return;
+ }
+ IRBuilder<> IRB(&I);
+ Value *CondShadow = getShadow(Condition);
+ if (DFS.shouldTrackOrigins()) {
+ Value *CondOrigin = getOrigin(Condition);
+ IRB.CreateCall(DFS.DFSanConditionalCallbackOriginFn,
+ {CondShadow, CondOrigin});
+ } else {
+ IRB.CreateCall(DFS.DFSanConditionalCallbackFn, {CondShadow});
+ }
+}
+
Type *DataFlowSanitizer::getShadowTy(Type *OrigTy) {
if (!OrigTy->isSized())
return PrimitiveShadowTy;
@@ -1032,6 +1066,13 @@ bool DataFlowSanitizer::initializeModule(Module &M) {
FunctionType::get(Type::getVoidTy(*Ctx), None, /*isVarArg=*/false);
DFSanVarargWrapperFnTy = FunctionType::get(
Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
+ DFSanConditionalCallbackFnTy =
+ FunctionType::get(Type::getVoidTy(*Ctx), PrimitiveShadowTy,
+ /*isVarArg=*/false);
+ Type *DFSanConditionalCallbackOriginArgs[2] = {PrimitiveShadowTy, OriginTy};
+ DFSanConditionalCallbackOriginFnTy = FunctionType::get(
+ Type::getVoidTy(*Ctx), DFSanConditionalCallbackOriginArgs,
+ /*isVarArg=*/false);
DFSanCmpCallbackFnTy =
FunctionType::get(Type::getVoidTy(*Ctx), PrimitiveShadowTy,
/*isVarArg=*/false);
@@ -1160,7 +1201,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
// F is called by a wrapped custom function with primitive shadows. So
// its arguments and return value need conversion.
DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true,
- /*ForceZeroLabels=*/false);
+ /*IsForceZeroLabels=*/false);
Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI;
++ValAI;
for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) {
@@ -1271,6 +1312,10 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
DFSanRuntimeFunctions.insert(
DFSanMemTransferCallbackFn.getCallee()->stripPointerCasts());
DFSanRuntimeFunctions.insert(
+ DFSanConditionalCallbackFn.getCallee()->stripPointerCasts());
+ DFSanRuntimeFunctions.insert(
+ DFSanConditionalCallbackOriginFn.getCallee()->stripPointerCasts());
+ DFSanRuntimeFunctions.insert(
DFSanCmpCallbackFn.getCallee()->stripPointerCasts());
DFSanRuntimeFunctions.insert(
DFSanChainOriginFn.getCallee()->stripPointerCasts());
@@ -1292,6 +1337,12 @@ void DataFlowSanitizer::initializeCallbackFunctions(Module &M) {
"__dfsan_mem_transfer_callback", DFSanMemTransferCallbackFnTy);
DFSanCmpCallbackFn =
Mod->getOrInsertFunction("__dfsan_cmp_callback", DFSanCmpCallbackFnTy);
+
+ DFSanConditionalCallbackFn = Mod->getOrInsertFunction(
+ "__dfsan_conditional_callback", DFSanConditionalCallbackFnTy);
+ DFSanConditionalCallbackOriginFn =
+ Mod->getOrInsertFunction("__dfsan_conditional_callback_origin",
+ DFSanConditionalCallbackOriginFnTy);
}
void DataFlowSanitizer::injectMetadataGlobals(Module &M) {
@@ -2593,6 +2644,8 @@ void DFSanVisitor::visitSelectInst(SelectInst &I) {
Value *FalseOrigin =
ShouldTrackOrigins ? DFSF.getOrigin(I.getFalseValue()) : nullptr;
+ DFSF.addConditionalCallbacksIfEnabled(I, I.getCondition());
+
if (isa<VectorType>(I.getCondition()->getType())) {
ShadowSel = DFSF.combineShadowsThenConvert(I.getType(), TrueShadow,
FalseShadow, &I);
@@ -2683,6 +2736,17 @@ void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
}
}
+void DFSanVisitor::visitBranchInst(BranchInst &BR) {
+ if (!BR.isConditional())
+ return;
+
+ DFSF.addConditionalCallbacksIfEnabled(BR, BR.getCondition());
+}
+
+void DFSanVisitor::visitSwitchInst(SwitchInst &SW) {
+ DFSF.addConditionalCallbacksIfEnabled(SW, SW.getCondition());
+}
+
static bool isAMustTailRetVal(Value *RetVal) {
// Tail call may have a bitcast between return.
if (auto *I = dyn_cast<BitCastInst>(RetVal)) {
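For reference, the two new runtime hooks are declared above as void(shadow) and void(shadow, origin) and are invoked before conditional branches, switches, and selects when -dfsan-conditional-callbacks is set. A hedged sketch of what the runtime side could look like; the dfsan_label/dfsan_origin typedefs below are assumptions, not part of this patch:

    #include <cstdint>
    typedef uint8_t dfsan_label;   // assumed primitive-shadow width
    typedef uint32_t dfsan_origin; // assumed origin-id type

    extern "C" void __dfsan_conditional_callback(dfsan_label Label) {
      // e.g. record that a tainted value reached a branch/switch/select condition
      (void)Label;
    }

    extern "C" void __dfsan_conditional_callback_origin(dfsan_label Label,
                                                        dfsan_origin Origin) {
      (void)Label;
      (void)Origin;
    }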
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 8d3bc1383e96..fb10a99d1338 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1403,16 +1403,16 @@ bool HWAddressSanitizer::instrumentStack(
size_t Size = getAllocaSizeInBytes(*AI);
size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
+ auto TagEnd = [&](Instruction *Node) {
+ IRB.SetInsertPoint(Node);
+ Value *UARTag = getUARTag(IRB, StackTag);
+ tagAlloca(IRB, AI, UARTag, AlignedSize);
+ };
bool StandardLifetime =
UnrecognizedLifetimes.empty() && isStandardLifetime(Info, GetDT());
if (DetectUseAfterScope && StandardLifetime) {
IntrinsicInst *Start = Info.LifetimeStart[0];
IRB.SetInsertPoint(Start->getNextNode());
- auto TagEnd = [&](Instruction *Node) {
- IRB.SetInsertPoint(Node);
- Value *UARTag = getUARTag(IRB, StackTag);
- tagAlloca(IRB, AI, UARTag, AlignedSize);
- };
tagAlloca(IRB, AI, Tag, Size);
if (!forAllReachableExits(GetDT(), GetPDT(), Start, Info.LifetimeEnd,
RetVec, TagEnd)) {
@@ -1421,11 +1421,8 @@ bool HWAddressSanitizer::instrumentStack(
}
} else {
tagAlloca(IRB, AI, Tag, Size);
- for (auto *RI : RetVec) {
- IRB.SetInsertPoint(RI);
- Value *UARTag = getUARTag(IRB, StackTag);
- tagAlloca(IRB, AI, UARTag, AlignedSize);
- }
+ for (auto *RI : RetVec)
+ TagEnd(RI);
if (!StandardLifetime) {
for (auto &II : Info.LifetimeStart)
II->eraseFromParent();
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index de34348606ef..ab179b03dd29 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -248,8 +248,7 @@ public:
PGOCounterPromoter(
DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
Loop &CurLoop, LoopInfo &LI, BlockFrequencyInfo *BFI)
- : LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop),
- LI(LI), BFI(BFI) {
+ : LoopToCandidates(LoopToCands), L(CurLoop), LI(LI), BFI(BFI) {
// Skip collection of ExitBlocks and InsertPts for loops that will not be
// able to have counters promoted.
@@ -446,24 +445,19 @@ llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options,
return new InstrProfilingLegacyPass(Options, IsCS);
}
-static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) {
- InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr);
- if (Inc)
- return Inc;
- return dyn_cast<InstrProfIncrementInst>(Instr);
-}
-
bool InstrProfiling::lowerIntrinsics(Function *F) {
bool MadeChange = false;
PromotionCandidates.clear();
for (BasicBlock &BB : *F) {
for (Instruction &Instr : llvm::make_early_inc_range(BB)) {
- InstrProfIncrementInst *Inc = castToIncrementInst(&Instr);
- if (Inc) {
- lowerIncrement(Inc);
+ if (auto *IPIS = dyn_cast<InstrProfIncrementInstStep>(&Instr)) {
+ lowerIncrement(IPIS);
+ MadeChange = true;
+ } else if (auto *IPI = dyn_cast<InstrProfIncrementInst>(&Instr)) {
+ lowerIncrement(IPI);
MadeChange = true;
- } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(&Instr)) {
- lowerValueProfileInst(Ind);
+ } else if (auto *IPVP = dyn_cast<InstrProfValueProfileInst>(&Instr)) {
+ lowerValueProfileInst(IPVP);
MadeChange = true;
}
}
@@ -540,19 +534,14 @@ static bool needsRuntimeHookUnconditionally(const Triple &TT) {
/// Check if the module contains uses of any profiling intrinsics.
static bool containsProfilingIntrinsics(Module &M) {
- if (auto *F = M.getFunction(
- Intrinsic::getName(llvm::Intrinsic::instrprof_increment)))
- if (!F->use_empty())
- return true;
- if (auto *F = M.getFunction(
- Intrinsic::getName(llvm::Intrinsic::instrprof_increment_step)))
- if (!F->use_empty())
- return true;
- if (auto *F = M.getFunction(
- Intrinsic::getName(llvm::Intrinsic::instrprof_value_profile)))
- if (!F->use_empty())
- return true;
- return false;
+ auto containsIntrinsic = [&](int ID) {
+ if (auto *F = M.getFunction(Intrinsic::getName(ID)))
+ return !F->use_empty();
+ return false;
+ };
+ return containsIntrinsic(llvm::Intrinsic::instrprof_increment) ||
+ containsIntrinsic(llvm::Intrinsic::instrprof_increment_step) ||
+ containsIntrinsic(llvm::Intrinsic::instrprof_value_profile);
}
bool InstrProfiling::run(
@@ -771,7 +760,7 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
}
/// Get the name of a profiling variable for a particular function.
-static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix,
+static std::string getVarName(InstrProfInstBase *Inc, StringRef Prefix,
bool &Renamed) {
StringRef NamePrefix = getInstrProfNameVarPrefix();
StringRef Name = Inc->getName()->getName().substr(NamePrefix.size());
@@ -860,7 +849,7 @@ static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) {
}
GlobalVariable *
-InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
+InstrProfiling::getOrCreateRegionCounters(InstrProfInstBase *Inc) {
GlobalVariable *NamePtr = Inc->getName();
auto &PD = ProfileDataMap[NamePtr];
if (PD.RegionCounters)
@@ -997,8 +986,11 @@ InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx));
}
- if (DebugInfoCorrelate)
+ if (DebugInfoCorrelate) {
+ // Mark the counter variable as used so that it isn't optimized out.
+ CompilerUsedVars.push_back(PD.RegionCounters);
return PD.RegionCounters;
+ }
// Create data variable.
auto *IntPtrTy = M->getDataLayout().getIntPtrType(M->getContext());
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index 727672fa0605..8fedefccf0e1 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -156,6 +156,7 @@ struct InterestingMemoryAccess {
Value *Addr = nullptr;
bool IsWrite;
unsigned Alignment;
+ Type *AccessTy;
uint64_t TypeSize;
Value *MaybeMask = nullptr;
};
@@ -181,7 +182,7 @@ public:
Value *Addr, uint32_t TypeSize, bool IsWrite);
void instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask,
Instruction *I, Value *Addr,
- unsigned Alignment, uint32_t TypeSize,
+ unsigned Alignment, Type *AccessTy,
bool IsWrite);
void instrumentMemIntrinsic(MemIntrinsic *MI);
Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
@@ -334,36 +335,32 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const {
InterestingMemoryAccess Access;
- const DataLayout &DL = I->getModule()->getDataLayout();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
if (!ClInstrumentReads)
return None;
Access.IsWrite = false;
- Access.TypeSize = DL.getTypeStoreSizeInBits(LI->getType());
+ Access.AccessTy = LI->getType();
Access.Alignment = LI->getAlignment();
Access.Addr = LI->getPointerOperand();
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
if (!ClInstrumentWrites)
return None;
Access.IsWrite = true;
- Access.TypeSize =
- DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType());
+ Access.AccessTy = SI->getValueOperand()->getType();
Access.Alignment = SI->getAlignment();
Access.Addr = SI->getPointerOperand();
} else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
if (!ClInstrumentAtomics)
return None;
Access.IsWrite = true;
- Access.TypeSize =
- DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType());
+ Access.AccessTy = RMW->getValOperand()->getType();
Access.Alignment = 0;
Access.Addr = RMW->getPointerOperand();
} else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
if (!ClInstrumentAtomics)
return None;
Access.IsWrite = true;
- Access.TypeSize =
- DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType());
+ Access.AccessTy = XCHG->getCompareOperand()->getType();
Access.Alignment = 0;
Access.Addr = XCHG->getPointerOperand();
} else if (auto *CI = dyn_cast<CallInst>(I)) {
@@ -376,16 +373,16 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const {
return None;
// Masked store has an initial operand for the value.
OpOffset = 1;
+ Access.AccessTy = CI->getArgOperand(0)->getType();
Access.IsWrite = true;
} else {
if (!ClInstrumentReads)
return None;
+ Access.AccessTy = CI->getType();
Access.IsWrite = false;
}
auto *BasePtr = CI->getOperand(0 + OpOffset);
- auto *Ty = cast<PointerType>(BasePtr->getType())->getElementType();
- Access.TypeSize = DL.getTypeStoreSizeInBits(Ty);
if (auto *AlignmentConstant =
dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
Access.Alignment = (unsigned)AlignmentConstant->getZExtValue();
@@ -412,15 +409,16 @@ MemProfiler::isInterestingMemoryAccess(Instruction *I) const {
if (Access.Addr->isSwiftError())
return None;
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ Access.TypeSize = DL.getTypeStoreSizeInBits(Access.AccessTy);
return Access;
}
void MemProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask,
Instruction *I, Value *Addr,
unsigned Alignment,
- uint32_t TypeSize, bool IsWrite) {
- auto *VTy = cast<FixedVectorType>(
- cast<PointerType>(Addr->getType())->getElementType());
+ Type *AccessTy, bool IsWrite) {
+ auto *VTy = cast<FixedVectorType>(AccessTy);
uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
unsigned Num = VTy->getNumElements();
auto *Zero = ConstantInt::get(IntptrTy, 0);
@@ -469,7 +467,7 @@ void MemProfiler::instrumentMop(Instruction *I, const DataLayout &DL,
if (Access.MaybeMask) {
instrumentMaskedLoadOrStore(DL, Access.MaybeMask, I, Access.Addr,
- Access.Alignment, Access.TypeSize,
+ Access.Alignment, Access.AccessTy,
Access.IsWrite);
} else {
// Since the access counts will be accumulated across the entire allocation,
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 446e601cd4d7..cfe993dedbc2 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -492,7 +492,7 @@ class MemorySanitizer {
public:
MemorySanitizer(Module &M, MemorySanitizerOptions Options)
: CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins),
- Recover(Options.Recover) {
+ Recover(Options.Recover), EagerChecks(Options.EagerChecks) {
initializeModule(M);
}
@@ -522,6 +522,7 @@ private:
/// Track origins (allocation points) of uninitialized values.
int TrackOrigins;
bool Recover;
+ bool EagerChecks;
LLVMContext *C;
Type *IntptrTy;
@@ -665,10 +666,12 @@ template <class T> T getOptOrDefault(const cl::opt<T> &Opt, T Default) {
} // end anonymous namespace
-MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K)
+MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K,
+ bool EagerChecks)
: Kernel(getOptOrDefault(ClEnableKmsan, K)),
TrackOrigins(getOptOrDefault(ClTrackOrigins, Kernel ? 2 : TO)),
- Recover(getOptOrDefault(ClKeepGoing, Kernel || R)) {}
+ Recover(getOptOrDefault(ClKeepGoing, Kernel || R)),
+ EagerChecks(getOptOrDefault(ClEagerChecks, EagerChecks)) {}
PreservedAnalyses MemorySanitizerPass::run(Function &F,
FunctionAnalysisManager &FAM) {
@@ -695,6 +698,8 @@ void MemorySanitizerPass::printPipeline(
OS << "recover;";
if (Options.Kernel)
OS << "kernel;";
+ if (Options.EagerChecks)
+ OS << "eager-checks;";
OS << "track-origins=" << Options.TrackOrigins;
OS << ">";
}
@@ -1667,9 +1672,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// This function either returns the value set earlier with setShadow,
/// or extracts if from ParamTLS (for function arguments).
Value *getShadow(Value *V) {
- if (!PropagateShadow) return getCleanShadow(V);
if (Instruction *I = dyn_cast<Instruction>(V)) {
- if (I->getMetadata("nosanitize"))
+ if (!PropagateShadow || I->getMetadata("nosanitize"))
return getCleanShadow(V);
// For instructions the shadow is already stored in the map.
Value *Shadow = ShadowMap[V];
@@ -1681,7 +1685,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return Shadow;
}
if (UndefValue *U = dyn_cast<UndefValue>(V)) {
- Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
+ Value *AllOnes = (PropagateShadow && PoisonUndef) ? getPoisonedShadow(V)
+ : getCleanShadow(V);
LLVM_DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
(void)U;
return AllOnes;
@@ -1701,22 +1706,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
continue;
}
- bool FArgByVal = FArg.hasByValAttr();
- bool FArgNoUndef = FArg.hasAttribute(Attribute::NoUndef);
- bool FArgEagerCheck = ClEagerChecks && !FArgByVal && FArgNoUndef;
- unsigned Size =
- FArg.hasByValAttr()
- ? DL.getTypeAllocSize(FArg.getParamByValType())
- : DL.getTypeAllocSize(FArg.getType());
+ unsigned Size = FArg.hasByValAttr()
+ ? DL.getTypeAllocSize(FArg.getParamByValType())
+ : DL.getTypeAllocSize(FArg.getType());
if (A == &FArg) {
bool Overflow = ArgOffset + Size > kParamTLSSize;
- if (FArgEagerCheck) {
- *ShadowPtr = getCleanShadow(V);
- setOrigin(A, getCleanOrigin());
- break;
- } else if (FArgByVal) {
- Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ if (FArg.hasByValAttr()) {
// ByVal pointer itself has clean shadow. We copy the actual
// argument shadow to the underlying memory.
// Figure out maximal valid memcpy alignment.
@@ -1727,40 +1723,38 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/*isStore*/ true)
.first;
// TODO(glider): need to copy origins.
- if (Overflow) {
+ if (!PropagateShadow || Overflow) {
// ParamTLS overflow.
EntryIRB.CreateMemSet(
CpShadowPtr, Constant::getNullValue(EntryIRB.getInt8Ty()),
Size, ArgAlign);
} else {
+ Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
const Align CopyAlign = std::min(ArgAlign, kShadowTLSAlignment);
Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base,
CopyAlign, Size);
LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
(void)Cpy;
}
+ }
+
+ if (!PropagateShadow || Overflow || FArg.hasByValAttr() ||
+ (MS.EagerChecks && FArg.hasAttribute(Attribute::NoUndef))) {
*ShadowPtr = getCleanShadow(V);
+ setOrigin(A, getCleanOrigin());
} else {
// Shadow over TLS
Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
- if (Overflow) {
- // ParamTLS overflow.
- *ShadowPtr = getCleanShadow(V);
- } else {
- *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base,
- kShadowTLSAlignment);
+ *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base,
+ kShadowTLSAlignment);
+ if (MS.TrackOrigins) {
+ Value *OriginPtr =
+ getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr));
}
}
LLVM_DEBUG(dbgs()
<< " ARG: " << FArg << " ==> " << **ShadowPtr << "\n");
- if (MS.TrackOrigins && !Overflow) {
- Value *OriginPtr =
- getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
- setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr));
- } else {
- setOrigin(A, getCleanOrigin());
- }
-
break;
}
@@ -3664,7 +3658,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// will become a non-readonly function after it is instrumented by us. To
// prevent this code from being optimized out, mark that function
// non-readonly in advance.
- AttrBuilder B;
+ AttributeMask B;
B.addAttribute(Attribute::ReadOnly)
.addAttribute(Attribute::ReadNone)
.addAttribute(Attribute::WriteOnly)
@@ -3679,7 +3673,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
}
IRBuilder<> IRB(&CB);
- bool MayCheckCall = ClEagerChecks;
+ bool MayCheckCall = MS.EagerChecks;
if (Function *Func = CB.getCalledFunction()) {
// __sanitizer_unaligned_{load,store} functions may be called by users
// and always expects shadows in the TLS. So don't check them.
@@ -3697,15 +3691,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
continue;
}
unsigned Size = 0;
- Value *Store = nullptr;
- // Compute the Shadow for arg even if it is ByVal, because
- // in that case getShadow() will copy the actual arg shadow to
- // __msan_param_tls.
- Value *ArgShadow = getShadow(A);
- Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
- LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A
- << " Shadow: " << *ArgShadow << "\n");
- bool ArgIsInitialized = false;
const DataLayout &DL = F.getParent()->getDataLayout();
bool ByVal = CB.paramHasAttr(i, Attribute::ByVal);
@@ -3716,6 +3701,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
insertShadowCheck(A, &CB);
Size = DL.getTypeAllocSize(A->getType());
} else {
+ bool ArgIsInitialized = false;
+ Value *Store = nullptr;
+ // Compute the Shadow for arg even if it is ByVal, because
+ // in that case getShadow() will copy the actual arg shadow to
+ // __msan_param_tls.
+ Value *ArgShadow = getShadow(A);
+ Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
+ LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A
+ << " Shadow: " << *ArgShadow << "\n");
if (ByVal) {
// ByVal requires some special handling as it's too big for a single
// load
@@ -3732,10 +3726,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment,
/*isStore*/ false)
.first;
-
- Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr,
- Alignment, Size);
- // TODO(glider): need to copy origins.
+ if (!PropagateShadow) {
+ Store = IRB.CreateMemSet(ArgShadowBase,
+ Constant::getNullValue(IRB.getInt8Ty()),
+ Size, Alignment);
+ } else {
+ Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr,
+ Alignment, Size);
+ }
} else {
// Any other parameters mean we need bit-grained tracking of uninit
// data
@@ -3832,10 +3830,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
bool HasNoUndef =
F.hasRetAttribute(Attribute::NoUndef);
- bool StoreShadow = !(ClEagerChecks && HasNoUndef);
+ bool StoreShadow = !(MS.EagerChecks && HasNoUndef);
// FIXME: Consider using SpecialCaseList to specify a list of functions that
// must always return fully initialized values. For now, we hardcode "main".
- bool EagerCheck = (ClEagerChecks && HasNoUndef) || (F.getName() == "main");
+ bool EagerCheck = (MS.EagerChecks && HasNoUndef) || (F.getName() == "main");
Value *Shadow = getShadow(RetVal);
bool StoreOrigin = true;
@@ -5359,7 +5357,7 @@ bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) {
MemorySanitizerVisitor Visitor(F, *this, TLI);
// Clear out readonly/readnone attributes.
- AttrBuilder B;
+ AttributeMask B;
B.addAttribute(Attribute::ReadOnly)
.addAttribute(Attribute::ReadNone)
.addAttribute(Attribute::WriteOnly)
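With EagerChecks carried in MemorySanitizerOptions instead of being read from the cl::opt at each use, eager checking can be requested programmatically when the pass is built. A hedged usage sketch, following the four-argument constructor shown above:

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
    using namespace llvm;

    void addMSanWithEagerChecks(FunctionPassManager &FPM) {
      // Argument order per the constructor in this patch:
      // (TrackOrigins, Recover, Kernel, EagerChecks).
      MemorySanitizerOptions Opts(/*TrackOrigins=*/1, /*Recover=*/false,
                                  /*Kernel=*/false, /*EagerChecks=*/true);
      FPM.addPass(MemorySanitizerPass(Opts));
    }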
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index b6ba1fc2132c..c46415e5b1f4 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -877,7 +877,10 @@ populateEHOperandBundle(VPCandidateInfo &Cand,
DenseMap<BasicBlock *, ColorVector> &BlockColors,
SmallVectorImpl<OperandBundleDef> &OpBundles) {
auto *OrigCall = dyn_cast<CallBase>(Cand.AnnotatedInst);
- if (OrigCall && !isa<IntrinsicInst>(OrigCall)) {
+ if (!OrigCall)
+ return;
+
+ if (!isa<IntrinsicInst>(OrigCall)) {
// The instrumentation call should belong to the same funclet as a
// non-intrinsic call, so just copy the operand bundle, if any exists.
Optional<OperandBundleUse> ParentFunclet =
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index da8ee1f15bf8..d3b60c7add34 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -917,8 +917,7 @@ void ModuleSanitizerCoverage::InjectTraceForGep(
void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores(
Function &, ArrayRef<LoadInst *> Loads, ArrayRef<StoreInst *> Stores) {
- auto CallbackIdx = [&](const Value *Ptr) -> int {
- auto ElementTy = cast<PointerType>(Ptr->getType())->getElementType();
+ auto CallbackIdx = [&](Type *ElementTy) -> int {
uint64_t TypeSize = DL->getTypeStoreSizeInBits(ElementTy);
return TypeSize == 8 ? 0
: TypeSize == 16 ? 1
@@ -932,7 +931,7 @@ void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores(
for (auto LI : Loads) {
IRBuilder<> IRB(LI);
auto Ptr = LI->getPointerOperand();
- int Idx = CallbackIdx(Ptr);
+ int Idx = CallbackIdx(LI->getType());
if (Idx < 0)
continue;
IRB.CreateCall(SanCovLoadFunction[Idx],
@@ -941,7 +940,7 @@ void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores(
for (auto SI : Stores) {
IRBuilder<> IRB(SI);
auto Ptr = SI->getPointerOperand();
- int Idx = CallbackIdx(Ptr);
+ int Idx = CallbackIdx(SI->getValueOperand()->getType());
if (Idx < 0)
continue;
IRB.CreateCall(SanCovStoreFunction[Idx],
diff --git a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index 764dc5f92707..c11691c613ac 100644
--- a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -42,7 +42,7 @@ enum class ARCRuntimeEntryPointKind {
Autorelease,
StoreStrong,
RetainRV,
- ClaimRV,
+ UnsafeClaimRV,
RetainAutorelease,
RetainAutoreleaseRV,
};
@@ -62,7 +62,7 @@ public:
Autorelease = nullptr;
StoreStrong = nullptr;
RetainRV = nullptr;
- ClaimRV = nullptr;
+ UnsafeClaimRV = nullptr;
RetainAutorelease = nullptr;
RetainAutoreleaseRV = nullptr;
}
@@ -87,9 +87,9 @@ public:
case ARCRuntimeEntryPointKind::RetainRV:
return getIntrinsicEntryPoint(RetainRV,
Intrinsic::objc_retainAutoreleasedReturnValue);
- case ARCRuntimeEntryPointKind::ClaimRV:
+ case ARCRuntimeEntryPointKind::UnsafeClaimRV:
return getIntrinsicEntryPoint(
- ClaimRV, Intrinsic::objc_unsafeClaimAutoreleasedReturnValue);
+ UnsafeClaimRV, Intrinsic::objc_unsafeClaimAutoreleasedReturnValue);
case ARCRuntimeEntryPointKind::RetainAutorelease:
return getIntrinsicEntryPoint(RetainAutorelease,
Intrinsic::objc_retainAutorelease);
@@ -127,7 +127,7 @@ private:
Function *RetainRV = nullptr;
/// Declaration for objc_unsafeClaimAutoreleasedReturnValue().
- Function *ClaimRV = nullptr;
+ Function *UnsafeClaimRV = nullptr;
/// Declaration for objc_retainAutorelease().
Function *RetainAutorelease = nullptr;
diff --git a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index 4921209f041b..de0f5803b4c7 100644
--- a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -194,9 +194,6 @@ llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
return CanInterruptRV(Class);
}
}
-
- case RetainRVDep:
- return CanInterruptRV(GetBasicARCInstKind(Inst));
}
llvm_unreachable("Invalid dependence flavor");
diff --git a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
index cf4c05ebe91c..dd6a1c3f9795 100644
--- a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
+++ b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.h
@@ -46,8 +46,7 @@ enum DependenceKind {
AutoreleasePoolBoundary,
CanChangeRetainCount,
RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease.
- RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue.
- RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue.
+ RetainAutoreleaseRVDep ///< Blocks objc_retainAutoreleaseReturnValue.
};
/// Find dependent instructions. If there is exactly one dependent instruction,
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index c2ed94e8e1f6..9e2832827686 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -433,7 +433,7 @@ bool ObjCARCContract::tryToPeepholeInstruction(
// If we succeed in our optimization, fall through.
LLVM_FALLTHROUGH;
case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV: {
+ case ARCInstKind::UnsafeClaimRV: {
bool IsInstContainedInBundle = BundledInsts->contains(Inst);
// Return now if the target doesn't need a special inline-asm marker. Return
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 0fa4904456cd..b6dc97f1e43f 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -515,7 +515,7 @@ class ObjCARCOpt {
Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
Instruction *Inst, ARCInstKind Class, const Value *Arg);
- /// Try to optimize an AutoreleaseRV with a RetainRV or ClaimRV. If the
+ /// Try to optimize an AutoreleaseRV with a RetainRV or UnsafeClaimRV. If the
/// optimization occurs, returns true to indicate that the caller should
/// assume the instructions are dead.
bool OptimizeInlinedAutoreleaseRVCall(
@@ -705,14 +705,14 @@ bool ObjCARCOpt::OptimizeInlinedAutoreleaseRVCall(
return true;
}
- // ClaimRV is a frontend peephole for RetainRV + Release. Since the
- // AutoreleaseRV and RetainRV cancel out, replace the ClaimRV with a Release.
- assert(Class == ARCInstKind::ClaimRV);
+ // UnsafeClaimRV is a frontend peephole for RetainRV + Release. Since the
+ // AutoreleaseRV and RetainRV cancel out, replace UnsafeClaimRV with Release.
+ assert(Class == ARCInstKind::UnsafeClaimRV);
Value *CallArg = cast<CallInst>(Inst)->getArgOperand(0);
CallInst *Release = CallInst::Create(
EP.get(ARCRuntimeEntryPointKind::Release), CallArg, "", Inst);
- assert(IsAlwaysTail(ARCInstKind::ClaimRV) &&
- "Expected ClaimRV to be safe to tail call");
+ assert(IsAlwaysTail(ARCInstKind::UnsafeClaimRV) &&
+ "Expected UnsafeClaimRV to be safe to tail call");
Release->setTailCall();
Inst->replaceAllUsesWith(CallArg);
EraseInstruction(Inst);
@@ -810,7 +810,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
BlockColors = colorEHFunclets(F);
// Store any delayed AutoreleaseRV intrinsics, so they can be easily paired
- // with RetainRV and ClaimRV.
+ // with RetainRV and UnsafeClaimRV.
Instruction *DelayedAutoreleaseRV = nullptr;
const Value *DelayedAutoreleaseRVArg = nullptr;
auto setDelayedAutoreleaseRV = [&](Instruction *AutoreleaseRV) {
@@ -837,7 +837,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
return false;
// Given the frontend rules for emitting AutoreleaseRV, RetainRV, and
- // ClaimRV, it's probably safe to skip over even opaque function calls
+ // UnsafeClaimRV, it's probably safe to skip over even opaque function calls
// here since OptimizeInlinedAutoreleaseRVCall will confirm that they
// have the same RCIdentityRoot. However, what really matters is
// skipping instructions or intrinsics that the inliner could leave behind;
@@ -881,7 +881,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
setDelayedAutoreleaseRV(Inst);
continue;
case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
if (DelayedAutoreleaseRV) {
// We have a potential RV pair. Check if they cancel out.
if (OptimizeInlinedAutoreleaseRVCall(F, BlockColors, Inst, Arg, Class,
@@ -979,9 +979,8 @@ void ObjCARCOpt::OptimizeIndividualCallImpl(
CallInst *CI = cast<CallInst>(Inst);
if (IsNullOrUndef(CI->getArgOperand(0))) {
Changed = true;
- Type *Ty = CI->getArgOperand(0)->getType();
- new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
- Constant::getNullValue(Ty), CI);
+ new StoreInst(ConstantInt::getTrue(CI->getContext()),
+ UndefValue::get(Type::getInt1PtrTy(CI->getContext())), CI);
Value *NewValue = UndefValue::get(CI->getType());
LLVM_DEBUG(
dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
@@ -999,9 +998,8 @@ void ObjCARCOpt::OptimizeIndividualCallImpl(
if (IsNullOrUndef(CI->getArgOperand(0)) ||
IsNullOrUndef(CI->getArgOperand(1))) {
Changed = true;
- Type *Ty = CI->getArgOperand(0)->getType();
- new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
- Constant::getNullValue(Ty), CI);
+ new StoreInst(ConstantInt::getTrue(CI->getContext()),
+ UndefValue::get(Type::getInt1PtrTy(CI->getContext())), CI);
Value *NewValue = UndefValue::get(CI->getType());
LLVM_DEBUG(
@@ -1165,7 +1163,7 @@ void ObjCARCOpt::OptimizeIndividualCallImpl(
DepInst = findSingleDependency(AutoreleasePoolBoundary, Arg,
Inst->getParent(), Inst, PA);
break;
- case ARCInstKind::ClaimRV:
+ case ARCInstKind::UnsafeClaimRV:
case ARCInstKind::RetainRV:
case ARCInstKind::AutoreleaseRV:
// Don't move these; the RV optimization depends on the autoreleaseRV
diff --git a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
index 6d0a67c91cfa..1624cf26094a 100644
--- a/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
+++ b/llvm/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -32,7 +32,6 @@
namespace llvm {
class AAResults;
-class DataLayout;
class PHINode;
class SelectInst;
class Value;
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index b693acceb3f6..1cda206a7e14 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -579,6 +579,7 @@ bool AggressiveDeadCodeElimination::updateDeadRegions() {
// Don't compute the post ordering unless we needed it.
bool HavePostOrder = false;
bool Changed = false;
+ SmallVector<DominatorTree::UpdateType, 10> DeletedEdges;
for (auto *BB : BlocksWithDeadTerminators) {
auto &Info = BlockInfo[BB];
@@ -617,7 +618,6 @@ bool AggressiveDeadCodeElimination::updateDeadRegions() {
makeUnconditional(BB, PreferredSucc->BB);
// Inform the dominators about the deleted CFG edges.
- SmallVector<DominatorTree::UpdateType, 4> DeletedEdges;
for (auto *Succ : RemovedSuccessors) {
// It might have happened that the same successor appeared multiple times
// and the CFG edge wasn't really removed.
@@ -629,13 +629,14 @@ bool AggressiveDeadCodeElimination::updateDeadRegions() {
}
}
- DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager)
- .applyUpdates(DeletedEdges);
-
NumBranchesRemoved += 1;
Changed = true;
}
+ if (!DeletedEdges.empty())
+ DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager)
+ .applyUpdates(DeletedEdges);
+
return Changed;
}
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 37a7053d778e..25e8c3ef3b48 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -414,6 +414,14 @@ void ConstantHoistingPass::collectConstantCandidates(
IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace());
APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true);
auto *GEPO = cast<GEPOperator>(ConstExpr);
+
+ // TODO: If we have a mix of inbounds and non-inbounds GEPs, then basing a
+ // non-inbounds GEP on an inbounds GEP is potentially incorrect. Restrict to
+ // inbounds GEP for now -- alternatively, we could drop inbounds from the
+ // constant expression.
+ if (!GEPO->isInBounds())
+ return;
+
if (!GEPO->accumulateConstantOffset(*DL, Offset))
return;
@@ -470,7 +478,7 @@ void ConstantHoistingPass::collectConstantCandidates(
// Visit constant expressions that have constant integers.
if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
// Handle constant gep expressions.
- if (ConstHoistGEP && ConstExpr->isGEPWithNoNotionalOverIndexing())
+ if (ConstHoistGEP && isa<GEPOperator>(ConstExpr))
collectConstantCandidates(ConstCandMap, Inst, Idx, ConstExpr);
// Only visit constant cast expressions.
@@ -810,7 +818,7 @@ void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
// Visit constant expression.
if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
- if (ConstExpr->isGEPWithNoNotionalOverIndexing()) {
+ if (isa<GEPOperator>(ConstExpr)) {
// Operand is a ConstantGEP, replace it.
updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat);
return;
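The ConstantHoisting change replaces the old isGEPWithNoNotionalOverIndexing() predicate with a plain GEPOperator check plus an explicit inbounds restriction, and still requires the offset to fold to a constant. A simplified sketch of that combined test; the index width used here is an assumption, whereas the pass itself uses the pointer's intptr type:

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/Operator.h"
    using namespace llvm;

    static bool isHoistableConstGEP(ConstantExpr *CE, const DataLayout &DL) {
      auto *GEPO = dyn_cast<GEPOperator>(CE);
      if (!GEPO || !GEPO->isInBounds())
        return false; // mixing inbounds and non-inbounds bases is not handled
      APInt Offset(DL.getIndexTypeSizeInBits(GEPO->getType()), 0);
      return GEPO->accumulateConstantOffset(DL, Offset);
    }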
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 7f2d5d7d9987..13963657d183 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -43,6 +43,51 @@ DEBUG_COUNTER(EliminatedCounter, "conds-eliminated",
static int64_t MaxConstraintValue = std::numeric_limits<int64_t>::max();
+namespace {
+struct ConstraintTy {
+ SmallVector<int64_t, 8> Coefficients;
+
+ ConstraintTy(SmallVector<int64_t, 8> Coefficients)
+ : Coefficients(Coefficients) {}
+
+ unsigned size() const { return Coefficients.size(); }
+};
+
+/// Struct to manage a list of constraints.
+struct ConstraintListTy {
+ SmallVector<ConstraintTy, 4> Constraints;
+
+ ConstraintListTy() {}
+
+ ConstraintListTy(const SmallVector<ConstraintTy, 4> &Constraints)
+ : Constraints(Constraints) {}
+
+ void mergeIn(const ConstraintListTy &Other) {
+ append_range(Constraints, Other.Constraints);
+ }
+
+ unsigned size() const { return Constraints.size(); }
+
+ unsigned empty() const { return Constraints.empty(); }
+
+ /// Returns true if any constraint has a non-zero coefficient for any of the
+ /// newly added indices. Zero coefficients for new indices are removed. If it
+ /// returns false, no new variables need to be added to the system.
+ bool needsNewIndices(const DenseMap<Value *, unsigned> &NewIndices) {
+ assert(size() == 1);
+ for (unsigned I = 0; I < NewIndices.size(); ++I) {
+ int64_t Last = get(0).Coefficients.pop_back_val();
+ if (Last != 0)
+ return true;
+ }
+ return false;
+ }
+
+ ConstraintTy &get(unsigned I) { return Constraints[I]; }
+};
+
+} // namespace
+
// Decomposes \p V into a vector of pairs of the form { c, X } where c * X. The
// sum of the pairs equals \p V. The first pair is the constant-factor and X
// must be nullptr. If the expression cannot be decomposed, returns an empty
@@ -108,24 +153,15 @@ static SmallVector<std::pair<int64_t, Value *>, 4> decompose(Value *V) {
if (match(V, m_NUWSub(m_Value(Op0), m_ConstantInt(CI))))
return {{-1 * CI->getSExtValue(), nullptr}, {1, Op0}};
if (match(V, m_NUWSub(m_Value(Op0), m_Value(Op1))))
- return {{0, nullptr}, {1, Op0}, {1, Op1}};
+ return {{0, nullptr}, {1, Op0}, {-1, Op1}};
return {{0, nullptr}, {1, V}};
}
-struct ConstraintTy {
- SmallVector<int64_t, 8> Coefficients;
-
- ConstraintTy(SmallVector<int64_t, 8> Coefficients)
- : Coefficients(Coefficients) {}
-
- unsigned size() const { return Coefficients.size(); }
-};
-
/// Turn a condition \p CmpI into a vector of constraints, using indices from \p
/// Value2Index. Additional indices for newly discovered values are added to \p
/// NewIndices.
-static SmallVector<ConstraintTy, 4>
+static ConstraintListTy
getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
const DenseMap<Value *, unsigned> &Value2Index,
DenseMap<Value *, unsigned> &NewIndices) {
@@ -151,11 +187,15 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
Value2Index, NewIndices);
if (Pred == CmpInst::ICMP_EQ) {
+ if (match(Op1, m_Zero()))
+ return getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index,
+ NewIndices);
+
auto A =
getConstraint(CmpInst::ICMP_UGE, Op0, Op1, Value2Index, NewIndices);
auto B =
getConstraint(CmpInst::ICMP_ULE, Op0, Op1, Value2Index, NewIndices);
- append_range(A, B);
+ A.mergeIn(B);
return A;
}
@@ -200,10 +240,10 @@ getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
R[GetOrAddIndex(KV.second)] -= KV.first;
R[0] = Offset1 + Offset2 + (Pred == CmpInst::ICMP_ULT ? -1 : 0);
- return {R};
+ return {{R}};
}
-static SmallVector<ConstraintTy, 4>
+static ConstraintListTy
getConstraint(CmpInst *Cmp, const DenseMap<Value *, unsigned> &Value2Index,
DenseMap<Value *, unsigned> &NewIndices) {
return getConstraint(Cmp->getPredicate(), Cmp->getOperand(0),
@@ -397,21 +437,10 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) {
if (R.size() != 1)
continue;
- // Check if all coefficients of new indices are 0 after building the
- // constraint. Skip if any of the new indices has a non-null
- // coefficient.
- bool HasNewIndex = false;
- for (unsigned I = 0; I < NewIndices.size(); ++I) {
- int64_t Last = R[0].Coefficients.pop_back_val();
- if (Last != 0) {
- HasNewIndex = true;
- break;
- }
- }
- if (HasNewIndex || R[0].size() == 1)
+ if (R.needsNewIndices(NewIndices))
continue;
- if (CS.isConditionImplied(R[0].Coefficients)) {
+ if (CS.isConditionImplied(R.get(0).Coefficients)) {
if (!DebugCounter::shouldExecute(EliminatedCounter))
continue;
@@ -432,7 +461,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) {
Changed = true;
}
if (CS.isConditionImplied(
- ConstraintSystem::negate(R[0].Coefficients))) {
+ ConstraintSystem::negate(R.get(0).Coefficients))) {
if (!DebugCounter::shouldExecute(EliminatedCounter))
continue;
@@ -479,7 +508,7 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT) {
LLVM_DEBUG(dbgs() << "Adding " << *CB.Condition << " " << CB.Not << "\n");
bool Added = false;
- for (auto &C : R) {
+ for (auto &C : R.Constraints) {
auto Coeffs = C.Coefficients;
LLVM_DEBUG({
dbgs() << " constraint: ";
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index eadbb4293539..ae636e7b61f7 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -699,17 +699,14 @@ bool isNoopIntrinsic(Instruction *I) {
}
// Check if we can ignore \p D for DSE.
-bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller,
- const TargetLibraryInfo &TLI) {
+bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
Instruction *DI = D->getMemoryInst();
// Calls that only access inaccessible memory cannot read or write any memory
// locations we consider for elimination.
if (auto *CB = dyn_cast<CallBase>(DI))
- if (CB->onlyAccessesInaccessibleMemory()) {
- if (isAllocLikeFn(DI, &TLI))
- return false;
+ if (CB->onlyAccessesInaccessibleMemory())
return true;
- }
+
// We can eliminate stores to locations not visible to the caller across
// throwing instructions.
if (DI->mayThrow() && !DefVisibleToCaller)
@@ -759,10 +756,8 @@ struct DSEState {
SmallVector<MemoryDef *, 64> MemDefs;
// Any that should be skipped as they are already deleted
SmallPtrSet<MemoryAccess *, 4> SkipStores;
- // Keep track of all of the objects that are invisible to the caller before
- // the function returns.
- // SmallPtrSet<const Value *, 16> InvisibleToCallerBeforeRet;
- DenseMap<const Value *, bool> InvisibleToCallerBeforeRet;
+ // Keep track whether a given object is captured before return or not.
+ DenseMap<const Value *, bool> CapturedBeforeReturn;
// Keep track of all of the objects that are invisible to the caller after
// the function returns.
DenseMap<const Value *, bool> InvisibleToCallerAfterRet;
@@ -805,12 +800,8 @@ struct DSEState {
// Treat byval or inalloca arguments the same as Allocas, stores to them are
// dead at the end of the function.
for (Argument &AI : F.args())
- if (AI.hasPassPointeeByValueCopyAttr()) {
- // For byval, the caller doesn't know the address of the allocation.
- if (AI.hasByValAttr())
- InvisibleToCallerBeforeRet.insert({&AI, true});
+ if (AI.hasPassPointeeByValueCopyAttr())
InvisibleToCallerAfterRet.insert({&AI, true});
- }
// Collect whether there is any irreducible control flow in the function.
ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI);
@@ -835,6 +826,20 @@ struct DSEState {
if (!isGuaranteedLoopIndependent(DeadI, KillingI, DeadLoc))
return OW_Unknown;
+ const Value *DeadPtr = DeadLoc.Ptr->stripPointerCasts();
+ const Value *KillingPtr = KillingLoc.Ptr->stripPointerCasts();
+ const Value *DeadUndObj = getUnderlyingObject(DeadPtr);
+ const Value *KillingUndObj = getUnderlyingObject(KillingPtr);
+
+ // Check whether the killing store overwrites the whole object, in which
+ // case the size/offset of the dead store does not matter.
+ if (DeadUndObj == KillingUndObj && KillingLoc.Size.isPrecise()) {
+ uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F);
+ if (KillingUndObjSize != MemoryLocation::UnknownSize &&
+ KillingUndObjSize == KillingLoc.Size.getValue())
+ return OW_Complete;
+ }
+
// FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
// get imprecise values here, though (except for unknown sizes).
if (!KillingLoc.Size.isPrecise() || !DeadLoc.Size.isPrecise()) {
@@ -875,14 +880,6 @@ struct DSEState {
return OW_Complete;
}
- // Check to see if the killing store is to the entire object (either a
- // global, an alloca, or a byval/inalloca argument). If so, then it clearly
- // overwrites any other store to the same object.
- const Value *DeadPtr = DeadLoc.Ptr->stripPointerCasts();
- const Value *KillingPtr = KillingLoc.Ptr->stripPointerCasts();
- const Value *DeadUndObj = getUnderlyingObject(DeadPtr);
- const Value *KillingUndObj = getUnderlyingObject(KillingPtr);
-
// If we can't resolve the same pointers to the same object, then we can't
// analyze them at all.
if (DeadUndObj != KillingUndObj) {
@@ -896,12 +893,6 @@ struct DSEState {
return OW_Unknown;
}
- // If the KillingI store is to a recognizable object, get its size.
- uint64_t KillingUndObjSize = getPointerSize(KillingUndObj, DL, TLI, &F);
- if (KillingUndObjSize != MemoryLocation::UnknownSize)
- if (KillingUndObjSize == KillingSize && KillingUndObjSize >= DeadSize)
- return OW_Complete;
-
// Okay, we have stores to two completely different pointers. Try to
// decompose the pointer into a "base + constant_offset" form. If the base
// pointers are equal, then we can reason about the two stores.
@@ -957,31 +948,30 @@ struct DSEState {
return true;
auto I = InvisibleToCallerAfterRet.insert({V, false});
if (I.second) {
- if (!isInvisibleToCallerBeforeRet(V)) {
+ if (!isInvisibleToCallerOnUnwind(V)) {
I.first->second = false;
- } else {
- auto *Inst = dyn_cast<Instruction>(V);
- if (Inst && isAllocLikeFn(Inst, &TLI))
- I.first->second = !PointerMayBeCaptured(V, true, false);
+ } else if (isNoAliasCall(V)) {
+ I.first->second = !PointerMayBeCaptured(V, true, false);
}
}
return I.first->second;
}
- bool isInvisibleToCallerBeforeRet(const Value *V) {
- if (isa<AllocaInst>(V))
+ bool isInvisibleToCallerOnUnwind(const Value *V) {
+ bool RequiresNoCaptureBeforeUnwind;
+ if (!isNotVisibleOnUnwind(V, RequiresNoCaptureBeforeUnwind))
+ return false;
+ if (!RequiresNoCaptureBeforeUnwind)
return true;
- auto I = InvisibleToCallerBeforeRet.insert({V, false});
- if (I.second) {
- auto *Inst = dyn_cast<Instruction>(V);
- if (Inst && isAllocLikeFn(Inst, &TLI))
- // NOTE: This could be made more precise by PointerMayBeCapturedBefore
- // with the killing MemoryDef. But we refrain from doing so for now to
- // limit compile-time and this does not cause any changes to the number
- // of stores removed on a large test set in practice.
- I.first->second = !PointerMayBeCaptured(V, false, true);
- }
- return I.first->second;
+
+ auto I = CapturedBeforeReturn.insert({V, true});
+ if (I.second)
+ // NOTE: This could be made more precise by PointerMayBeCapturedBefore
+ // with the killing MemoryDef. But we refrain from doing so for now to
+ // limit compile-time and this does not cause any changes to the number
+ // of stores removed on a large test set in practice.
+ I.first->second = PointerMayBeCaptured(V, false, true);
+ return !I.first->second;
}
Optional<MemoryLocation> getLocForWrite(Instruction *I) const {
@@ -1269,8 +1259,7 @@ struct DSEState {
MemoryDef *CurrentDef = cast<MemoryDef>(Current);
Instruction *CurrentI = CurrentDef->getMemoryInst();
- if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(KillingUndObj),
- TLI)) {
+ if (canSkipDef(CurrentDef, !isInvisibleToCallerOnUnwind(KillingUndObj))) {
CanOptimize = false;
continue;
}
@@ -1442,7 +1431,7 @@ struct DSEState {
continue;
}
- if (UseInst->mayThrow() && !isInvisibleToCallerBeforeRet(KillingUndObj)) {
+ if (UseInst->mayThrow() && !isInvisibleToCallerOnUnwind(KillingUndObj)) {
LLVM_DEBUG(dbgs() << " ... found throwing instruction\n");
return None;
}
@@ -1623,7 +1612,7 @@ struct DSEState {
// First see if we can ignore it by using the fact that KillingI is an
// alloca/alloca like object that is not visible to the caller during
// execution of the function.
- if (KillingUndObj && isInvisibleToCallerBeforeRet(KillingUndObj))
+ if (KillingUndObj && isInvisibleToCallerOnUnwind(KillingUndObj))
return false;
if (KillingI->getParent() == DeadI->getParent())
@@ -1639,7 +1628,7 @@ struct DSEState {
bool isDSEBarrier(const Value *KillingUndObj, Instruction *DeadI) {
// If DeadI may throw it acts as a barrier, unless we are to an
// alloca/alloca like object that does not escape.
- if (DeadI->mayThrow() && !isInvisibleToCallerBeforeRet(KillingUndObj))
+ if (DeadI->mayThrow() && !isInvisibleToCallerOnUnwind(KillingUndObj))
return true;
// If DeadI is an atomic load/store stronger than monotonic, do not try to
@@ -1696,6 +1685,84 @@ struct DSEState {
return MadeChange;
}
+ /// If we have a zero-initializing memset following a call to malloc,
+ /// try folding it into a call to calloc.
+ bool tryFoldIntoCalloc(MemoryDef *Def, const Value *DefUO) {
+ Instruction *DefI = Def->getMemoryInst();
+ MemSetInst *MemSet = dyn_cast<MemSetInst>(DefI);
+ if (!MemSet)
+ // TODO: Could handle zero store to small allocation as well.
+ return false;
+ Constant *StoredConstant = dyn_cast<Constant>(MemSet->getValue());
+ if (!StoredConstant || !StoredConstant->isNullValue())
+ return false;
+
+ if (!isRemovable(DefI))
+ // The memset might be volatile.
+ return false;
+
+ if (F.hasFnAttribute(Attribute::SanitizeMemory) ||
+ F.hasFnAttribute(Attribute::SanitizeAddress) ||
+ F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
+ F.getName() == "calloc")
+ return false;
+ auto *Malloc = const_cast<CallInst *>(dyn_cast<CallInst>(DefUO));
+ if (!Malloc)
+ return false;
+ auto *InnerCallee = Malloc->getCalledFunction();
+ if (!InnerCallee)
+ return false;
+ LibFunc Func;
+ if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
+ Func != LibFunc_malloc)
+ return false;
+
+ auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) {
+ // Check for a br(icmp(ptr, null), truebb, falsebb) pattern at the end
+ // of the malloc block.
+ auto *MallocBB = Malloc->getParent(),
+ *MemsetBB = Memset->getParent();
+ if (MallocBB == MemsetBB)
+ return true;
+ auto *Ptr = Memset->getArgOperand(0);
+ auto *TI = MallocBB->getTerminator();
+ ICmpInst::Predicate Pred;
+ BasicBlock *TrueBB, *FalseBB;
+ if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB,
+ FalseBB)))
+ return false;
+ if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB)
+ return false;
+ return true;
+ };
+
+ if (Malloc->getOperand(0) != MemSet->getLength())
+ return false;
+ if (!shouldCreateCalloc(Malloc, MemSet) ||
+ !DT.dominates(Malloc, MemSet) ||
+ !memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT))
+ return false;
+ IRBuilder<> IRB(Malloc);
+ const auto &DL = Malloc->getModule()->getDataLayout();
+ auto *Calloc =
+ emitCalloc(ConstantInt::get(IRB.getIntPtrTy(DL), 1),
+ Malloc->getArgOperand(0), IRB, TLI);
+ if (!Calloc)
+ return false;
+ MemorySSAUpdater Updater(&MSSA);
+ auto *LastDef =
+ cast<MemoryDef>(Updater.getMemorySSA()->getMemoryAccess(Malloc));
+ auto *NewAccess =
+ Updater.createMemoryAccessAfter(cast<Instruction>(Calloc), LastDef,
+ LastDef);
+ auto *NewAccessMD = cast<MemoryDef>(NewAccess);
+ Updater.insertDef(NewAccessMD, /*RenameUses=*/true);
+ Updater.removeMemoryAccess(Malloc);
+ Malloc->replaceAllUsesWith(Calloc);
+ Malloc->eraseFromParent();
+ return true;
+ }
+
/// \returns true if \p Def is a no-op store, either because it
/// directly stores back a loaded value or stores zero to a calloced object.
bool storeIsNoop(MemoryDef *Def, const Value *DefUO) {
@@ -1713,81 +1780,15 @@ struct DSEState {
if (!isRemovable(DefI))
return false;
- if (StoredConstant && StoredConstant->isNullValue()) {
- auto *DefUOInst = dyn_cast<Instruction>(DefUO);
- if (DefUOInst) {
- if (isCallocLikeFn(DefUOInst, &TLI)) {
- auto *UnderlyingDef =
- cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst));
- // If UnderlyingDef is the clobbering access of Def, no instructions
- // between them can modify the memory location.
- auto *ClobberDef =
- MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def);
- return UnderlyingDef == ClobberDef;
- }
-
- if (MemSet) {
- if (F.hasFnAttribute(Attribute::SanitizeMemory) ||
- F.hasFnAttribute(Attribute::SanitizeAddress) ||
- F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
- F.getName() == "calloc")
- return false;
- auto *Malloc = const_cast<CallInst *>(dyn_cast<CallInst>(DefUOInst));
- if (!Malloc)
- return false;
- auto *InnerCallee = Malloc->getCalledFunction();
- if (!InnerCallee)
- return false;
- LibFunc Func;
- if (!TLI.getLibFunc(*InnerCallee, Func) || !TLI.has(Func) ||
- Func != LibFunc_malloc)
- return false;
-
- auto shouldCreateCalloc = [](CallInst *Malloc, CallInst *Memset) {
- // Check for br(icmp ptr, null), truebb, falsebb) pattern at the end
- // of malloc block
- auto *MallocBB = Malloc->getParent(),
- *MemsetBB = Memset->getParent();
- if (MallocBB == MemsetBB)
- return true;
- auto *Ptr = Memset->getArgOperand(0);
- auto *TI = MallocBB->getTerminator();
- ICmpInst::Predicate Pred;
- BasicBlock *TrueBB, *FalseBB;
- if (!match(TI, m_Br(m_ICmp(Pred, m_Specific(Ptr), m_Zero()), TrueBB,
- FalseBB)))
- return false;
- if (Pred != ICmpInst::ICMP_EQ || MemsetBB != FalseBB)
- return false;
- return true;
- };
-
- if (Malloc->getOperand(0) == MemSet->getLength()) {
- if (shouldCreateCalloc(Malloc, MemSet) &&
- DT.dominates(Malloc, MemSet) &&
- memoryIsNotModifiedBetween(Malloc, MemSet, BatchAA, DL, &DT)) {
- IRBuilder<> IRB(Malloc);
- const auto &DL = Malloc->getModule()->getDataLayout();
- if (auto *Calloc =
- emitCalloc(ConstantInt::get(IRB.getIntPtrTy(DL), 1),
- Malloc->getArgOperand(0), IRB, TLI)) {
- MemorySSAUpdater Updater(&MSSA);
- auto *LastDef = cast<MemoryDef>(
- Updater.getMemorySSA()->getMemoryAccess(Malloc));
- auto *NewAccess = Updater.createMemoryAccessAfter(
- cast<Instruction>(Calloc), LastDef, LastDef);
- auto *NewAccessMD = cast<MemoryDef>(NewAccess);
- Updater.insertDef(NewAccessMD, /*RenameUses=*/true);
- Updater.removeMemoryAccess(Malloc);
- Malloc->replaceAllUsesWith(Calloc);
- Malloc->eraseFromParent();
- return true;
- }
- return false;
- }
- }
- }
- }
+ if (StoredConstant && isAllocationFn(DefUO, &TLI)) {
+ auto *CB = cast<CallBase>(DefUO);
+ auto *InitC = getInitialValueOfAllocation(CB, &TLI,
+ StoredConstant->getType());
+      // If the clobbering access is LiveOnEntry, no instructions between the
+      // allocation and the store can modify the memory location.
+ if (InitC && InitC == StoredConstant)
+ return MSSA.isLiveOnEntryDef(
+ MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def));
}
if (!Store)
@@ -2074,6 +2075,15 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
MadeChange = true;
continue;
}
+
+ // Can we form a calloc from a memset/malloc pair?
+ if (!Shortend && State.tryFoldIntoCalloc(KillingDef, KillingUndObj)) {
+ LLVM_DEBUG(dbgs() << "DSE: Remove memset after forming calloc:\n"
+ << " DEAD: " << *KillingI << '\n');
+ State.deleteDeadInstruction(KillingI);
+ MadeChange = true;
+ continue;
+ }
}
if (EnablePartialOverwriteTracking)
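For context on the tryFoldIntoCalloc hunk above: it targets source patterns roughly like the sketch below, where a malloc whose entire result is zeroed (possibly behind a null check on the returned pointer, which shouldCreateCalloc accepts) is rewritten into a single calloc. This is an illustrative sketch, not code from the patch.

    #include <cstdlib>
    #include <cstring>

    // Before DSE: allocate, then zero the whole buffer. The memset length is
    // the same value as the malloc size, which the fold requires.
    int *make_zeroed(size_t n) {
      int *p = (int *)malloc(n * sizeof(int));
      if (p)
        memset(p, 0, n * sizeof(int));
      return p;
    }

    // After DSE: the zero-initialization is folded into the allocation.
    int *make_zeroed_folded(size_t n) {
      return (int *)calloc(1, n * sizeof(int));
    }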
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index a24997dd3fd4..59b934c16c8a 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -827,10 +827,13 @@ private:
const ParseMemoryInst &Later);
Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
+ // TODO: We could insert relevant casts on type mismatch here.
if (auto *LI = dyn_cast<LoadInst>(Inst))
- return LI;
- if (auto *SI = dyn_cast<StoreInst>(Inst))
- return SI->getValueOperand();
+ return LI->getType() == ExpectedType ? LI : nullptr;
+ else if (auto *SI = dyn_cast<StoreInst>(Inst)) {
+ Value *V = SI->getValueOperand();
+ return V->getType() == ExpectedType ? V : nullptr;
+ }
assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
auto *II = cast<IntrinsicInst>(Inst);
if (isHandledNonTargetIntrinsic(II->getIntrinsicID()))
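For context on the EarlyCSE change above: getOrCreateResult now returns nullptr instead of forwarding a value whose type differs from the type the later load expects (the TODO notes a cast could be inserted instead). An illustrative case where the stored bits match but the types do not:

    #include <cstdint>
    #include <cstring>

    // The earlier store writes a uint32_t and the later load reads a float at
    // the same address; reusing the stored i32 value directly would hand the
    // load a value of the wrong type, so EarlyCSE now bails on the mismatch.
    float bits_to_float(uint32_t bits) {
      float f;
      std::memcpy(&f, &bits, sizeof(f));
      return f;
    }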
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 00506fb86006..398c93e8758c 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -1104,20 +1104,19 @@ bool GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
}
assert(DepInfo.isDef() && "follows from above");
- // Loading the allocation -> undef.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
- isAlignedAllocLikeFn(DepInst, TLI) ||
- // Loading immediately after lifetime begin -> undef.
- isLifetimeStart(DepInst)) {
+ // Loading the alloca -> undef.
+ // Loading immediately after lifetime begin -> undef.
+ if (isa<AllocaInst>(DepInst) || isLifetimeStart(DepInst)) {
Res = AvailableValue::get(UndefValue::get(Load->getType()));
return true;
}
- // Loading from calloc (which zero initializes memory) -> zero
- if (isCallocLikeFn(DepInst, TLI)) {
- Res = AvailableValue::get(Constant::getNullValue(Load->getType()));
- return true;
- }
+ if (isAllocationFn(DepInst, TLI))
+ if (auto *InitVal = getInitialValueOfAllocation(cast<CallBase>(DepInst),
+ TLI, Load->getType())) {
+ Res = AvailableValue::get(InitVal);
+ return true;
+ }
if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
// Reject loads and stores that are to the same address but are of
@@ -1769,7 +1768,7 @@ bool GVNPass::processAssumeIntrinsic(AssumeInst *IntrinsicI) {
// Insert a new store to null instruction before the load to indicate that
// this code is not reachable. FIXME: We could insert unreachable
// instruction directly because we can modify the CFG.
- auto *NewS = new StoreInst(UndefValue::get(Int8Ty),
+ auto *NewS = new StoreInst(PoisonValue::get(Int8Ty),
Constant::getNullValue(Int8Ty->getPointerTo()),
IntrinsicI);
if (MSSAU) {
@@ -2991,12 +2990,12 @@ void GVNPass::addDeadBlock(BasicBlock *BB) {
}
}
- // Now undef the incoming values from the dead predecessors.
+ // Now poison the incoming values from the dead predecessors.
for (BasicBlock *P : predecessors(B)) {
if (!DeadBlocks.count(P))
continue;
for (PHINode &Phi : B->phis()) {
- Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType()));
+ Phi.setIncomingValueForBlock(P, PoisonValue::get(Phi.getType()));
if (MD)
MD->invalidateCachedPointerInfo(&Phi);
}
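For context on the GVN hunk above: instead of special-casing calloc, GVN now asks getInitialValueOfAllocation for the allocation's known initial contents, so an unclobbered load folds to that value (zero for calloc, undef for an uninitialized allocation). Illustrative sketch, not code from the patch:

    #include <cstdlib>

    int first_element(size_t n) {
      int *p = (int *)calloc(n, sizeof(int));
      if (!p)
        return -1;
      // The load below depends directly on the calloc call; its initial value
      // is all-zero, so GVN folds the load to the constant 0.
      return p[0];
    }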
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 7001d330fce0..ceb03eb17f6d 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -138,8 +138,6 @@ AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true),
namespace {
-struct RewritePhi;
-
class IndVarSimplify {
LoopInfo *LI;
ScalarEvolution *SE;
@@ -982,6 +980,7 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
assert(isLoopCounter(IndVar, L, SE));
const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
const SCEV *IVInit = AR->getStart();
+ assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
// IVInit may be a pointer while ExitCount is an integer when FindLoopCounter
// finds a valid pointer IV. Sign extend ExitCount in order to materialize a
@@ -1004,13 +1003,6 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
assert(SE->isLoopInvariant(IVOffset, L) &&
"Computed iteration count is not loop invariant!");
- // We could handle pointer IVs other than i8*, but we need to compensate for
- // gep index scaling.
- assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
- cast<PointerType>(IndVar->getType())
- ->getElementType())->isOne() &&
- "unit stride pointer IV must be i8*");
-
const SCEV *IVLimit = SE->getAddExpr(IVInit, IVOffset);
BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
return Rewriter.expandCodeFor(IVLimit, IndVar->getType(), BI);
@@ -1026,7 +1018,6 @@ static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
// IVInit integer and ExitCount pointer would only occur if a canonical IV
// were generated on top of case #2, which is not expected.
- assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
// For unit stride, IVCount = Start + ExitCount with 2's complement
// overflow.
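For context on the genLoopLimit change above: with the unit-stride assertion hoisted to the top, the exit limit is simply Start + ExitCount, computed with wrapping arithmetic as the retained comment notes. A tiny worked illustration:

    // For a counter {Start,+,1} that runs ExitCount iterations, the rewritten
    // exit compare uses Limit = Start + ExitCount (2's complement wrap is
    // fine). E.g. Start = 3, ExitCount = 7 -> the loop exits when iv == 10.
    unsigned loopLimit(unsigned Start, unsigned ExitCount) {
      return Start + ExitCount;
    }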
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 883d4afff3bd..8f5933b7bd71 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -250,12 +250,6 @@ public:
char InferAddressSpaces::ID = 0;
-namespace llvm {
-
-void initializeInferAddressSpacesPass(PassRegistry &);
-
-} // end namespace llvm
-
INITIALIZE_PASS_BEGIN(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index fe9a7211967c..a3efad104ca6 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -728,8 +728,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
// Handle some boolean conditions.
if (I->getType()->getPrimitiveSizeInBits() == 1) {
using namespace PatternMatch;
-
- assert(Preference == WantInteger && "One-bit non-integer type?");
+ if (Preference != WantInteger)
+ return false;
// X | true -> true
// X & false -> false
Value *Op0, *Op1;
@@ -789,8 +789,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
// Try to simplify some other binary operator values.
} else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
- assert(Preference != WantBlockAddress
- && "A binary operator creating a block address?");
+ if (Preference != WantInteger)
+ return false;
if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
PredValueInfoTy LHSVals;
computeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals,
@@ -811,7 +811,8 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
// Handle compare with phi operand, where the PHI is defined in this block.
if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
- assert(Preference == WantInteger && "Compares only produce integers");
+ if (Preference != WantInteger)
+ return false;
Type *CmpType = Cmp->getType();
Value *CmpLHS = Cmp->getOperand(0);
Value *CmpRHS = Cmp->getOperand(1);
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index bc792ca3d8da..7fb1a25bdf13 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -1355,7 +1355,7 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
TargetTransformInfo::TCC_Free)
return false;
// For a GEP, we cannot simply use getUserCost because currently it
- // optimistically assume that a GEP will fold into addressing mode
+ // optimistically assumes that a GEP will fold into addressing mode
// regardless of its users.
const BasicBlock *BB = GEP->getParent();
for (const User *U : GEP->users()) {
@@ -1923,26 +1923,15 @@ bool isNotCapturedBeforeOrInLoop(const Value *V, const Loop *L,
L->getHeader()->getTerminator(), DT);
}
-/// Return true iff we can prove that a caller of this function can not inspect
-/// the contents of the provided object in a well defined program.
-bool isKnownNonEscaping(Value *Object, const Loop *L,
- const TargetLibraryInfo *TLI, DominatorTree *DT) {
- if (isa<AllocaInst>(Object))
- // Since the alloca goes out of scope, we know the caller can't retain a
- // reference to it and be well defined. Thus, we don't need to check for
- // capture.
- return true;
+/// Return true if we can prove that a caller cannot inspect the object if an
+/// unwind occurs inside the loop.
+bool isNotVisibleOnUnwindInLoop(const Value *Object, const Loop *L,
+ DominatorTree *DT) {
+ bool RequiresNoCaptureBeforeUnwind;
+ if (!isNotVisibleOnUnwind(Object, RequiresNoCaptureBeforeUnwind))
+ return false;
- // For all other objects we need to know that the caller can't possibly
- // have gotten a reference to the object. There are two components of
- // that:
- // 1) Object can't be escaped by this function. This is what
- // PointerMayBeCaptured checks.
- // 2) Object can't have been captured at definition site. For this, we
- // need to know the return value is noalias. At the moment, we use a
- // weaker condition and handle only AllocLikeFunctions (which are
- // known to be noalias). TODO
- return isAllocLikeFn(Object, TLI) &&
+ return !RequiresNoCaptureBeforeUnwind ||
isNotCapturedBeforeOrInLoop(Object, L, DT);
}
@@ -2030,7 +2019,7 @@ bool llvm::promoteLoopAccessesToScalars(
// this by proving that the caller can't have a reference to the object
// after return and thus can't possibly load from the object.
Value *Object = getUnderlyingObject(SomePtr);
- if (!isKnownNonEscaping(Object, CurLoop, TLI, DT))
+ if (!isNotVisibleOnUnwindInLoop(Object, CurLoop, DT))
return false;
// Subtlety: Alloca's aren't visible to callers, but *are* potentially
// visible to other threads if captured and used during their lifetimes.
@@ -2163,7 +2152,7 @@ bool llvm::promoteLoopAccessesToScalars(
else {
Value *Object = getUnderlyingObject(SomePtr);
SafeToInsertStore =
- (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
+ (isNoAliasCall(Object) || isa<AllocaInst>(Object)) &&
isNotCapturedBeforeOrInLoop(Object, CurLoop, DT);
}
}
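For context on the LICM hunks above: the allocation-specific checks are replaced by the more general isNotVisibleOnUnwind / isNoAliasCall queries that guard scalar promotion. The kind of promotion being guarded looks roughly like this (illustrative; use and may_unwind are hypothetical externs):

    #include <cstdlib>

    extern void use(int);
    extern void may_unwind();

    void accumulate(const int *in, int n) {
      // The malloc result is noalias and never captured, so even if
      // may_unwind() throws, no caller can observe the buffer; LICM may keep
      // the running sum in a register and store it back once after the loop.
      int *sum = (int *)malloc(sizeof(int));
      *sum = 0;
      for (int i = 0; i < n; ++i) {
        may_unwind();
        *sum += in[i];
      }
      use(*sum);
      free(sum);
    }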
diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
index 5814e2f043d5..361d6c0d9381 100644
--- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -407,25 +407,19 @@ breakBackedgeIfNotTaken(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
if (!L->getLoopLatch())
return LoopDeletionResult::Unmodified;
- auto *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
- if (BTC->isZero()) {
- // SCEV knows this backedge isn't taken!
- breakLoopBackedge(L, DT, SE, LI, MSSA);
- ++NumBackedgesBroken;
- return LoopDeletionResult::Deleted;
- }
-
- // If SCEV leaves open the possibility of a zero trip count, see if
- // symbolically evaluating the first iteration lets us prove the backedge
- // unreachable.
- if (isa<SCEVCouldNotCompute>(BTC) || !SE.isKnownNonZero(BTC))
- if (canProveExitOnFirstIteration(L, DT, LI)) {
- breakLoopBackedge(L, DT, SE, LI, MSSA);
- ++NumBackedgesBroken;
- return LoopDeletionResult::Deleted;
+ auto *BTCMax = SE.getConstantMaxBackedgeTakenCount(L);
+ if (!BTCMax->isZero()) {
+ auto *BTC = SE.getBackedgeTakenCount(L);
+ if (!BTC->isZero()) {
+ if (!isa<SCEVCouldNotCompute>(BTC) && SE.isKnownNonZero(BTC))
+ return LoopDeletionResult::Unmodified;
+ if (!canProveExitOnFirstIteration(L, DT, LI))
+ return LoopDeletionResult::Unmodified;
}
-
- return LoopDeletionResult::Unmodified;
+ }
+ ++NumBackedgesBroken;
+ breakLoopBackedge(L, DT, SE, LI, MSSA);
+ return LoopDeletionResult::Deleted;
}
/// Remove a loop if it is dead.
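For context on the breakBackedgeIfNotTaken restructuring above: the cheap constant-max backedge-taken count is now consulted first, and the exact count plus the symbolic first-iteration evaluation are only used as fallbacks. The kind of loop whose backedge gets broken (illustrative):

    // SCEV proves the backedge-taken count is zero: the body runs exactly
    // once, so the latch branch is replaced with a direct exit and the loop
    // becomes straight-line code.
    int run_once(const int *a) {
      int s = 0;
      for (int i = 0; i != 1; ++i)
        s += a[i];
      return s;
    }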
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 965d1575518e..c46db4e63bfe 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -10,10 +10,13 @@
//
// The intention is to optimise loop nests like this, which together access an
// array linearly:
+//
// for (int i = 0; i < N; ++i)
// for (int j = 0; j < M; ++j)
// f(A[i*M+j]);
+//
// into one loop:
+//
// for (int i = 0; i < (N*M); ++i)
// f(A[i]);
//
@@ -22,7 +25,27 @@
// expression like i*M+j. If they had any other uses, we would have to insert a
// div/mod to reconstruct the original values, so this wouldn't be profitable.
//
-// We also need to prove that N*M will not overflow.
+// We also need to prove that N*M will not overflow. The preferred solution is
+// to widen the IV, which avoids overflow checks, so that is tried first. If
+// the IV cannot be widened, then we try to determine that this new tripcount
+// expression won't overflow.
+//
+// Q: Does LoopFlatten use SCEV?
+// Short answer: Yes and no.
+//
+// Long answer:
+// For this transformation to be valid, we require all uses of the induction
+// variables to be linear expressions of the form i*M+j. The different Loop
+// APIs are used to get some loop components like the induction variable,
+// compare statement, etc. In addition, we do some pattern matching to find the
+// linear expressions and other loop components like the loop increment. The
+// latter are examples of expressions that do use the induction variable, but
+// are safe to ignore when we check all uses to be of the form i*M+j. We keep
+// track of all of this in bookkeeping struct FlattenInfo.
+// We assume the loops to be canonical, i.e. starting at 0 and incrementing by
+// 1. This makes the RHS of the compare the loop tripcount (with the right
+// predicate). We then use SCEV to sanity check that this tripcount matches
+// the tripcount as computed by SCEV.
//
//===----------------------------------------------------------------------===//
@@ -31,6 +54,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -70,37 +94,54 @@ static cl::opt<bool>
"trip counts will never overflow"));
static cl::opt<bool>
- WidenIV("loop-flatten-widen-iv", cl::Hidden,
- cl::init(true),
+ WidenIV("loop-flatten-widen-iv", cl::Hidden, cl::init(true),
cl::desc("Widen the loop induction variables, if possible, so "
"overflow checks won't reject flattening"));
+// We require all uses of both induction variables to match this pattern:
+//
+// (OuterPHI * InnerTripCount) + InnerPHI
+//
+// I.e., it needs to be a linear expression of the induction variables and the
+// inner loop trip count. We keep track of all different expressions on which
+// checks will be performed in this bookkeeping struct.
+//
struct FlattenInfo {
- Loop *OuterLoop = nullptr;
+ Loop *OuterLoop = nullptr; // The loop pair to be flattened.
Loop *InnerLoop = nullptr;
- // These PHINodes correspond to loop induction variables, which are expected
- // to start at zero and increment by one on each loop.
- PHINode *InnerInductionPHI = nullptr;
- PHINode *OuterInductionPHI = nullptr;
- Value *InnerTripCount = nullptr;
- Value *OuterTripCount = nullptr;
- BinaryOperator *InnerIncrement = nullptr;
- BinaryOperator *OuterIncrement = nullptr;
- BranchInst *InnerBranch = nullptr;
- BranchInst *OuterBranch = nullptr;
- SmallPtrSet<Value *, 4> LinearIVUses;
+
+ PHINode *InnerInductionPHI = nullptr; // These PHINodes correspond to loop
+ PHINode *OuterInductionPHI = nullptr; // induction variables, which are
+ // expected to start at zero and
+ // increment by one on each loop.
+
+ Value *InnerTripCount = nullptr; // The product of these two tripcounts
+ Value *OuterTripCount = nullptr; // will be the new flattened loop
+ // tripcount. Also used to recognise a
+ // linear expression that will be replaced.
+
+ SmallPtrSet<Value *, 4> LinearIVUses; // Contains the linear expressions
+ // of the form i*M+j that will be
+ // replaced.
+
+ BinaryOperator *InnerIncrement = nullptr; // Uses of induction variables in
+ BinaryOperator *OuterIncrement = nullptr; // loop control statements that
+ BranchInst *InnerBranch = nullptr; // are safe to ignore.
+
+ BranchInst *OuterBranch = nullptr; // The instruction that needs to be
+ // updated with new tripcount.
+
SmallPtrSet<PHINode *, 4> InnerPHIsToTransform;
- // Whether this holds the flatten info before or after widening.
- bool Widened = false;
+ bool Widened = false; // Whether this holds the flatten info before or after
+ // widening.
- // Holds the old/narrow induction phis, i.e. the Phis before IV widening has
- // been applied. This bookkeeping is used so we can skip some checks on these
- // phi nodes.
- PHINode *NarrowInnerInductionPHI = nullptr;
- PHINode *NarrowOuterInductionPHI = nullptr;
+ PHINode *NarrowInnerInductionPHI = nullptr; // Holds the old/narrow induction
+  PHINode *NarrowOuterInductionPHI = nullptr; // phis, i.e. the Phis before IV
+                                              // widening has been applied.
+                                              // Used to skip checks on phis.
- FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL) {};
+ FlattenInfo(Loop *OL, Loop *IL) : OuterLoop(OL), InnerLoop(IL){};
bool isNarrowInductionPhi(PHINode *Phi) {
// This can't be the narrow phi if we haven't widened the IV first.
@@ -108,6 +149,118 @@ struct FlattenInfo {
return false;
return NarrowInnerInductionPHI == Phi || NarrowOuterInductionPHI == Phi;
}
+ bool isInnerLoopIncrement(User *U) {
+ return InnerIncrement == U;
+ }
+ bool isOuterLoopIncrement(User *U) {
+ return OuterIncrement == U;
+ }
+ bool isInnerLoopTest(User *U) {
+ return InnerBranch->getCondition() == U;
+ }
+
+ bool checkOuterInductionPhiUsers(SmallPtrSet<Value *, 4> &ValidOuterPHIUses) {
+ for (User *U : OuterInductionPHI->users()) {
+ if (isOuterLoopIncrement(U))
+ continue;
+
+ auto IsValidOuterPHIUses = [&] (User *U) -> bool {
+ LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump());
+ if (!ValidOuterPHIUses.count(U)) {
+ LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+ return true;
+ };
+
+ if (auto *V = dyn_cast<TruncInst>(U)) {
+ for (auto *K : V->users()) {
+ if (!IsValidOuterPHIUses(K))
+ return false;
+ }
+ continue;
+ }
+
+ if (!IsValidOuterPHIUses(U))
+ return false;
+ }
+ return true;
+ }
+
+ bool matchLinearIVUser(User *U, Value *InnerTripCount,
+ SmallPtrSet<Value *, 4> &ValidOuterPHIUses) {
+ LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
+ Value *MatchedMul = nullptr;
+ Value *MatchedItCount = nullptr;
+
+ bool IsAdd = match(U, m_c_Add(m_Specific(InnerInductionPHI),
+ m_Value(MatchedMul))) &&
+ match(MatchedMul, m_c_Mul(m_Specific(OuterInductionPHI),
+ m_Value(MatchedItCount)));
+
+ // Matches the same pattern as above, except it also looks for truncs
+ // on the phi, which can be the result of widening the induction variables.
+ bool IsAddTrunc =
+ match(U, m_c_Add(m_Trunc(m_Specific(InnerInductionPHI)),
+ m_Value(MatchedMul))) &&
+ match(MatchedMul, m_c_Mul(m_Trunc(m_Specific(OuterInductionPHI)),
+ m_Value(MatchedItCount)));
+
+ if (!MatchedItCount)
+ return false;
+
+ // Look through extends if the IV has been widened.
+ if (Widened &&
+ (isa<SExtInst>(MatchedItCount) || isa<ZExtInst>(MatchedItCount))) {
+ assert(MatchedItCount->getType() == InnerInductionPHI->getType() &&
+ "Unexpected type mismatch in types after widening");
+ MatchedItCount = isa<SExtInst>(MatchedItCount)
+ ? dyn_cast<SExtInst>(MatchedItCount)->getOperand(0)
+ : dyn_cast<ZExtInst>(MatchedItCount)->getOperand(0);
+ }
+
+ if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerTripCount) {
+ LLVM_DEBUG(dbgs() << "Use is optimisable\n");
+ ValidOuterPHIUses.insert(MatchedMul);
+ LinearIVUses.insert(U);
+ return true;
+ }
+
+ LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
+ return false;
+ }
+
+ bool checkInnerInductionPhiUsers(SmallPtrSet<Value *, 4> &ValidOuterPHIUses) {
+ Value *SExtInnerTripCount = InnerTripCount;
+ if (Widened &&
+ (isa<SExtInst>(InnerTripCount) || isa<ZExtInst>(InnerTripCount)))
+ SExtInnerTripCount = cast<Instruction>(InnerTripCount)->getOperand(0);
+
+ for (User *U : InnerInductionPHI->users()) {
+ if (isInnerLoopIncrement(U))
+ continue;
+
+ // After widening the IVs, a trunc instruction might have been introduced,
+ // so look through truncs.
+ if (isa<TruncInst>(U)) {
+ if (!U->hasOneUse())
+ return false;
+ U = *U->user_begin();
+ }
+
+ // If the use is in the compare (which is also the condition of the inner
+      // branch) then the compare has been altered by another transformation, e.g.
+ // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, where tripcount is
+ // a constant. Ignore this use as the compare gets removed later anyway.
+ if (isInnerLoopTest(U))
+ continue;
+
+ if (!matchLinearIVUser(U, SExtInnerTripCount, ValidOuterPHIUses))
+ return false;
+ }
+ return true;
+ }
};
static bool
@@ -121,6 +274,77 @@ setLoopComponents(Value *&TC, Value *&TripCount, BinaryOperator *&Increment,
return true;
}
+// Given the RHS of the loop latch compare instruction, verify with SCEV
+// that this is indeed the loop tripcount.
+// TODO: This used to be a straightforward check but has grown to be quite
+// complicated now. It is therefore worth revisiting what the additional
+// benefits are of this (compared to relying on canonical loops and pattern
+// matching).
+static bool verifyTripCount(Value *RHS, Loop *L,
+ SmallPtrSetImpl<Instruction *> &IterationInstructions,
+ PHINode *&InductionPHI, Value *&TripCount, BinaryOperator *&Increment,
+ BranchInst *&BackBranch, ScalarEvolution *SE, bool IsWidened) {
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+ LLVM_DEBUG(dbgs() << "Backedge-taken count is not predictable\n");
+ return false;
+ }
+
+  // The Extend=false flag is used for getTripCountFromExitCount as we want
+  // to verify and match it with the pattern-matched tripcount. Note that
+  // overflow checks are performed in checkOverflow, but we first try to
+  // avoid them by widening the IV.
+ const SCEV *SCEVTripCount =
+ SE->getTripCountFromExitCount(BackedgeTakenCount, /*Extend=*/false);
+
+ const SCEV *SCEVRHS = SE->getSCEV(RHS);
+ if (SCEVRHS == SCEVTripCount)
+ return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
+ ConstantInt *ConstantRHS = dyn_cast<ConstantInt>(RHS);
+ if (ConstantRHS) {
+ const SCEV *BackedgeTCExt = nullptr;
+ if (IsWidened) {
+ const SCEV *SCEVTripCountExt;
+ // Find the extended backedge taken count and extended trip count using
+ // SCEV. One of these should now match the RHS of the compare.
+ BackedgeTCExt = SE->getZeroExtendExpr(BackedgeTakenCount, RHS->getType());
+ SCEVTripCountExt = SE->getTripCountFromExitCount(BackedgeTCExt, false);
+ if (SCEVRHS != BackedgeTCExt && SCEVRHS != SCEVTripCountExt) {
+ LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
+ return false;
+ }
+ }
+ // If the RHS of the compare is equal to the backedge taken count we need
+ // to add one to get the trip count.
+ if (SCEVRHS == BackedgeTCExt || SCEVRHS == BackedgeTakenCount) {
+ ConstantInt *One = ConstantInt::get(ConstantRHS->getType(), 1);
+ Value *NewRHS = ConstantInt::get(
+ ConstantRHS->getContext(), ConstantRHS->getValue() + One->getValue());
+ return setLoopComponents(NewRHS, TripCount, Increment,
+ IterationInstructions);
+ }
+ return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
+ }
+ // If the RHS isn't a constant then check that the reason it doesn't match
+ // the SCEV trip count is because the RHS is a ZExt or SExt instruction
+ // (and take the trip count to be the RHS).
+ if (!IsWidened) {
+ LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
+ return false;
+ }
+ auto *TripCountInst = dyn_cast<Instruction>(RHS);
+ if (!TripCountInst) {
+ LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
+ return false;
+ }
+ if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) ||
+ SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) {
+ LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n");
+ return false;
+ }
+ return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
+}
+
// Finds the induction variable, increment and trip count for a simple loop that
// we can flatten.
static bool findLoopComponents(
@@ -197,63 +421,9 @@ static bool findLoopComponents(
// another transformation has changed the compare (e.g. icmp ult %inc,
// tripcount -> icmp ult %j, tripcount-1), or both.
Value *RHS = Compare->getOperand(1);
- const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
- LLVM_DEBUG(dbgs() << "Backedge-taken count is not predictable\n");
- return false;
- }
- // The use of the Extend=false flag on getTripCountFromExitCount was added
- // during a refactoring to preserve existing behavior. However, there's
- // nothing obvious in the surrounding code when handles the overflow case.
- // FIXME: audit code to establish whether there's a latent bug here.
- const SCEV *SCEVTripCount =
- SE->getTripCountFromExitCount(BackedgeTakenCount, false);
- const SCEV *SCEVRHS = SE->getSCEV(RHS);
- if (SCEVRHS == SCEVTripCount)
- return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
- ConstantInt *ConstantRHS = dyn_cast<ConstantInt>(RHS);
- if (ConstantRHS) {
- const SCEV *BackedgeTCExt = nullptr;
- if (IsWidened) {
- const SCEV *SCEVTripCountExt;
- // Find the extended backedge taken count and extended trip count using
- // SCEV. One of these should now match the RHS of the compare.
- BackedgeTCExt = SE->getZeroExtendExpr(BackedgeTakenCount, RHS->getType());
- SCEVTripCountExt = SE->getTripCountFromExitCount(BackedgeTCExt, false);
- if (SCEVRHS != BackedgeTCExt && SCEVRHS != SCEVTripCountExt) {
- LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
- return false;
- }
- }
- // If the RHS of the compare is equal to the backedge taken count we need
- // to add one to get the trip count.
- if (SCEVRHS == BackedgeTCExt || SCEVRHS == BackedgeTakenCount) {
- ConstantInt *One = ConstantInt::get(ConstantRHS->getType(), 1);
- Value *NewRHS = ConstantInt::get(
- ConstantRHS->getContext(), ConstantRHS->getValue() + One->getValue());
- return setLoopComponents(NewRHS, TripCount, Increment,
- IterationInstructions);
- }
- return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
- }
- // If the RHS isn't a constant then check that the reason it doesn't match
- // the SCEV trip count is because the RHS is a ZExt or SExt instruction
- // (and take the trip count to be the RHS).
- if (!IsWidened) {
- LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
- return false;
- }
- auto *TripCountInst = dyn_cast<Instruction>(RHS);
- if (!TripCountInst) {
- LLVM_DEBUG(dbgs() << "Could not find valid trip count\n");
- return false;
- }
- if ((!isa<ZExtInst>(TripCountInst) && !isa<SExtInst>(TripCountInst)) ||
- SE->getSCEV(TripCountInst->getOperand(0)) != SCEVTripCount) {
- LLVM_DEBUG(dbgs() << "Could not find valid extended trip count\n");
- return false;
- }
- return setLoopComponents(RHS, TripCount, Increment, IterationInstructions);
+
+ return verifyTripCount(RHS, L, IterationInstructions, InductionPHI, TripCount,
+ Increment, BackBranch, SE, IsWidened);
}
static bool checkPHIs(FlattenInfo &FI, const TargetTransformInfo *TTI) {
@@ -399,108 +569,26 @@ checkOuterLoopInsts(FlattenInfo &FI,
return true;
}
-static bool checkIVUsers(FlattenInfo &FI) {
- // We require all uses of both induction variables to match this pattern:
- //
- // (OuterPHI * InnerTripCount) + InnerPHI
- //
- // Any uses of the induction variables not matching that pattern would
- // require a div/mod to reconstruct in the flattened loop, so the
- // transformation wouldn't be profitable.
-
- Value *InnerTripCount = FI.InnerTripCount;
- if (FI.Widened &&
- (isa<SExtInst>(InnerTripCount) || isa<ZExtInst>(InnerTripCount)))
- InnerTripCount = cast<Instruction>(InnerTripCount)->getOperand(0);
+
+// We require all uses of both induction variables to match this pattern:
+//
+// (OuterPHI * InnerTripCount) + InnerPHI
+//
+// Any uses of the induction variables not matching that pattern would
+// require a div/mod to reconstruct in the flattened loop, so the
+// transformation wouldn't be profitable.
+static bool checkIVUsers(FlattenInfo &FI) {
// Check that all uses of the inner loop's induction variable match the
// expected pattern, recording the uses of the outer IV.
SmallPtrSet<Value *, 4> ValidOuterPHIUses;
- for (User *U : FI.InnerInductionPHI->users()) {
- if (U == FI.InnerIncrement)
- continue;
-
- // After widening the IVs, a trunc instruction might have been introduced,
- // so look through truncs.
- if (isa<TruncInst>(U)) {
- if (!U->hasOneUse())
- return false;
- U = *U->user_begin();
- }
-
- // If the use is in the compare (which is also the condition of the inner
- // branch) then the compare has been altered by another transformation e.g
- // icmp ult %inc, tripcount -> icmp ult %j, tripcount-1, where tripcount is
- // a constant. Ignore this use as the compare gets removed later anyway.
- if (U == FI.InnerBranch->getCondition())
- continue;
-
- LLVM_DEBUG(dbgs() << "Found use of inner induction variable: "; U->dump());
-
- Value *MatchedMul = nullptr;
- Value *MatchedItCount = nullptr;
- bool IsAdd = match(U, m_c_Add(m_Specific(FI.InnerInductionPHI),
- m_Value(MatchedMul))) &&
- match(MatchedMul, m_c_Mul(m_Specific(FI.OuterInductionPHI),
- m_Value(MatchedItCount)));
-
- // Matches the same pattern as above, except it also looks for truncs
- // on the phi, which can be the result of widening the induction variables.
- bool IsAddTrunc =
- match(U, m_c_Add(m_Trunc(m_Specific(FI.InnerInductionPHI)),
- m_Value(MatchedMul))) &&
- match(MatchedMul, m_c_Mul(m_Trunc(m_Specific(FI.OuterInductionPHI)),
- m_Value(MatchedItCount)));
-
- if (!MatchedItCount)
- return false;
- // Look through extends if the IV has been widened.
- if (FI.Widened &&
- (isa<SExtInst>(MatchedItCount) || isa<ZExtInst>(MatchedItCount))) {
- assert(MatchedItCount->getType() == FI.InnerInductionPHI->getType() &&
- "Unexpected type mismatch in types after widening");
- MatchedItCount = isa<SExtInst>(MatchedItCount)
- ? dyn_cast<SExtInst>(MatchedItCount)->getOperand(0)
- : dyn_cast<ZExtInst>(MatchedItCount)->getOperand(0);
- }
-
- if ((IsAdd || IsAddTrunc) && MatchedItCount == InnerTripCount) {
- LLVM_DEBUG(dbgs() << "Use is optimisable\n");
- ValidOuterPHIUses.insert(MatchedMul);
- FI.LinearIVUses.insert(U);
- } else {
- LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
- return false;
- }
- }
+ if (!FI.checkInnerInductionPhiUsers(ValidOuterPHIUses))
+ return false;
// Check that there are no uses of the outer IV other than the ones found
// as part of the pattern above.
- for (User *U : FI.OuterInductionPHI->users()) {
- if (U == FI.OuterIncrement)
- continue;
-
- auto IsValidOuterPHIUses = [&] (User *U) -> bool {
- LLVM_DEBUG(dbgs() << "Found use of outer induction variable: "; U->dump());
- if (!ValidOuterPHIUses.count(U)) {
- LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
- return false;
- }
- LLVM_DEBUG(dbgs() << "Use is optimisable\n");
- return true;
- };
-
- if (auto *V = dyn_cast<TruncInst>(U)) {
- for (auto *K : V->users()) {
- if (!IsValidOuterPHIUses(K))
- return false;
- }
- continue;
- }
-
- if (!IsValidOuterPHIUses(U))
- return false;
- }
+ if (!FI.checkOuterInductionPhiUsers(ValidOuterPHIUses))
+ return false;
LLVM_DEBUG(dbgs() << "checkIVUsers: OK\n";
dbgs() << "Found " << FI.LinearIVUses.size()
@@ -535,7 +623,7 @@ static OverflowResult checkOverflow(FlattenInfo &FI, DominatorTree *DT,
for (Value *U : V->users()) {
if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
for (Value *GEPUser : U->users()) {
- Instruction *GEPUserInst = dyn_cast<Instruction>(GEPUser);
+ auto *GEPUserInst = cast<Instruction>(GEPUser);
if (!isa<LoadInst>(GEPUserInst) &&
!(isa<StoreInst>(GEPUserInst) &&
GEP == GEPUserInst->getOperand(1)))
@@ -611,7 +699,8 @@ static bool CanFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
ScalarEvolution *SE, AssumptionCache *AC,
- const TargetTransformInfo *TTI, LPMUpdater *U) {
+ const TargetTransformInfo *TTI, LPMUpdater *U,
+ MemorySSAUpdater *MSSAU) {
Function *F = FI.OuterLoop->getHeader()->getParent();
LLVM_DEBUG(dbgs() << "Checks all passed, doing the transformation\n");
{
@@ -647,7 +736,11 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
BasicBlock *InnerExitingBlock = FI.InnerLoop->getExitingBlock();
InnerExitingBlock->getTerminator()->eraseFromParent();
BranchInst::Create(InnerExitBlock, InnerExitingBlock);
+
+ // Update the DomTree and MemorySSA.
DT->deleteEdge(InnerExitingBlock, FI.InnerLoop->getHeader());
+ if (MSSAU)
+ MSSAU->removeEdge(InnerExitingBlock, FI.InnerLoop->getHeader());
// Replace all uses of the polynomial calculated from the two induction
// variables with the one new one.
@@ -658,8 +751,8 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
OuterValue = Builder.CreateTrunc(FI.OuterInductionPHI, V->getType(),
"flatten.trunciv");
- LLVM_DEBUG(dbgs() << "Replacing: "; V->dump();
- dbgs() << "with: "; OuterValue->dump());
+ LLVM_DEBUG(dbgs() << "Replacing: "; V->dump(); dbgs() << "with: ";
+ OuterValue->dump());
V->replaceAllUsesWith(OuterValue);
}
@@ -698,7 +791,8 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
// (OuterTripCount * InnerTripCount) as the new trip count is safe.
if (InnerType != OuterType ||
InnerType->getScalarSizeInBits() >= MaxLegalSize ||
- MaxLegalType->getScalarSizeInBits() < InnerType->getScalarSizeInBits() * 2) {
+ MaxLegalType->getScalarSizeInBits() <
+ InnerType->getScalarSizeInBits() * 2) {
LLVM_DEBUG(dbgs() << "Can't widen the IV\n");
return false;
}
@@ -708,10 +802,10 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
unsigned ElimExt = 0;
unsigned Widened = 0;
- auto CreateWideIV = [&] (WideIVInfo WideIV, bool &Deleted) -> bool {
- PHINode *WidePhi = createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts,
- ElimExt, Widened, true /* HasGuards */,
- true /* UsePostIncrementRanges */);
+ auto CreateWideIV = [&](WideIVInfo WideIV, bool &Deleted) -> bool {
+ PHINode *WidePhi =
+ createWideIV(WideIV, LI, SE, Rewriter, DT, DeadInsts, ElimExt, Widened,
+ true /* HasGuards */, true /* UsePostIncrementRanges */);
if (!WidePhi)
return false;
LLVM_DEBUG(dbgs() << "Created wide phi: "; WidePhi->dump());
@@ -721,14 +815,14 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
};
bool Deleted;
- if (!CreateWideIV({FI.InnerInductionPHI, MaxLegalType, false }, Deleted))
+ if (!CreateWideIV({FI.InnerInductionPHI, MaxLegalType, false}, Deleted))
return false;
// Add the narrow phi to the list, so that it will be adjusted later when
// the transformation is performed.
if (!Deleted)
FI.InnerPHIsToTransform.insert(FI.InnerInductionPHI);
- if (!CreateWideIV({FI.OuterInductionPHI, MaxLegalType, false }, Deleted))
+ if (!CreateWideIV({FI.OuterInductionPHI, MaxLegalType, false}, Deleted))
return false;
assert(Widened && "Widened IV expected");
@@ -744,7 +838,8 @@ static bool CanWidenIV(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
ScalarEvolution *SE, AssumptionCache *AC,
- const TargetTransformInfo *TTI, LPMUpdater *U) {
+ const TargetTransformInfo *TTI, LPMUpdater *U,
+ MemorySSAUpdater *MSSAU) {
LLVM_DEBUG(
dbgs() << "Loop flattening running on outer loop "
<< FI.OuterLoop->getHeader()->getName() << " and inner loop "
@@ -773,7 +868,7 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
// If we have widened and can perform the transformation, do that here.
if (CanFlatten)
- return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U);
+ return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU);
// Otherwise, if we haven't widened the IV, check if the new iteration
// variable might overflow. In this case, we need to version the loop, and
@@ -791,18 +886,19 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
}
LLVM_DEBUG(dbgs() << "Multiply cannot overflow, modifying loop in-place\n");
- return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U);
+ return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU);
}
bool Flatten(LoopNest &LN, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE,
- AssumptionCache *AC, TargetTransformInfo *TTI, LPMUpdater *U) {
+ AssumptionCache *AC, TargetTransformInfo *TTI, LPMUpdater *U,
+ MemorySSAUpdater *MSSAU) {
bool Changed = false;
for (Loop *InnerLoop : LN.getLoops()) {
auto *OuterLoop = InnerLoop->getParentLoop();
if (!OuterLoop)
continue;
FlattenInfo FI(OuterLoop, InnerLoop);
- Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI, U);
+ Changed |= FlattenLoopPair(FI, DT, LI, SE, AC, TTI, U, MSSAU);
}
return Changed;
}
@@ -813,16 +909,30 @@ PreservedAnalyses LoopFlattenPass::run(LoopNest &LN, LoopAnalysisManager &LAM,
bool Changed = false;
+ Optional<MemorySSAUpdater> MSSAU;
+ if (AR.MSSA) {
+ MSSAU = MemorySSAUpdater(AR.MSSA);
+ if (VerifyMemorySSA)
+ AR.MSSA->verifyMemorySSA();
+ }
+
// The loop flattening pass requires loops to be
// in simplified form, and also needs LCSSA. Running
// this pass will simplify all loops that contain inner loops,
// regardless of whether anything ends up being flattened.
- Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U);
+ Changed |= Flatten(LN, &AR.DT, &AR.LI, &AR.SE, &AR.AC, &AR.TTI, &U,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
if (!Changed)
return PreservedAnalyses::all();
- return getLoopPassPreservedAnalyses();
+ if (AR.MSSA && VerifyMemorySSA)
+ AR.MSSA->verifyMemorySSA();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
}
namespace {
@@ -842,6 +952,7 @@ public:
AU.addPreserved<TargetTransformInfoWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addPreserved<AssumptionCacheTracker>();
+ AU.addPreserved<MemorySSAWrapperPass>();
}
};
} // namespace
@@ -854,7 +965,9 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_END(LoopFlattenLegacyPass, "loop-flatten", "Flattens loops",
false, false)
-FunctionPass *llvm::createLoopFlattenPass() { return new LoopFlattenLegacyPass(); }
+FunctionPass *llvm::createLoopFlattenPass() {
+ return new LoopFlattenLegacyPass();
+}
bool LoopFlattenLegacyPass::runOnFunction(Function &F) {
ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
@@ -864,10 +977,17 @@ bool LoopFlattenLegacyPass::runOnFunction(Function &F) {
auto &TTIP = getAnalysis<TargetTransformInfoWrapperPass>();
auto *TTI = &TTIP.getTTI(F);
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *MSSA = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+
+ Optional<MemorySSAUpdater> MSSAU;
+ if (MSSA)
+ MSSAU = MemorySSAUpdater(&MSSA->getMSSA());
+
bool Changed = false;
for (Loop *L : *LI) {
auto LN = LoopNest::getLoopNest(*L, *SE);
- Changed |= Flatten(*LN, DT, LI, SE, AC, TTI, nullptr);
+ Changed |= Flatten(*LN, DT, LI, SE, AC, TTI, nullptr,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
}
return Changed;
}
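For context on the LoopFlatten changes above: the new header comment explains that the flattened trip count N*M must not overflow and that widening the IVs is tried first, with the explicit overflow check (and loop versioning) as the fallback. An illustrative instance of the hazard:

    // With 32-bit induction variables and, say, N = M = 70000, the linear use
    // i*M+j wraps, so the nest cannot be flattened as-is. LoopFlatten first
    // tries to widen i and j to 64 bits; only if that fails does it rely on
    // checkOverflow / versioning to prove or guard against overflow.
    void touch(char *A, unsigned N, unsigned M) {
      for (unsigned i = 0; i < N; ++i)
        for (unsigned j = 0; j < M; ++j)
          A[i * M + j] = 0;
    }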
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 5d00fa56e888..35ba4e2b4032 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1117,7 +1117,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
BasicBlock *Preheader = CurLoop->getLoopPreheader();
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, *DL, "loop-idiom");
- SCEVExpanderCleaner ExpCleaner(Expander, *DT);
+ SCEVExpanderCleaner ExpCleaner(Expander);
Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
@@ -1328,7 +1328,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, *DL, "loop-idiom");
- SCEVExpanderCleaner ExpCleaner(Expander, *DT);
+ SCEVExpanderCleaner ExpCleaner(Expander);
bool Changed = false;
const SCEV *StrStart = StoreEv->getStart();
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 9f605b4ac4ad..c2b065c4eb31 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -292,33 +292,6 @@ static LoopVector populateWorklist(Loop &L) {
return LoopList;
}
-static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
- PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
- if (InnerIndexVar)
- return InnerIndexVar;
- if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
- return nullptr;
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
- PHINode *PhiVar = cast<PHINode>(I);
- Type *PhiTy = PhiVar->getType();
- if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
- !PhiTy->isPointerTy())
- return nullptr;
- const SCEVAddRecExpr *AddRec =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
- if (!AddRec || !AddRec->isAffine())
- continue;
- const SCEV *Step = AddRec->getStepRecurrence(*SE);
- if (!isa<SCEVConstant>(Step))
- continue;
- // Found the induction variable.
- // FIXME: Handle loops with more than one induction variable. Note that,
- // currently, legality makes sure we have only one induction variable.
- return PhiVar;
- }
- return nullptr;
-}
-
namespace {
/// LoopInterchangeLegality checks if it is legal to interchange the loop.
@@ -332,9 +305,13 @@ public:
bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
CharMatrix &DepMatrix);
+ /// Discover induction PHIs in the header of \p L. Induction
+ /// PHIs are added to \p Inductions.
+ bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
+
/// Check if the loop structure is understood. We do not handle triangular
/// loops for now.
- bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
+ bool isLoopStructureUnderstood();
bool currentLimitations();
@@ -342,6 +319,10 @@ public:
return OuterInnerReductions;
}
+ const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const {
+ return InnerLoopInductions;
+ }
+
private:
bool tightlyNested(Loop *Outer, Loop *Inner);
bool containsUnsafeInstructions(BasicBlock *BB);
@@ -365,6 +346,9 @@ private:
/// Set of reduction PHIs taking part of a reduction across the inner and
/// outer loop.
SmallPtrSet<PHINode *, 4> OuterInnerReductions;
+
+ /// Set of inner loop induction PHIs
+ SmallVector<PHINode *, 8> InnerLoopInductions;
};
/// LoopInterchangeProfitability checks if it is profitable to interchange the
@@ -635,25 +619,26 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
return true;
}
-bool LoopInterchangeLegality::isLoopStructureUnderstood(
- PHINode *InnerInduction) {
- unsigned Num = InnerInduction->getNumOperands();
+bool LoopInterchangeLegality::isLoopStructureUnderstood() {
BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
- for (unsigned i = 0; i < Num; ++i) {
- Value *Val = InnerInduction->getOperand(i);
- if (isa<Constant>(Val))
- continue;
- Instruction *I = dyn_cast<Instruction>(Val);
- if (!I)
- return false;
- // TODO: Handle triangular loops.
- // e.g. for(int i=0;i<N;i++)
- // for(int j=i;j<N;j++)
- unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
- if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
- InnerLoopPreheader &&
- !OuterLoop->isLoopInvariant(I)) {
- return false;
+ for (PHINode *InnerInduction : InnerLoopInductions) {
+ unsigned Num = InnerInduction->getNumOperands();
+ for (unsigned i = 0; i < Num; ++i) {
+ Value *Val = InnerInduction->getOperand(i);
+ if (isa<Constant>(Val))
+ continue;
+ Instruction *I = dyn_cast<Instruction>(Val);
+ if (!I)
+ return false;
+ // TODO: Handle triangular loops.
+ // e.g. for(int i=0;i<N;i++)
+ // for(int j=i;j<N;j++)
+ unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
+ if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
+ InnerLoopPreheader &&
+ !OuterLoop->isLoopInvariant(I)) {
+ return false;
+ }
}
}
@@ -682,27 +667,34 @@ bool LoopInterchangeLegality::isLoopStructureUnderstood(
// Return true if V is InnerInduction, or a cast from
// InnerInduction, or a binary operator that involves
// InnerInduction and a constant.
- std::function<bool(Value *)> IsPathToIndVar;
- IsPathToIndVar = [&InnerInduction, &IsPathToIndVar](Value *V) -> bool {
- if (V == InnerInduction)
+ std::function<bool(Value *)> IsPathToInnerIndVar;
+ IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
+ if (llvm::is_contained(InnerLoopInductions, V))
return true;
if (isa<Constant>(V))
return true;
- Instruction *I = dyn_cast<Instruction>(V);
+ const Instruction *I = dyn_cast<Instruction>(V);
if (!I)
return false;
if (isa<CastInst>(I))
- return IsPathToIndVar(I->getOperand(0));
+ return IsPathToInnerIndVar(I->getOperand(0));
if (isa<BinaryOperator>(I))
- return IsPathToIndVar(I->getOperand(0)) &&
- IsPathToIndVar(I->getOperand(1));
+ return IsPathToInnerIndVar(I->getOperand(0)) &&
+ IsPathToInnerIndVar(I->getOperand(1));
return false;
};
- if (IsPathToIndVar(Op0) && !isa<Constant>(Op0)) {
+ // In case of multiple inner loop indvars, it is okay if LHS and RHS
+ // are both inner indvar related variables.
+ if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
+ return true;
+
+ // Otherwise we check if the cmp instruction compares an inner indvar
+  // related variable (Left) with an outer loop invariant (Right).
+ if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
Left = Op0;
Right = Op1;
- } else if (IsPathToIndVar(Op1) && !isa<Constant>(Op1)) {
+ } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
Left = Op1;
Right = Op0;
}
@@ -793,7 +785,6 @@ bool LoopInterchangeLegality::findInductionAndReductions(
// This function indicates the current limitations in the transform as a result
// of which we do not proceed.
bool LoopInterchangeLegality::currentLimitations() {
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
// transform currently expects the loop latches to also be the exiting
@@ -815,7 +806,6 @@ bool LoopInterchangeLegality::currentLimitations() {
return true;
}
- PHINode *InnerInductionVar;
SmallVector<PHINode *, 8> Inductions;
if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
LLVM_DEBUG(
@@ -831,20 +821,6 @@ bool LoopInterchangeLegality::currentLimitations() {
return true;
}
- // TODO: Currently we handle only loops with 1 induction variable.
- if (Inductions.size() != 1) {
- LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
- << "supported currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with 1 induction variable can be "
- "interchanged currently.";
- });
- return true;
- }
-
Inductions.clear();
if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) {
LLVM_DEBUG(
@@ -860,24 +836,8 @@ bool LoopInterchangeLegality::currentLimitations() {
return true;
}
- // TODO: Currently we handle only loops with 1 induction variable.
- if (Inductions.size() != 1) {
- LLVM_DEBUG(
- dbgs() << "We currently only support loops with 1 induction variable."
- << "Failed to interchange due to current limitation\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Only inner loops with 1 induction variable can be "
- "interchanged currently.";
- });
- return true;
- }
- InnerInductionVar = Inductions.pop_back_val();
-
// TODO: Triangular loops are not handled for now.
- if (!isLoopStructureUnderstood(InnerInductionVar)) {
+ if (!isLoopStructureUnderstood()) {
LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
@@ -888,79 +848,17 @@ bool LoopInterchangeLegality::currentLimitations() {
return true;
}
- // TODO: Current limitation: Since we split the inner loop latch at the point
- // were induction variable is incremented (induction.next); We cannot have
- // more than 1 user of induction.next since it would result in broken code
- // after split.
- // e.g.
- // for(i=0;i<N;i++) {
- // for(j = 0;j<M;j++) {
- // A[j+1][i+2] = A[j][i]+k;
- // }
- // }
- Instruction *InnerIndexVarInc = nullptr;
- if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
- InnerIndexVarInc =
- dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
- else
- InnerIndexVarInc =
- dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
-
- if (!InnerIndexVarInc) {
- LLVM_DEBUG(
- dbgs() << "Did not find an instruction to increment the induction "
- << "variable.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "The inner loop does not increment the induction variable.";
- });
- return true;
- }
-
- // Since we split the inner loop latch on this induction variable. Make sure
- // we do not have any instruction between the induction variable and branch
- // instruction.
-
- bool FoundInduction = false;
- for (const Instruction &I :
- llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
- if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
- isa<ZExtInst>(I))
- continue;
-
- // We found an instruction. If this is not induction variable then it is not
- // safe to split this loop latch.
- if (!I.isIdenticalTo(InnerIndexVarInc)) {
- LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "
- << "variable increment and branch.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(
- DEBUG_TYPE, "UnsupportedInsBetweenInduction",
- InnerLoop->getStartLoc(), InnerLoop->getHeader())
- << "Found unsupported instruction between induction variable "
- "increment and branch.";
- });
- return true;
- }
+ return false;
+}
- FoundInduction = true;
- break;
- }
- // The loop latch ended and we didn't find the induction variable return as
- // current limitation.
- if (!FoundInduction) {
- LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Did not find the induction variable.";
- });
- return true;
+bool LoopInterchangeLegality::findInductions(
+ Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
+ for (PHINode &PHI : L->getHeader()->phis()) {
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
+ Inductions.push_back(&PHI);
}
- return false;
+ return !Inductions.empty();
}
// We currently only support LCSSA PHI nodes in the inner loop exit, if their
@@ -1076,7 +974,7 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
for (Instruction &I : BB->instructionsWithoutDebug())
if (CallInst *CI = dyn_cast<CallInst>(&I)) {
// readnone functions do not prevent interchanging.
- if (CI->doesNotReadMemory())
+ if (CI->onlyWritesMemory())
continue;
LLVM_DEBUG(
dbgs() << "Loops with call instructions cannot be interchanged "
@@ -1091,6 +989,11 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
return false;
}
+ if (!findInductions(InnerLoop, InnerLoopInductions)) {
+ LLVM_DEBUG(dbgs() << "Cound not find inner loop induction variables.\n");
+ return false;
+ }
+
if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
ORE->emit([&]() {
@@ -1347,25 +1250,25 @@ void LoopInterchangeTransform::restructureLoops(
bool LoopInterchangeTransform::transform() {
bool Transformed = false;
- Instruction *InnerIndexVar;
if (InnerLoop->getSubLoops().empty()) {
BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
- PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
- if (!InductionPHI) {
+ auto &InductionPHIs = LIL.getInnerLoopInductions();
+ if (InductionPHIs.empty()) {
LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
return false;
}
- if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
- InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
- else
- InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
-
- // Ensure that InductionPHI is the first Phi node.
- if (&InductionPHI->getParent()->front() != InductionPHI)
- InductionPHI->moveBefore(&InductionPHI->getParent()->front());
+ SmallVector<Instruction *, 8> InnerIndexVarList;
+ for (PHINode *CurInductionPHI : InductionPHIs) {
+ if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVarList.push_back(
+ dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
+ else
+ InnerIndexVarList.push_back(
+ dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
+ }
// Create a new latch block for the inner loop. We split at the
// current latch's terminator and then move the condition and all
@@ -1377,7 +1280,7 @@ bool LoopInterchangeTransform::transform() {
SmallSetVector<Instruction *, 4> WorkList;
unsigned i = 0;
- auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() {
+ auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
for (; i < WorkList.size(); i++) {
// Duplicate instruction and move it to the new latch. Update uses that
// have been moved.
@@ -1389,7 +1292,8 @@ bool LoopInterchangeTransform::transform() {
for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
Instruction *UserI = cast<Instruction>(U.getUser());
if (!InnerLoop->contains(UserI->getParent()) ||
- UserI->getParent() == NewLatch || UserI == InductionPHI)
+ UserI->getParent() == NewLatch ||
+ llvm::is_contained(InductionPHIs, UserI))
U.set(NewI);
}
// Add operands of moved instruction to the worklist, except if they are
@@ -1398,7 +1302,7 @@ bool LoopInterchangeTransform::transform() {
Instruction *OpI = dyn_cast<Instruction>(Op);
if (!OpI ||
this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
- OpI == InductionPHI)
+ llvm::is_contained(InductionPHIs, OpI))
continue;
WorkList.insert(OpI);
}
@@ -1412,7 +1316,8 @@ bool LoopInterchangeTransform::transform() {
if (CondI)
WorkList.insert(CondI);
MoveInstructions();
- WorkList.insert(cast<Instruction>(InnerIndexVar));
+ for (Instruction *InnerIndexVar : InnerIndexVarList)
+ WorkList.insert(cast<Instruction>(InnerIndexVar));
MoveInstructions();
// Split the inner loop's PHI nodes out into a separate basic block.
@@ -1685,7 +1590,6 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
InnerLoopLatchSuccessor, DTUpdates);
-
if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
else
@@ -1712,19 +1616,22 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
for (PHINode &PHI : InnerLoopHeader->phis())
if (OuterInnerReductions.contains(&PHI))
- InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+ InnerLoopPHIs.push_back(&PHI);
+
for (PHINode &PHI : OuterLoopHeader->phis())
if (OuterInnerReductions.contains(&PHI))
- OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
+ OuterLoopPHIs.push_back(&PHI);
// Now move the remaining reduction PHIs from outer to inner loop header and
// vice versa. The PHI nodes must be part of a reduction across the inner and
// outer loop; all that remains to be done is updating the incoming blocks.
for (PHINode *PHI : OuterLoopPHIs) {
+ LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
}
for (PHINode *PHI : InnerLoopPHIs) {
+ LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
}
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 798af48c2337..654f0d2a03a8 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -3486,6 +3486,31 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
// Don't bother if the instruction is in a BB which ends in an EHPad.
if (UseBB->getTerminator()->isEHPad())
continue;
+
+ // Ignore cases in which the currently-examined value could come from
+ // a basic block terminated with an EHPad. This checks all incoming
+ // blocks of the phi node since it is possible that the same incoming
+ // value comes from multiple basic blocks, only some of which may end
+ // in an EHPad. If any of them do, a subsequent rewrite attempt by this
+ // pass would try to insert instructions into an EHPad, hitting an
+ // assertion.
+ if (isa<PHINode>(UserInst)) {
+ const auto *PhiNode = cast<PHINode>(UserInst);
+ bool HasIncompatibleEHPTerminatedBlock = false;
+ llvm::Value *ExpectedValue = U;
+ for (unsigned int I = 0; I < PhiNode->getNumIncomingValues(); I++) {
+ if (PhiNode->getIncomingValue(I) == ExpectedValue) {
+ if (PhiNode->getIncomingBlock(I)->getTerminator()->isEHPad()) {
+ HasIncompatibleEHPTerminatedBlock = true;
+ break;
+ }
+ }
+ }
+ if (HasIncompatibleEHPTerminatedBlock) {
+ continue;
+ }
+ }
+
// Don't bother rewriting PHIs in catchswitch blocks.
if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
continue;
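
As an aside, the added guard can be read as an existence check over the PHI's incoming (block, value) pairs. Below is a sketch of an equivalent formulation using llvm::any_of and llvm::zip; it assumes the same PhiNode and ExpectedValue as the hunk above and is illustrative rather than part of the patch:

// Equivalent form of the EHPad guard (sketch only, not self-contained).
bool FeedsEHPadBlock = llvm::any_of(
    llvm::zip(PhiNode->blocks(), PhiNode->incoming_values()),
    [&](const auto &BlockAndValue) {
      return std::get<1>(BlockAndValue) == ExpectedValue &&
             std::get<0>(BlockAndValue)->getTerminator()->isEHPad();
    });
// FeedsEHPadBlock plays the role of HasIncompatibleEHPTerminatedBlock above.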
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 893928fb0560..022d9c7abc8c 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1142,7 +1142,7 @@ static LoopUnrollResult tryToUnrollLoop(
// automatic unrolling from interfering with the user requested
// transformation.
Loop *ParentL = L->getParentLoop();
- if (ParentL != NULL &&
+ if (ParentL != nullptr &&
hasUnrollAndJamTransformation(ParentL) == TM_ForcedByUser &&
hasUnrollTransformation(L) != TM_ForcedByUser) {
LLVM_DEBUG(dbgs() << "Not unrolling loop since parent loop has"
diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 1c186e9a0488..a7eb60b5e032 100644
--- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -64,7 +64,7 @@ getBranchWeight(Intrinsic::ID IntrinsicID, CallInst *CI, int BranchCount) {
// __builtin_expect_with_probability
assert(CI->getNumOperands() >= 3 &&
"expect with probability must have 3 arguments");
- ConstantFP *Confidence = dyn_cast<ConstantFP>(CI->getArgOperand(2));
+ auto *Confidence = cast<ConstantFP>(CI->getArgOperand(2));
double TrueProb = Confidence->getValueAPF().convertToDouble();
assert((TrueProb >= 0.0 && TrueProb <= 1.0) &&
"probability value must be in the range [0.0, 1.0]");
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 4e4097e13271..8f1d0181ee5b 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -220,9 +220,7 @@ class LowerMatrixIntrinsics {
bool IsColumnMajor = true;
public:
- MatrixTy()
- : Vectors(),
- IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
+ MatrixTy() : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
MatrixTy(ArrayRef<Value *> Vectors)
: Vectors(Vectors.begin(), Vectors.end()),
IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
@@ -1393,7 +1391,8 @@ public:
// reloads necessary.
unsigned Op0Regs = (R + VF - 1) / VF * M;
unsigned Op1Regs = (M + VF - 1) / VF * C;
- return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true);
+ return Op0Regs + Op1Regs >
+ TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true));
}
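
The heuristic totals how many VF-wide vector registers the two operands of an R x M by M x C multiply occupy, and now asks the target for the size of the relevant register class instead of passing a bare boolean. A worked example with assumed shapes (all numbers below are made up):

// Worked example of the operand register estimate (assumed shapes).
constexpr unsigned R = 6, M = 4, C = 2, VF = 4;
constexpr unsigned Op0Regs = (R + VF - 1) / VF * M;  // ceil(6/4) * 4 = 8 registers
constexpr unsigned Op1Regs = (M + VF - 1) / VF * C;  // ceil(4/4) * 2 = 2 registers
static_assert(Op0Regs + Op1Regs == 10, "operands occupy 10 vector registers");
// The comparison above then checks this total against the number of registers
// in the class returned by TTI.getRegisterClassForType(true).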
MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
@@ -1832,7 +1831,7 @@ public:
const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
const SmallSetVector<Value *, 32> &ExprsInSubprogram,
Value *Leaf)
- : Str(), Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),
+ : Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),
ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}
void indent(unsigned N) {
@@ -1895,7 +1894,7 @@ public:
write(Name);
return;
}
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ auto *II = cast<IntrinsicInst>(CI);
write(Intrinsic::getBaseName(II->getIntrinsicID())
.drop_front(StringRef("llvm.matrix.").size()));
write(".");
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 67335a45fb58..6698db26626b 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -171,7 +172,7 @@ public:
bool empty() const { return Ranges.empty(); }
void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
addStore(OffsetFromFirst, SI);
else
addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
@@ -312,15 +313,21 @@ INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
Instruction *End) {
assert(Start->getParent() == End->getParent() && "Must be in same block");
- if (!Start->getFunction()->doesNotThrow() &&
- !isa<AllocaInst>(getUnderlyingObject(V))) {
- for (const Instruction &I :
- make_range(Start->getIterator(), End->getIterator())) {
- if (I.mayThrow())
- return true;
- }
- }
- return false;
+ // Function can't unwind, so it also can't be visible through unwinding.
+ if (Start->getFunction()->doesNotThrow())
+ return false;
+
+ // Object is not visible on unwind.
+ // TODO: Support RequiresNoCaptureBeforeUnwind case.
+ bool RequiresNoCaptureBeforeUnwind;
+ if (isNotVisibleOnUnwind(getUnderlyingObject(V),
+ RequiresNoCaptureBeforeUnwind) &&
+ !RequiresNoCaptureBeforeUnwind)
+ return false;
+
+ // Check whether there are any unwinding instructions in the range.
+ return any_of(make_range(Start->getIterator(), End->getIterator()),
+ [](const Instruction &I) { return I.mayThrow(); });
}
void MemCpyOptPass::eraseInstruction(Instruction *I) {
@@ -364,7 +371,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
const DataLayout &DL = StartInst->getModule()->getDataLayout();
// We can't track scalable types
- if (StoreInst *SI = dyn_cast<StoreInst>(StartInst))
+ if (auto *SI = dyn_cast<StoreInst>(StartInst))
if (DL.getTypeStoreSize(SI->getOperand(0)->getType()).isScalable())
return nullptr;
@@ -410,7 +417,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
continue;
}
- if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+ if (auto *NextStore = dyn_cast<StoreInst>(BI)) {
// If this is a store, see if we can merge it in.
if (!NextStore->isSimple()) break;
@@ -440,7 +447,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
Ranges.addStore(*Offset, NextStore);
} else {
- MemSetInst *MSI = cast<MemSetInst>(BI);
+ auto *MSI = cast<MemSetInst>(BI);
if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
!isa<ConstantInt>(MSI->getLength()))
@@ -661,7 +668,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
return false;
// Load to store forwarding can be interpreted as memcpy.
- if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
+ if (auto *LI = dyn_cast<LoadInst>(StoredVal)) {
if (LI->isSimple() && LI->hasOneUse() &&
LI->getParent() == SI->getParent()) {
@@ -871,7 +878,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
return false;
// Require that src be an alloca. This simplifies the reasoning considerably.
- AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
+ auto *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
if (!srcAlloca)
return false;
@@ -890,8 +897,10 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
// trap. Otherwise the transform is invalid since it might cause a trap
// to occur earlier than it otherwise would.
if (!isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpySize),
- DL, C, DT))
+ DL, C, DT)) {
+ LLVM_DEBUG(dbgs() << "Call Slot: Dest pointer not dereferenceable\n");
return false;
+ }
// Make sure that nothing can observe cpyDest being written early. There are
// a number of cases to consider:
@@ -907,8 +916,10 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
// guaranteed to be executed if C is. As it is a non-atomic access, it
// renders accesses from other threads undefined.
// TODO: This is currently not checked.
- if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore))
+ if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore)) {
+ LLVM_DEBUG(dbgs() << "Call Slot: Dest may be visible through unwinding\n");
return false;
+ }
// Check that dest points to memory that is at least as aligned as src.
Align srcAlign = srcAlloca->getAlign();
@@ -930,14 +941,14 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
append_range(srcUseList, U->users());
continue;
}
- if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
+ if (const auto *G = dyn_cast<GetElementPtrInst>(U)) {
if (!G->hasAllZeroIndices())
return false;
append_range(srcUseList, U->users());
continue;
}
- if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
+ if (const auto *IT = dyn_cast<IntrinsicInst>(U))
if (IT->isLifetimeStartOrEnd())
continue;
@@ -945,12 +956,57 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
return false;
}
- // Check that src isn't captured by the called function since the
- // transformation can cause aliasing issues in that case.
- for (unsigned ArgI = 0, E = C->arg_size(); ArgI != E; ++ArgI)
- if (C->getArgOperand(ArgI) == cpySrc && !C->doesNotCapture(ArgI))
+ // Check whether src is captured by the called function, in which case there
+ // may be further indirect uses of src.
+ bool SrcIsCaptured = any_of(C->args(), [&](Use &U) {
+ return U->stripPointerCasts() == cpySrc &&
+ !C->doesNotCapture(C->getArgOperandNo(&U));
+ });
+
+ // If src is captured, then check whether there are any potential uses of
+ // src through the captured pointer before the lifetime of src ends, either
+ // due to a lifetime.end or a return from the function.
+ if (SrcIsCaptured) {
+ // Check that dest is not captured before/at the call. We have already
+ // checked that src is not captured before it. If either had been captured,
+ // then the call might be comparing the argument against the captured dest
+ // or src pointer.
+ Value *DestObj = getUnderlyingObject(cpyDest);
+ if (!isIdentifiedFunctionLocal(DestObj) ||
+ PointerMayBeCapturedBefore(DestObj, /* ReturnCaptures */ true,
+ /* StoreCaptures */ true, C, DT,
+ /* IncludeI */ true))
return false;
+ MemoryLocation SrcLoc =
+ MemoryLocation(srcAlloca, LocationSize::precise(srcSize));
+ for (Instruction &I :
+ make_range(++C->getIterator(), C->getParent()->end())) {
+ // Lifetime of srcAlloca ends at lifetime.end.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_end &&
+ II->getArgOperand(1)->stripPointerCasts() == srcAlloca &&
+ cast<ConstantInt>(II->getArgOperand(0))->uge(srcSize))
+ break;
+ }
+
+ // Lifetime of srcAlloca ends at return.
+ if (isa<ReturnInst>(&I))
+ break;
+
+ // Ignore the direct read of src in the load.
+ if (&I == cpyLoad)
+ continue;
+
+ // Check whether this instruction may mod/ref src through the captured
+ // pointer (we have already ruled out any direct mod/refs in the loop above).
+ // Also bail if we hit a terminator, as we don't want to scan into other
+ // blocks.
+ if (isModOrRefSet(AA->getModRefInfo(&I, SrcLoc)) || I.isTerminator())
+ return false;
+ }
+ }
+
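
For intuition, this is the kind of hazard the new scan rules out, shown as a made-up C++ source shape (struct S, Stash, Callee and Caller are all hypothetical): the callee captures src, and the captured pointer is used again before src's lifetime ends, so rewriting the call to write into dest directly would change what that later use observes.

// Hypothetical capture that must block the call-slot optimization.
struct S { int X[4]; };
S *Stash;                          // made-up global
void Callee(S *P) { Stash = P; }   // C: captures its argument
void Caller(S &Dest) {
  S Src{};                         // srcAlloca
  Callee(&Src);                    // Src escapes into Stash here
  Dest = Src;                      // the cpyLoad/cpyStore pair
  Stash->X[0] = 1;                 // later use through the captured pointer;
}                                  // after the rewrite it would hit Dest instead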
// Since we're changing the parameter to the callsite, we need to make sure
// that what would be the new parameter dominates the callsite.
if (!DT->dominates(cpyDest, C)) {
@@ -1018,6 +1074,8 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
LLVMContext::MD_invariant_group,
LLVMContext::MD_access_group};
combineMetadata(C, cpyLoad, KnownIDs, true);
+ if (cpyLoad != cpyStore)
+ combineMetadata(C, cpyStore, KnownIDs, true);
++NumCallSlot;
return true;
@@ -1043,8 +1101,8 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
// Second, the length of the memcpy's must be the same, or the preceding one
// must be larger than the following one.
if (MDep->getLength() != M->getLength()) {
- ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
- ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
+ auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+ auto *MLen = dyn_cast<ConstantInt>(M->getLength());
if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
return false;
}
@@ -1163,7 +1221,7 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
const unsigned DestAlign =
std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment());
if (DestAlign > 1)
- if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
+ if (auto *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
IRBuilder<> Builder(MemCpy);
@@ -1211,12 +1269,11 @@ static bool hasUndefContents(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
if (MSSA->isLiveOnEntryDef(Def))
return isa<AllocaInst>(getUnderlyingObject(V));
- if (IntrinsicInst *II =
- dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
+ if (auto *II = dyn_cast_or_null<IntrinsicInst>(Def->getMemoryInst())) {
if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
- ConstantInt *LTSize = cast<ConstantInt>(II->getArgOperand(0));
+ auto *LTSize = cast<ConstantInt>(II->getArgOperand(0));
- if (ConstantInt *CSize = dyn_cast<ConstantInt>(Size)) {
+ if (auto *CSize = dyn_cast<ConstantInt>(Size)) {
if (AA->isMustAlias(V, II->getArgOperand(1)) &&
LTSize->getZExtValue() >= CSize->getZExtValue())
return true;
@@ -1226,12 +1283,14 @@ static bool hasUndefContents(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
// does) and we're querying a pointer based on that alloca, then we know
// the memory is definitely undef, regardless of how exactly we alias.
// The size also doesn't matter, as an out-of-bounds access would be UB.
- AllocaInst *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V));
- if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
- const DataLayout &DL = Alloca->getModule()->getDataLayout();
- if (Optional<TypeSize> AllocaSize = Alloca->getAllocationSizeInBits(DL))
- if (*AllocaSize == LTSize->getValue() * 8)
- return true;
+ if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(V))) {
+ if (getUnderlyingObject(II->getArgOperand(1)) == Alloca) {
+ const DataLayout &DL = Alloca->getModule()->getDataLayout();
+ if (Optional<TypeSize> AllocaSize =
+ Alloca->getAllocationSizeInBits(DL))
+ if (*AllocaSize == LTSize->getValue() * 8)
+ return true;
+ }
}
}
}
@@ -1266,12 +1325,12 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
// Don't worry about sizes larger than i64.
// A known memset size is required.
- ConstantInt *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize);
+ auto *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize);
if (!CMemSetSize)
return false;
// A known memcpy size is also required.
- ConstantInt *CCopySize = dyn_cast<ConstantInt>(CopySize);
+ auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
if (!CCopySize)
return false;
if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) {
@@ -1323,7 +1382,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
}
// If copying from a constant, try to turn the memcpy into a memset.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
+ if (auto *GV = dyn_cast<GlobalVariable>(M->getSource()))
if (GV->isConstant() && GV->hasDefinitiveInitializer())
if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
M->getModule()->getDataLayout())) {
@@ -1370,7 +1429,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
// d) memcpy from a just-memset'd source can be turned into memset.
if (auto *MD = dyn_cast<MemoryDef>(SrcClobber)) {
if (Instruction *MI = MD->getMemoryInst()) {
- if (ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
+ if (auto *CopySize = dyn_cast<ConstantInt>(M->getLength())) {
if (auto *C = dyn_cast<CallInst>(MI)) {
// The memcpy must post-dom the call. Limit to the same block for
// now. Additionally, we need to ensure that there are no accesses
@@ -1469,7 +1528,7 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
return false;
// The length of the memcpy must be larger or equal to the size of the byval.
- ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
+ auto *C1 = dyn_cast<ConstantInt>(MDep->getLength());
if (!C1 || !TypeSize::isKnownGE(
TypeSize::getFixed(C1->getValue().getZExtValue()), ByValSize))
return false;
@@ -1540,13 +1599,13 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
bool RepeatInstruction = false;
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ if (auto *SI = dyn_cast<StoreInst>(I))
MadeChange |= processStore(SI, BI);
- else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
+ else if (auto *M = dyn_cast<MemSetInst>(I))
RepeatInstruction = processMemSet(M, BI);
- else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
+ else if (auto *M = dyn_cast<MemCpyInst>(I))
RepeatInstruction = processMemCpy(M, BI);
- else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
+ else if (auto *M = dyn_cast<MemMoveInst>(I))
RepeatInstruction = processMemMove(M);
else if (auto *CB = dyn_cast<CallBase>(I)) {
for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 10a8742940b1..2476e6c408b1 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -1198,9 +1198,10 @@ NewGVN::ExprResult NewGVN::createExpression(Instruction *I) const {
if (auto Simplified = checkExprResults(E, I, V))
return Simplified;
} else if (auto *GEPI = dyn_cast<GetElementPtrInst>(I)) {
- Value *V = SimplifyGEPInst(GEPI->getSourceElementType(),
- ArrayRef<Value *>(E->op_begin(), E->op_end()),
- GEPI->isInBounds(), SQ);
+ Value *V =
+ SimplifyGEPInst(GEPI->getSourceElementType(), *E->op_begin(),
+ makeArrayRef(std::next(E->op_begin()), E->op_end()),
+ GEPI->isInBounds(), SQ);
if (auto Simplified = checkExprResults(E, I, V))
return Simplified;
} else if (AllConstant) {
@@ -1322,11 +1323,11 @@ bool NewGVN::someEquivalentDominates(const Instruction *Inst,
Value *NewGVN::lookupOperandLeader(Value *V) const {
CongruenceClass *CC = ValueToClass.lookup(V);
if (CC) {
- // Everything in TOP is represented by undef, as it can be any value.
+ // Everything in TOP is represented by poison, as it can be any value.
// We do have to make sure we get the type right though, so we can't set the
- // RepLeader to undef.
+ // RepLeader to poison.
if (CC == TOPClass)
- return UndefValue::get(V->getType());
+ return PoisonValue::get(V->getType());
return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
}
@@ -1493,8 +1494,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
// undef value. This can happen when loading for a fresh allocation with no
// intervening stores, for example. Note that this is only true in the case
// that the result of the allocation is pointer equal to the load ptr.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
- isAlignedAllocLikeFn(DepInst, TLI)) {
+ if (isa<AllocaInst>(DepInst)) {
return createConstantExpression(UndefValue::get(LoadType));
}
// If this load occurs either right after a lifetime begin,
@@ -1502,12 +1502,10 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
if (II->getIntrinsicID() == Intrinsic::lifetime_start)
return createConstantExpression(UndefValue::get(LoadType));
- }
- // If this load follows a calloc (which zero initializes memory),
- // then the loaded value is zero
- else if (isCallocLikeFn(DepInst, TLI)) {
- return createConstantExpression(Constant::getNullValue(LoadType));
- }
+ } else if (isAllocationFn(DepInst, TLI))
+ if (auto *InitVal = getInitialValueOfAllocation(cast<CallBase>(DepInst),
+ TLI, LoadType))
+ return createConstantExpression(InitVal);
return nullptr;
}
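
The removed special cases are folded into a single query of getInitialValueOfAllocation(). Judging by the code it replaces here and above, the helper reports a zero value for calloc-style allocations, an undef value for uninitialized ones such as malloc, and nothing otherwise; the snippet below restates the new call with that reading spelled out in comments:

// Sketch, with behaviour inferred from the code this hunk replaces:
//   calloc-like allocation -> Constant::getNullValue(LoadType)  (zeroed memory)
//   malloc-like allocation -> UndefValue::get(LoadType)         (uninitialized)
//   anything else          -> nullptr, so no constant expression is created
if (auto *InitVal = getInitialValueOfAllocation(cast<CallBase>(DepInst),
                                                TLI, LoadType))
  return createConstantExpression(InitVal);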
@@ -1521,9 +1519,9 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
return nullptr;
Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
- // Load of undef is undef.
+ // Load of undef is UB.
if (isa<UndefValue>(LoadAddressLeader))
- return createConstantExpression(UndefValue::get(LI->getType()));
+ return createConstantExpression(PoisonValue::get(LI->getType()));
MemoryAccess *OriginalAccess = getMemoryAccess(I);
MemoryAccess *DefiningAccess =
MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
@@ -1531,9 +1529,9 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
Instruction *DefiningInst = MD->getMemoryInst();
- // If the defining instruction is not reachable, replace with undef.
+ // If the defining instruction is not reachable, replace with poison.
if (!ReachableBlocks.count(DefiningInst->getParent()))
- return createConstantExpression(UndefValue::get(LI->getType()));
+ return createConstantExpression(PoisonValue::get(LI->getType()));
// This will handle stores and memory insts. We only do it if the
// defining access has a different type, or it is a pointer produced by
// certain memory operations that cause the memory to have a fixed value
@@ -1722,8 +1720,12 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
// We match the semantics of SimplifyPhiNode from InstructionSimplify here.
// See if all arguments are the same.
// We track if any were undef because they need special handling.
- bool HasUndef = false;
+ bool HasUndef = false, HasPoison = false;
auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
+ if (isa<PoisonValue>(Arg)) {
+ HasPoison = true;
+ return false;
+ }
if (isa<UndefValue>(Arg)) {
HasUndef = true;
return false;
@@ -1732,8 +1734,14 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
});
// If we are left with no operands, it's dead.
if (Filtered.empty()) {
- // If it has undef at this point, it means there are no-non-undef arguments,
- // and thus, the value of the phi node must be undef.
+ // If it has undef or poison at this point, it means there are no non-undef,
+ // non-poison arguments, and thus the value of the phi node must be undef/poison.
+ if (HasPoison && !HasUndef) {
+ LLVM_DEBUG(
+ dbgs() << "PHI Node " << *I
+ << " has no non-poison arguments, valuing it as poison\n");
+ return createConstantExpression(PoisonValue::get(I->getType()));
+ }
if (HasUndef) {
LLVM_DEBUG(
dbgs() << "PHI Node " << *I
@@ -1758,7 +1766,7 @@ NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
// expression to say if one is equivalent to the other.
// We also special case undef, so that if we have an undef, we can't use the
// common value unless it dominates the phi block.
- if (HasUndef) {
+ if (HasPoison || HasUndef) {
// If we have undef and at least one other value, this is really a
// multivalued phi, and we need to know if it's cycle free in order to
// evaluate whether we can ignore the undef. The other parts of this are
@@ -2579,6 +2587,15 @@ bool NewGVN::OpIsSafeForPHIOfOpsHelper(
}
auto *OrigI = cast<Instruction>(V);
+ // When we hit an instruction that reads memory (load, call, etc), we must
+ // consider any store that may happen in the loop. For now, we assume the
+ // worst: there is a store in the loop that aliases with this read.
+ // The case where the load is outside the loop is already covered by the
+ // dominator check above.
+ // TODO: relax this condition
+ if (OrigI->mayReadFromMemory())
+ return false;
+
for (auto *Op : OrigI->operand_values()) {
if (!isa<Instruction>(Op))
continue;
@@ -2780,7 +2797,7 @@ NewGVN::makePossiblePHIOfOps(Instruction *I,
LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
<< getBlockName(PredBB)
<< " because the block is unreachable\n");
- FoundVal = UndefValue::get(I->getType());
+ FoundVal = PoisonValue::get(I->getType());
RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
}
@@ -3459,7 +3476,7 @@ bool NewGVN::runGVN() {
// Delete all instructions marked for deletion.
for (Instruction *ToErase : InstructionsToErase) {
if (!ToErase->use_empty())
- ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
+ ToErase->replaceAllUsesWith(PoisonValue::get(ToErase->getType()));
assert(ToErase->getParent() &&
"BB containing ToErase deleted unexpectedly!");
@@ -3677,7 +3694,7 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
for (BasicBlock::reverse_iterator I(StartPoint); I != BB->rend();) {
Instruction &Inst = *I++;
if (!Inst.use_empty())
- Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
+ Inst.replaceAllUsesWith(PoisonValue::get(Inst.getType()));
if (isa<LandingPadInst>(Inst))
continue;
salvageKnowledge(&Inst, AC);
@@ -3687,7 +3704,7 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
}
// Now insert something that simplifycfg will turn into an unreachable.
Type *Int8Ty = Type::getInt8Ty(BB->getContext());
- new StoreInst(UndefValue::get(Int8Ty),
+ new StoreInst(PoisonValue::get(Int8Ty),
Constant::getNullValue(Int8Ty->getPointerTo()),
BB->getTerminator());
}
@@ -3827,8 +3844,8 @@ bool NewGVN::eliminateInstructions(Function &F) {
LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI
<< " for block "
<< getBlockName(PHI->getIncomingBlock(Operand))
- << " with undef due to it being unreachable\n");
- Operand.set(UndefValue::get(PHI->getType()));
+ << " with poison due to it being unreachable\n");
+ Operand.set(PoisonValue::get(PHI->getType()));
}
};
// Replace unreachable phi arguments.
@@ -4128,21 +4145,25 @@ bool NewGVN::eliminateInstructions(Function &F) {
unsigned int NewGVN::getRank(const Value *V) const {
// Prefer constants to undef to anything else
// Undef is a constant, have to check it first.
+ // Prefer poison to undef as it's less defined.
// Prefer smaller constants to constantexprs
+ // Note that the order here matters because of class inheritance
if (isa<ConstantExpr>(V))
- return 2;
- if (isa<UndefValue>(V))
+ return 3;
+ if (isa<PoisonValue>(V))
return 1;
+ if (isa<UndefValue>(V))
+ return 2;
if (isa<Constant>(V))
return 0;
- else if (auto *A = dyn_cast<Argument>(V))
- return 3 + A->getArgNo();
+ if (auto *A = dyn_cast<Argument>(V))
+ return 4 + A->getArgNo();
- // Need to shift the instruction DFS by number of arguments + 3 to account for
+ // Need to shift the instruction DFS by number of arguments + 5 to account for
// the constant and argument ranking above.
unsigned Result = InstrToDFSNum(V);
if (Result > 0)
- return 4 + NumFuncArgs + Result;
+ return 5 + NumFuncArgs + Result;
// Unreachable or something else, just return a really large number.
return ~0;
}
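
For reference, the resulting preference order can be read in one place; smaller rank wins when a congruence-class leader is chosen, and poison now sits between plain constants and undef. A condensed mirror of getRank() as a standalone sketch (DFSNum stands in for InstrToDFSNum, and the includes are only what the sketch itself needs):

#include "llvm/IR/Argument.h"
#include "llvm/IR/Constants.h"

// Condensed restatement of the new ranking (sketch, mirrors getRank above).
unsigned sketchRank(const llvm::Value *V, unsigned NumFuncArgs, unsigned DFSNum) {
  using namespace llvm;
  if (isa<ConstantExpr>(V)) return 3;   // checked first: it is also a Constant
  if (isa<PoisonValue>(V))  return 1;   // poison now beats undef
  if (isa<UndefValue>(V))   return 2;   // PoisonValue is also an UndefValue
  if (isa<Constant>(V))     return 0;   // any other constant wins outright
  if (auto *A = dyn_cast<Argument>(V)) return 4 + A->getArgNo();
  return 5 + NumFuncArgs + DFSNum;      // instructions, ordered by DFS number
}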
diff --git a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 44027ccd92ca..e0d0301c1ef6 100644
--- a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -82,6 +82,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
// Add attribute "readnone" so that backend can use a native sqrt instruction
// for this call.
+ Call->removeFnAttr(Attribute::WriteOnly);
Call->addFnAttr(Attribute::ReadNone);
// Insert a FP compare instruction and use it as the CurrBB branch condition.
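
The extra removeFnAttr reflects that library-call attribute inference (see the BuildLibCalls changes later in this diff) can now mark such functions writeonly, and writeonly contradicts the readnone being added here. A minimal restatement of the invariant, assuming Call is the sqrt CallInst handled above:

// Sketch of the invariant: readnone and writeonly must never coexist.
Call->removeFnAttr(Attribute::WriteOnly);   // drop any inferred writeonly first
Call->addFnAttr(Attribute::ReadNone);       // then mark the call readnone
assert(!(Call->hasFnAttr(Attribute::ReadNone) &&
         Call->hasFnAttr(Attribute::WriteOnly)) &&
       "readnone and writeonly are incompatible");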
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index e12eca0ed287..3da367341d2a 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1367,13 +1367,13 @@ static AttributeList legalizeCallAttributes(LLVMContext &Ctx,
return AL;
// Remove the readonly, readnone, and statepoint function attributes.
- AttrBuilder FnAttrs = AL.getFnAttrs();
+ AttrBuilder FnAttrs(Ctx, AL.getFnAttrs());
for (auto Attr : FnAttrsToStrip)
FnAttrs.removeAttribute(Attr);
for (Attribute A : AL.getFnAttrs()) {
if (isStatepointDirectiveAttr(A))
- FnAttrs.remove(A);
+ FnAttrs.removeAttribute(A);
}
// Just skip parameter and return attributes for now
@@ -2643,10 +2643,10 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
// List of all parameter and return attributes which must be stripped when
// lowering from the abstract machine model. Note that we list attributes
// here which aren't valid as return attributes, that is okay.
-static AttrBuilder getParamAndReturnAttributesToRemove() {
- AttrBuilder R;
- R.addDereferenceableAttr(1);
- R.addDereferenceableOrNullAttr(1);
+static AttributeMask getParamAndReturnAttributesToRemove() {
+ AttributeMask R;
+ R.addAttribute(Attribute::Dereferenceable);
+ R.addAttribute(Attribute::DereferenceableOrNull);
R.addAttribute(Attribute::ReadNone);
R.addAttribute(Attribute::ReadOnly);
R.addAttribute(Attribute::WriteOnly);
@@ -2668,7 +2668,7 @@ static void stripNonValidAttributesFromPrototype(Function &F) {
return;
}
- AttrBuilder R = getParamAndReturnAttributesToRemove();
+ AttributeMask R = getParamAndReturnAttributesToRemove();
for (Argument &A : F.args())
if (isa<PointerType>(A.getType()))
F.removeParamAttrs(A.getArgNo(), R);
@@ -2742,7 +2742,7 @@ static void stripNonValidDataFromBody(Function &F) {
stripInvalidMetadataFromInstruction(I);
- AttrBuilder R = getParamAndReturnAttributesToRemove();
+ AttributeMask R = getParamAndReturnAttributesToRemove();
if (auto *Call = dyn_cast<CallBase>(&I)) {
for (int i = 0, e = Call->arg_size(); i != e; i++)
if (isa<PointerType>(Call->getArgOperand(i)->getType()))
diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp
index ff2f8a25f379..c34da51e6dc1 100644
--- a/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -486,7 +486,7 @@ bool llvm::runIPSCCP(
// inaccessiblemem_or_argmemonly attributes do not hold any longer. Remove
// them from both the function and callsites.
if (ReplacedPointerArg) {
- AttrBuilder AttributesToRemove;
+ AttributeMask AttributesToRemove;
AttributesToRemove.addAttribute(Attribute::ArgMemOnly);
AttributesToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly);
F.removeFnAttrs(AttributesToRemove);
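
This mirrors the RewriteStatepointsForGC change above: an AttributeMask only names attribute kinds to strip, so it replaces AttrBuilder wherever no attribute values need to be constructed. A minimal usage sketch of the pattern (F is an assumed llvm::Function; not part of the patch):

// Sketch of the AttributeMask removal pattern now used by both passes.
AttributeMask ToRemove;
ToRemove.addAttribute(Attribute::ArgMemOnly);
ToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly);
F.removeFnAttrs(ToRemove);                   // strip from the function itself
for (User *U : F.users())
  if (auto *CB = dyn_cast<CallBase>(U))
    if (CB->getCalledFunction() == &F)
      CB->removeFnAttrs(ToRemove);           // and from its direct call sites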
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 31c8999c3724..35497ae5ed9a 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -323,7 +323,7 @@ private:
///
/// Note that these are not separated by slice. This is because we expect an
/// alloca to be completely rewritten or not rewritten at all. If rewritten,
- /// all these instructions can simply be removed and replaced with undef as
+ /// all these instructions can simply be removed and replaced with poison as
/// they come from outside of the allocated space.
SmallVector<Instruction *, 8> DeadUsers;
@@ -333,10 +333,10 @@ private:
/// Operands which will become dead if we rewrite the alloca.
///
/// These are operands that in their particular use can be replaced with
- /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
+ /// poison when we rewrite the alloca. These show up in out-of-bounds inputs
/// to PHI nodes and the like. They aren't entirely dead (there might be
/// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
- /// want to swap this particular input for undef to simplify the use lists of
+ /// want to swap this particular input for poison to simplify the use lists of
/// the alloca.
SmallVector<Use *, 8> DeadOperands;
};
@@ -1008,6 +1008,13 @@ private:
if (I.use_empty())
return markAsDead(I);
+ // If this is a PHI node before a catchswitch, we cannot insert any non-PHI
+ // instructions in this BB, which may be required during rewriting. Bail out
+ // on these cases.
+ if (isa<PHINode>(I) &&
+ I.getParent()->getFirstInsertionPt() == I.getParent()->end())
+ return PI.setAborted(&I);
+
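
The guard exists because a block whose terminator is a catchswitch consists only of PHIs plus the catchswitch, so there is nowhere to place the non-PHI instructions rewriting may need. A small sketch of the condition being tested, assuming a PHINode &I as in the visitor above:

// Sketch: a block that cannot host rewritten (non-PHI) instructions.
BasicBlock *BB = I.getParent();
bool NoInsertionPoint = BB->getFirstInsertionPt() == BB->end();
// NoInsertionPoint is true for catchswitch blocks; SROA then aborts slice
// analysis for this alloca rather than risk inserting instructions later.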
// TODO: We could use SimplifyInstruction here to fold PHINodes and
// SelectInsts. However, doing so requires to change the current
// dead-operand-tracking mechanism. For instance, suppose neither loading
@@ -1023,7 +1030,7 @@ private:
enqueueUsers(I);
else
// Otherwise the operand to the PHI/select is dead, and we can replace
- // it with undef.
+ // it with poison.
AS.DeadOperands.push_back(U);
return;
@@ -1043,7 +1050,7 @@ private:
// For PHI and select operands outside the alloca, we can't nuke the entire
// phi or select -- the other side might still be relevant, so we special
// case them here and use a separate structure to track the operands
- // themselves which should be replaced with undef.
+ // themselves which should be replaced with poison.
// FIXME: This should instead be escaped in the event we're instrumenting
// for address sanitization.
if (Offset.uge(AllocSize)) {
@@ -1264,14 +1271,14 @@ static bool isSafePHIToSpeculate(PHINode &PN) {
return true;
}
-static void speculatePHINodeLoads(PHINode &PN) {
+static void speculatePHINodeLoads(IRBuilderTy &IRB, PHINode &PN) {
LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
Type *LoadTy = SomeLoad->getType();
- IRBuilderTy PHIBuilder(&PN);
- PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
- PN.getName() + ".sroa.speculated");
+ IRB.SetInsertPoint(&PN);
+ PHINode *NewPN = IRB.CreatePHI(LoadTy, PN.getNumIncomingValues(),
+ PN.getName() + ".sroa.speculated");
// Get the AA tags and alignment to use from one of the loads. It does not
// matter which one we get and if any differ.
@@ -1301,9 +1308,9 @@ static void speculatePHINodeLoads(PHINode &PN) {
}
Instruction *TI = Pred->getTerminator();
- IRBuilderTy PredBuilder(TI);
+ IRB.SetInsertPoint(TI);
- LoadInst *Load = PredBuilder.CreateAlignedLoad(
+ LoadInst *Load = IRB.CreateAlignedLoad(
LoadTy, InVal, Alignment,
(PN.getName() + ".sroa.speculate.load." + Pred->getName()));
++NumLoadsSpeculated;
@@ -1361,10 +1368,10 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) {
return true;
}
-static void speculateSelectInstLoads(SelectInst &SI) {
+static void speculateSelectInstLoads(IRBuilderTy &IRB, SelectInst &SI) {
LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
- IRBuilderTy IRB(&SI);
+ IRB.SetInsertPoint(&SI);
Value *TV = SI.getTrueValue();
Value *FV = SI.getFalseValue();
// Replace the loads of the select with a select of two loads.
@@ -1430,8 +1437,10 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
return BasePtr;
- return IRB.CreateInBoundsGEP(BasePtr->getType()->getPointerElementType(),
- BasePtr, Indices, NamePrefix + "sroa_idx");
+ // buildGEP() is only called for non-opaque pointers.
+ return IRB.CreateInBoundsGEP(
+ BasePtr->getType()->getNonOpaquePointerElementType(), BasePtr, Indices,
+ NamePrefix + "sroa_idx");
}
/// Get a natural GEP off of the BasePtr walking through Ty toward
@@ -1504,7 +1513,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
return nullptr;
- Type *ElementTy = Ty->getElementType();
+ Type *ElementTy = Ty->getNonOpaquePointerElementType();
if (!ElementTy->isSized())
return nullptr; // We can't GEP through an unsized element.
@@ -1563,7 +1572,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
APInt Int8PtrOffset(Offset.getBitWidth(), 0);
PointerType *TargetPtrTy = cast<PointerType>(PointerTy);
- Type *TargetTy = TargetPtrTy->getElementType();
+ Type *TargetTy = TargetPtrTy->getNonOpaquePointerElementType();
// Since an `addrspacecast` may be involved, `Ptr` (the storage pointer) can have a different
// address space from the expected `PointerTy` (the pointer to be used).
@@ -2558,7 +2567,7 @@ private:
// the computed value, and then replace the placeholder with LI, leaving
// LI only used for this computation.
Value *Placeholder = new LoadInst(
- LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)), "",
+ LI.getType(), PoisonValue::get(LI.getType()->getPointerTo(AS)), "",
false, Align(1));
V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
"insert");
@@ -3223,8 +3232,11 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
/// Used to calculate offsets, and hence alignment, of subobjects.
const DataLayout &DL;
+ IRBuilderTy &IRB;
+
public:
- AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}
+ AggLoadStoreRewriter(const DataLayout &DL, IRBuilderTy &IRB)
+ : DL(DL), IRB(IRB) {}
/// Rewrite loads and stores through a pointer and all pointers derived from
/// it.
@@ -3255,7 +3267,7 @@ private:
template <typename Derived> class OpSplitter {
protected:
/// The builder used to form new instructions.
- IRBuilderTy IRB;
+ IRBuilderTy &IRB;
/// The indices which to be used with insert- or extractvalue to select the
/// appropriate value within the aggregate.
@@ -3282,9 +3294,11 @@ private:
/// Initialize the splitter with an insertion point, Ptr and start with a
/// single zero GEP index.
OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
- Align BaseAlign, const DataLayout &DL)
- : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr),
- BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {}
+ Align BaseAlign, const DataLayout &DL, IRBuilderTy &IRB)
+ : IRB(IRB), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr), BaseTy(BaseTy),
+ BaseAlign(BaseAlign), DL(DL) {
+ IRB.SetInsertPoint(InsertionPoint);
+ }
public:
/// Generic recursive split emission routine.
@@ -3345,9 +3359,10 @@ private:
AAMDNodes AATags;
LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
- AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
- : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
- DL),
+ AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
+ IRBuilderTy &IRB)
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign, DL,
+ IRB),
AATags(AATags) {}
/// Emit a leaf load of a single value. This is called at the leaves of the
@@ -3379,8 +3394,8 @@ private:
// We have an aggregate being loaded, split it apart.
LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
LoadOpSplitter Splitter(&LI, *U, LI.getType(), LI.getAAMetadata(),
- getAdjustedAlignment(&LI, 0), DL);
- Value *V = UndefValue::get(LI.getType());
+ getAdjustedAlignment(&LI, 0), DL, IRB);
+ Value *V = PoisonValue::get(LI.getType());
Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
Visited.erase(&LI);
LI.replaceAllUsesWith(V);
@@ -3390,9 +3405,10 @@ private:
struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
- AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
+ AAMDNodes AATags, Align BaseAlign, const DataLayout &DL,
+ IRBuilderTy &IRB)
: OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
- DL),
+ DL, IRB),
AATags(AATags) {}
AAMDNodes AATags;
/// Emit a leaf store of a single value. This is called at the leaves of the
@@ -3430,7 +3446,7 @@ private:
// We have an aggregate being stored, split it apart.
LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
StoreOpSplitter Splitter(&SI, *U, V->getType(), SI.getAAMetadata(),
- getAdjustedAlignment(&SI, 0), DL);
+ getAdjustedAlignment(&SI, 0), DL, IRB);
Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
Visited.erase(&SI);
SI.eraseFromParent();
@@ -3458,7 +3474,7 @@ private:
<< "\n original: " << *Sel
<< "\n " << GEPI);
- IRBuilderTy Builder(&GEPI);
+ IRB.SetInsertPoint(&GEPI);
SmallVector<Value *, 4> Index(GEPI.indices());
bool IsInBounds = GEPI.isInBounds();
@@ -3466,21 +3482,20 @@ private:
Value *True = Sel->getTrueValue();
Value *NTrue =
IsInBounds
- ? Builder.CreateInBoundsGEP(Ty, True, Index,
- True->getName() + ".sroa.gep")
- : Builder.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep");
+ ? IRB.CreateInBoundsGEP(Ty, True, Index,
+ True->getName() + ".sroa.gep")
+ : IRB.CreateGEP(Ty, True, Index, True->getName() + ".sroa.gep");
Value *False = Sel->getFalseValue();
Value *NFalse =
IsInBounds
- ? Builder.CreateInBoundsGEP(Ty, False, Index,
- False->getName() + ".sroa.gep")
- : Builder.CreateGEP(Ty, False, Index,
- False->getName() + ".sroa.gep");
+ ? IRB.CreateInBoundsGEP(Ty, False, Index,
+ False->getName() + ".sroa.gep")
+ : IRB.CreateGEP(Ty, False, Index, False->getName() + ".sroa.gep");
- Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse,
- Sel->getName() + ".sroa.sel");
+ Value *NSel = IRB.CreateSelect(Sel->getCondition(), NTrue, NFalse,
+ Sel->getName() + ".sroa.sel");
Visited.erase(&GEPI);
GEPI.replaceAllUsesWith(NSel);
GEPI.eraseFromParent();
@@ -3517,10 +3532,9 @@ private:
SmallVector<Value *, 4> Index(GEPI.indices());
bool IsInBounds = GEPI.isInBounds();
- IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
- PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
- PHI->getNumIncomingValues(),
- PHI->getName() + ".sroa.phi");
+ IRB.SetInsertPoint(GEPI.getParent()->getFirstNonPHI());
+ PHINode *NewPN = IRB.CreatePHI(GEPI.getType(), PHI->getNumIncomingValues(),
+ PHI->getName() + ".sroa.phi");
for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
BasicBlock *B = PHI->getIncomingBlock(I);
Value *NewVal = nullptr;
@@ -3530,11 +3544,12 @@ private:
} else {
Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
- IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
+ IRB.SetInsertPoint(In->getParent(), std::next(In->getIterator()));
Type *Ty = GEPI.getSourceElementType();
- NewVal = IsInBounds
- ? B.CreateInBoundsGEP(Ty, In, Index, In->getName() + ".sroa.gep")
- : B.CreateGEP(Ty, In, Index, In->getName() + ".sroa.gep");
+ NewVal = IsInBounds ? IRB.CreateInBoundsGEP(Ty, In, Index,
+ In->getName() + ".sroa.gep")
+ : IRB.CreateGEP(Ty, In, Index,
+ In->getName() + ".sroa.gep");
}
NewPN->addIncoming(NewVal, B);
}
@@ -4557,11 +4572,11 @@ bool SROAPass::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
return Changed;
}
-/// Clobber a use with undef, deleting the used value if it becomes dead.
+/// Clobber a use with poison, deleting the used value if it becomes dead.
void SROAPass::clobberUse(Use &U) {
Value *OldV = U;
- // Replace the use with an undef value.
- U = UndefValue::get(OldV->getType());
+ // Replace the use with a poison value.
+ U = PoisonValue::get(OldV->getType());
// Check for this making an instruction dead. We have to garbage collect
// all the dead instructions to ensure the uses of any alloca end up being
@@ -4598,7 +4613,8 @@ bool SROAPass::runOnAlloca(AllocaInst &AI) {
// First, split any FCA loads and stores touching this alloca to promote
// better splitting and promotion opportunities.
- AggLoadStoreRewriter AggRewriter(DL);
+ IRBuilderTy IRB(&AI);
+ AggLoadStoreRewriter AggRewriter(DL, IRB);
Changed |= AggRewriter.rewrite(AI);
// Build the slices using a recursive instruction-visiting builder.
@@ -4614,7 +4630,7 @@ bool SROAPass::runOnAlloca(AllocaInst &AI) {
clobberUse(DeadOp);
// Now replace the uses of this instruction.
- DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
+ DeadUser->replaceAllUsesWith(PoisonValue::get(DeadUser->getType()));
// And mark it for deletion.
DeadInsts.push_back(DeadUser);
@@ -4633,11 +4649,11 @@ bool SROAPass::runOnAlloca(AllocaInst &AI) {
LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
while (!SpeculatablePHIs.empty())
- speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
+ speculatePHINodeLoads(IRB, *SpeculatablePHIs.pop_back_val());
LLVM_DEBUG(dbgs() << " Speculating Selects\n");
while (!SpeculatableSelects.empty())
- speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
+ speculateSelectInstLoads(IRB, *SpeculatableSelects.pop_back_val());
return Changed;
}
diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
index 1284bae820a4..29cea42e4a00 100644
--- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp
@@ -959,7 +959,8 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
Type *LoadTy = CI->getType();
Align Alignment = DL.getValueOrABITypeAlignment(MA,
LoadTy->getScalarType());
- if (TTI.isLegalMaskedGather(LoadTy, Alignment))
+ if (TTI.isLegalMaskedGather(LoadTy, Alignment) &&
+ !TTI.forceScalarizeMaskedGather(cast<VectorType>(LoadTy), Alignment))
return false;
scalarizeMaskedGather(DL, CI, DTU, ModifiedDT);
return true;
@@ -970,7 +971,9 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
Type *StoreTy = CI->getArgOperand(0)->getType();
Align Alignment = DL.getValueOrABITypeAlignment(MA,
StoreTy->getScalarType());
- if (TTI.isLegalMaskedScatter(StoreTy, Alignment))
+ if (TTI.isLegalMaskedScatter(StoreTy, Alignment) &&
+ !TTI.forceScalarizeMaskedScatter(cast<VectorType>(StoreTy),
+ Alignment))
return false;
scalarizeMaskedScatter(DL, CI, DTU, ModifiedDT);
return true;
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index 6b7419abe1d1..3606c8a4b073 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -270,7 +270,7 @@ Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
Type *Ty = V->getType();
PtrTy = dyn_cast<PointerType>(Ty);
if (PtrTy)
- Ty = PtrTy->getElementType();
+ Ty = PtrTy->getPointerElementType();
Size = cast<FixedVectorType>(Ty)->getNumElements();
if (!CachePtr)
Tmp.resize(Size, nullptr);
@@ -288,7 +288,8 @@ Value *Scatterer::operator[](unsigned I) {
return CV[I];
IRBuilder<> Builder(BB, BBI);
if (PtrTy) {
- Type *ElTy = cast<VectorType>(PtrTy->getElementType())->getElementType();
+ Type *ElTy =
+ cast<VectorType>(PtrTy->getPointerElementType())->getElementType();
if (!CV[0]) {
Type *NewPtrTy = PointerType::get(ElTy, PtrTy->getAddressSpace());
CV[0] = Builder.CreateBitCast(V, NewPtrTy, V->getName() + ".i0");
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 3799d2dd1cf2..ee17da1875e5 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -78,6 +78,79 @@ static cl::opt<bool> UserSinkCommonInsts(
STATISTIC(NumSimpl, "Number of blocks simplified");
+static bool
+performBlockTailMerging(Function &F, ArrayRef<BasicBlock *> BBs,
+ std::vector<DominatorTree::UpdateType> *Updates) {
+ SmallVector<PHINode *, 1> NewOps;
+
+ // We don't want to change IR just because we can.
+ // Only do that if there are at least two blocks we'll tail-merge.
+ if (BBs.size() < 2)
+ return false;
+
+ if (Updates)
+ Updates->reserve(Updates->size() + BBs.size());
+
+ BasicBlock *CanonicalBB;
+ Instruction *CanonicalTerm;
+ {
+ auto *Term = BBs[0]->getTerminator();
+
+ // Create a canonical block for this function terminator type now,
+ // placing it *before* the first block that will branch to it.
+ CanonicalBB = BasicBlock::Create(
+ F.getContext(), Twine("common.") + Term->getOpcodeName(), &F, BBs[0]);
+ // We'll also need a PHI node per each operand of the terminator.
+ NewOps.resize(Term->getNumOperands());
+ for (auto I : zip(Term->operands(), NewOps)) {
+ std::get<1>(I) = PHINode::Create(std::get<0>(I)->getType(),
+ /*NumReservedValues=*/BBs.size(),
+ CanonicalBB->getName() + ".op");
+ CanonicalBB->getInstList().push_back(std::get<1>(I));
+ }
+ // Make it so that this canonical block actually has the right
+ // terminator.
+ CanonicalTerm = Term->clone();
+ CanonicalBB->getInstList().push_back(CanonicalTerm);
+ // If the canonical terminator has operands, rewrite it to take PHI's.
+ for (auto I : zip(NewOps, CanonicalTerm->operands()))
+ std::get<1>(I) = std::get<0>(I);
+ }
+
+ // Now, go through each block (with the current terminator type)
+ // we've recorded, and rewrite it to branch to the new common block.
+ const DILocation *CommonDebugLoc = nullptr;
+ for (BasicBlock *BB : BBs) {
+ auto *Term = BB->getTerminator();
+ assert(Term->getOpcode() == CanonicalTerm->getOpcode() &&
+ "All blocks to be tail-merged must be the same "
+ "(function-terminating) terminator type.");
+
+ // Aha, found a new non-canonical function terminator. If it has operands,
+ // forward them to the PHI nodes in the canonical block.
+ for (auto I : zip(Term->operands(), NewOps))
+ std::get<1>(I)->addIncoming(std::get<0>(I), BB);
+
+ // Compute the debug location common to all the original terminators.
+ if (!CommonDebugLoc)
+ CommonDebugLoc = Term->getDebugLoc();
+ else
+ CommonDebugLoc =
+ DILocation::getMergedLocation(CommonDebugLoc, Term->getDebugLoc());
+
+ // And turn BB into a block that just unconditionally branches
+ // to the canonical block.
+ Term->eraseFromParent();
+ BranchInst::Create(CanonicalBB, BB);
+ if (Updates)
+ Updates->push_back({DominatorTree::Insert, BB, CanonicalBB});
+ }
+
+ CanonicalTerm->setDebugLoc(CommonDebugLoc);
+
+ return true;
+}
+
static bool tailMergeBlocksWithSimilarFunctionTerminators(Function &F,
DomTreeUpdater *DTU) {
SmallMapVector<unsigned /*TerminatorOpcode*/, SmallVector<BasicBlock *, 2>, 4>
@@ -133,73 +206,8 @@ static bool tailMergeBlocksWithSimilarFunctionTerminators(Function &F,
std::vector<DominatorTree::UpdateType> Updates;
- for (ArrayRef<BasicBlock *> BBs : make_second_range(Structure)) {
- SmallVector<PHINode *, 1> NewOps;
-
- // We don't want to change IR just because we can.
- // Only do that if there are at least two blocks we'll tail-merge.
- if (BBs.size() < 2)
- continue;
-
- Changed = true;
-
- if (DTU)
- Updates.reserve(Updates.size() + BBs.size());
-
- BasicBlock *CanonicalBB;
- Instruction *CanonicalTerm;
- {
- auto *Term = BBs[0]->getTerminator();
-
- // Create a canonical block for this function terminator type now,
- // placing it *before* the first block that will branch to it.
- CanonicalBB = BasicBlock::Create(
- F.getContext(), Twine("common.") + Term->getOpcodeName(), &F, BBs[0]);
- // We'll also need a PHI node per each operand of the terminator.
- NewOps.resize(Term->getNumOperands());
- for (auto I : zip(Term->operands(), NewOps)) {
- std::get<1>(I) = PHINode::Create(std::get<0>(I)->getType(),
- /*NumReservedValues=*/BBs.size(),
- CanonicalBB->getName() + ".op");
- CanonicalBB->getInstList().push_back(std::get<1>(I));
- }
- // Make it so that this canonical block actually has the right
- // terminator.
- CanonicalTerm = Term->clone();
- CanonicalBB->getInstList().push_back(CanonicalTerm);
- // If the canonical terminator has operands, rewrite it to take PHI's.
- for (auto I : zip(NewOps, CanonicalTerm->operands()))
- std::get<1>(I) = std::get<0>(I);
- }
-
- // Now, go through each block (with the current terminator type)
- // we've recorded, and rewrite it to branch to the new common block.
- const DILocation *CommonDebugLoc = nullptr;
- for (BasicBlock *BB : BBs) {
- auto *Term = BB->getTerminator();
-
- // Aha, found a new non-canonical function terminator. If it has operands,
- // forward them to the PHI nodes in the canonical block.
- for (auto I : zip(Term->operands(), NewOps))
- std::get<1>(I)->addIncoming(std::get<0>(I), BB);
-
- // Compute the debug location common to all the original terminators.
- if (!CommonDebugLoc)
- CommonDebugLoc = Term->getDebugLoc();
- else
- CommonDebugLoc =
- DILocation::getMergedLocation(CommonDebugLoc, Term->getDebugLoc());
-
- // And turn BB into a block that just unconditionally branches
- // to the canonical block.
- Term->eraseFromParent();
- BranchInst::Create(CanonicalBB, BB);
- if (DTU)
- Updates.push_back({DominatorTree::Insert, BB, CanonicalBB});
- }
-
- CanonicalTerm->setDebugLoc(CommonDebugLoc);
- }
+ for (ArrayRef<BasicBlock *> BBs : make_second_range(Structure))
+ Changed |= performBlockTailMerging(F, BBs, DTU ? &Updates : nullptr);
if (DTU)
DTU->applyUpdates(Updates);
@@ -313,7 +321,7 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) {
Options.SinkCommonInsts = UserSinkCommonInsts;
}
-SimplifyCFGPass::SimplifyCFGPass() : Options() {
+SimplifyCFGPass::SimplifyCFGPass() {
applyCommandLineOverridesToOptions(Options);
}
diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
index fdc914a72bfd..c734611836eb 100644
--- a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
+++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
@@ -22,19 +22,6 @@ using namespace llvm;
#define DEBUG_TYPE "amdgpu-emit-printf"
-static bool isCString(const Value *Arg) {
- auto Ty = Arg->getType();
- auto PtrTy = dyn_cast<PointerType>(Ty);
- if (!PtrTy)
- return false;
-
- auto IntTy = dyn_cast<IntegerType>(PtrTy->getElementType());
- if (!IntTy)
- return false;
-
- return IntTy->getBitWidth() == 8;
-}
-
static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) {
auto Int64Ty = Builder.getInt64Ty();
auto Ty = Arg->getType();
@@ -176,13 +163,15 @@ static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str,
static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg,
bool IsLast) {
+ Arg = Builder.CreateBitCast(
+ Arg, Builder.getInt8PtrTy(Arg->getType()->getPointerAddressSpace()));
auto Length = getStrlenWithNull(Builder, Arg);
return callAppendStringN(Builder, Desc, Arg, Length, IsLast);
}
static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
bool SpecIsCString, bool IsLast) {
- if (SpecIsCString && isCString(Arg)) {
+ if (SpecIsCString && isa<PointerType>(Arg->getType())) {
return appendString(Builder, Desc, Arg, IsLast);
}
// If the format specifies a string but the argument is not, the frontend will
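
Editor's note: `appendString` now normalizes the argument to an `i8*` in the argument's own address space before computing the length, which makes the removed `isCString` element-type check unnecessary (and compatible with opaque pointers). A hedged stand-alone sketch of the same normalization pattern; `asBytePointer` is an illustrative name.

```cpp
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Normalize an arbitrary pointer argument to i8* in its own address space
// before handing it to a byte-oriented helper (e.g. the strlen lowering).
static Value *asBytePointer(IRBuilder<> &Builder, Value *Arg) {
  auto *PtrTy = dyn_cast<PointerType>(Arg->getType());
  if (!PtrTy)
    return nullptr; // not a pointer; the caller takes the non-string path
  return Builder.CreateBitCast(Arg,
                               Builder.getInt8PtrTy(PtrTy->getAddressSpace()));
}
```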
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index 580cfd80141e..97f11ca71726 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -34,6 +34,7 @@ STATISTIC(NumReadNone, "Number of functions inferred as readnone");
STATISTIC(NumInaccessibleMemOnly,
"Number of functions inferred as inaccessiblememonly");
STATISTIC(NumReadOnly, "Number of functions inferred as readonly");
+STATISTIC(NumWriteOnly, "Number of functions inferred as writeonly");
STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly");
STATISTIC(NumInaccessibleMemOrArgMemOnly,
"Number of functions inferred as inaccessiblemem_or_argmemonly");
@@ -71,6 +72,19 @@ static bool setOnlyReadsMemory(Function &F) {
return true;
}
+static bool setOnlyWritesMemory(Function &F) {
+ if (F.onlyWritesMemory()) // writeonly or readnone
+ return false;
+ // Turn readonly and writeonly into readnone.
+ if (F.hasFnAttribute(Attribute::ReadOnly)) {
+ F.removeFnAttr(Attribute::ReadOnly);
+ return setDoesNotAccessMemory(F);
+ }
+ ++NumWriteOnly;
+ F.setOnlyWritesMemory();
+ return true;
+}
+
static bool setOnlyAccessesArgMemory(Function &F) {
if (F.onlyAccessesArgMemory())
return false;
@@ -233,6 +247,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
switch (TheLibFunc) {
case LibFunc_strlen:
+ case LibFunc_strnlen:
case LibFunc_wcslen:
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
@@ -400,6 +415,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
return Changed;
+ case LibFunc_aligned_alloc:
+ case LibFunc_valloc:
case LibFunc_malloc:
case LibFunc_vec_malloc:
Changed |= setOnlyAccessesInaccessibleMemory(F);
@@ -484,6 +501,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
return Changed;
case LibFunc_realloc:
case LibFunc_vec_realloc:
+ case LibFunc_reallocf:
Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
Changed |= setRetNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -492,11 +510,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 0);
Changed |= setArgNoUndef(F, 1);
return Changed;
- case LibFunc_reallocf:
- Changed |= setRetNoUndef(F);
- Changed |= setWillReturn(F);
- Changed |= setArgNoUndef(F, 1);
- return Changed;
case LibFunc_read:
// May throw; "read" is a valid pthread cancellation point.
Changed |= setRetAndArgsNoUndef(F);
@@ -536,13 +549,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc_aligned_alloc:
- Changed |= setOnlyAccessesInaccessibleMemory(F);
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setWillReturn(F);
- return Changed;
case LibFunc_bcopy:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
@@ -569,6 +575,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
return Changed;
case LibFunc_calloc:
case LibFunc_vec_calloc:
+ Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setRetDoesNotAlias(F);
@@ -851,13 +858,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
- case LibFunc_valloc:
- Changed |= setOnlyAccessesInaccessibleMemory(F);
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setWillReturn(F);
- return Changed;
case LibFunc_vprintf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -1020,12 +1020,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
case LibFunc_memset_pattern4:
case LibFunc_memset_pattern8:
case LibFunc_memset_pattern16:
- Changed |= setOnlyAccessesArgMemory(F);
Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyWritesMemory(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ LLVM_FALLTHROUGH;
case LibFunc_memset:
Changed |= setWillReturn(F);
LLVM_FALLTHROUGH;
@@ -1158,7 +1156,6 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
case LibFunc_sqrt:
case LibFunc_sqrtf:
case LibFunc_sqrtl:
- case LibFunc_strnlen:
case LibFunc_tan:
case LibFunc_tanf:
case LibFunc_tanh:
@@ -1171,6 +1168,7 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
case LibFunc_truncl:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotFreeMemory(F);
+ Changed |= setOnlyWritesMemory(F);
Changed |= setWillReturn(F);
return Changed;
default:
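
Editor's note: the new `setOnlyWritesMemory(F)` mirrors `setOnlyReadsMemory`: a function already marked `readonly` that is now also inferred `writeonly` can access no memory at all, so the pair collapses to `readnone`. A hedged stand-alone sketch of that attribute algebra; `collapseToReadNone` is an illustrative name, not part of the patch.

```cpp
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// If a function carries both readonly and writeonly, it can touch no memory
// at all, so the pair collapses to readnone.
static void collapseToReadNone(Function &F) {
  if (F.hasFnAttribute(Attribute::ReadOnly) &&
      F.hasFnAttribute(Attribute::WriteOnly)) {
    F.removeFnAttr(Attribute::ReadOnly);
    F.removeFnAttr(Attribute::WriteOnly);
    F.setDoesNotAccessMemory();
  }
}
```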
diff --git a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp
index b2763900e154..ac3839f2a4ab 100644
--- a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp
@@ -20,8 +20,7 @@ using namespace llvm;
bool CallGraphUpdater::finalize() {
if (!DeadFunctionsInComdats.empty()) {
- filterDeadComdatFunctions(*DeadFunctionsInComdats.front()->getParent(),
- DeadFunctionsInComdats);
+ filterDeadComdatFunctions(DeadFunctionsInComdats);
DeadFunctions.append(DeadFunctionsInComdats.begin(),
DeadFunctionsInComdats.end());
}
diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index ebe19f1751e5..56b6e4bc46a5 100644
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -500,7 +500,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
CB.setArgOperand(ArgNo, Cast);
// Remove any incompatible attributes for the argument.
- AttrBuilder ArgAttrs(CallerPAL.getParamAttrs(ArgNo));
+ AttrBuilder ArgAttrs(Ctx, CallerPAL.getParamAttrs(ArgNo));
ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy));
// We may have a different byval/inalloca type.
@@ -518,7 +518,7 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
// If the return type of the call site doesn't match that of the callee, cast
// the returned value to the appropriate type.
// Remove any incompatible return value attribute.
- AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+ AttrBuilder RAttrs(Ctx, CallerPAL.getRetAttrs());
if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) {
createRetBitCast(CB, CallSiteRetTy, RetBitCast);
RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy));
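
Editor's note: `AttrBuilder` now requires an explicit `LLVMContext` and is seeded from an `AttributeSet` (return or per-parameter attributes) rather than from an `AttributeList` plus index. A hedged sketch of the updated construction pattern; `callSiteHasRetAttrs` is an illustrative name.

```cpp
#include "llvm/IR/Attributes.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// AttrBuilder now takes an explicit LLVMContext and is seeded from an
// AttributeSet instead of an AttributeList + index.
static bool callSiteHasRetAttrs(CallBase &CB) {
  AttrBuilder RetAttrs(CB.getContext(), CB.getAttributes().getRetAttrs());
  // empty() has been replaced by hasAttributes() (see the InlineFunction.cpp
  // hunk further down).
  return RetAttrs.hasAttributes();
}
```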
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 96aff563aa9b..24cd5747c5a4 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -829,39 +829,54 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
default: RetTy = Type::getInt16Ty(header->getContext()); break;
}
- std::vector<Type *> paramTy;
+ std::vector<Type *> ParamTy;
+ std::vector<Type *> AggParamTy;
+ ValueSet StructValues;
// Add the types of the input values to the function's argument list
for (Value *value : inputs) {
LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n");
- paramTy.push_back(value->getType());
+ if (AggregateArgs && !ExcludeArgsFromAggregate.contains(value)) {
+ AggParamTy.push_back(value->getType());
+ StructValues.insert(value);
+ } else
+ ParamTy.push_back(value->getType());
}
// Add the types of the output values to the function's argument list.
for (Value *output : outputs) {
LLVM_DEBUG(dbgs() << "instr used in func: " << *output << "\n");
- if (AggregateArgs)
- paramTy.push_back(output->getType());
- else
- paramTy.push_back(PointerType::getUnqual(output->getType()));
+ if (AggregateArgs && !ExcludeArgsFromAggregate.contains(output)) {
+ AggParamTy.push_back(output->getType());
+ StructValues.insert(output);
+ } else
+ ParamTy.push_back(PointerType::getUnqual(output->getType()));
+ }
+
+ assert(
+ (ParamTy.size() + AggParamTy.size()) ==
+ (inputs.size() + outputs.size()) &&
+ "Number of scalar and aggregate params does not match inputs, outputs");
+ assert(StructValues.empty() ||
+ AggregateArgs && "Expeced StructValues only with AggregateArgs set");
+
+ // Concatenate scalar and aggregate params in ParamTy.
+ size_t NumScalarParams = ParamTy.size();
+ StructType *StructTy = nullptr;
+ if (AggregateArgs && !AggParamTy.empty()) {
+ StructTy = StructType::get(M->getContext(), AggParamTy);
+ ParamTy.push_back(PointerType::getUnqual(StructTy));
}
LLVM_DEBUG({
dbgs() << "Function type: " << *RetTy << " f(";
- for (Type *i : paramTy)
+ for (Type *i : ParamTy)
dbgs() << *i << ", ";
dbgs() << ")\n";
});
- StructType *StructTy = nullptr;
- if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
- StructTy = StructType::get(M->getContext(), paramTy);
- paramTy.clear();
- paramTy.push_back(PointerType::getUnqual(StructTy));
- }
- FunctionType *funcType =
- FunctionType::get(RetTy, paramTy,
- AllowVarArgs && oldFunction->isVarArg());
+ FunctionType *funcType = FunctionType::get(
+ RetTy, ParamTy, AllowVarArgs && oldFunction->isVarArg());
std::string SuffixToUse =
Suffix.empty()
@@ -871,13 +886,6 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
Function *newFunction = Function::Create(
funcType, GlobalValue::InternalLinkage, oldFunction->getAddressSpace(),
oldFunction->getName() + "." + SuffixToUse, M);
- // If the old function is no-throw, so is the new one.
- if (oldFunction->doesNotThrow())
- newFunction->setDoesNotThrow();
-
- // Inherit the uwtable attribute if we need to.
- if (oldFunction->hasUWTable())
- newFunction->setHasUWTable();
// Inherit all of the target dependent attributes and white-listed
// target independent attributes.
@@ -893,53 +901,26 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
} else
switch (Attr.getKindAsEnum()) {
// Those attributes cannot be propagated safely. Explicitly list them
- // here so we get a warning if new attributes are added. This list also
- // includes non-function attributes.
- case Attribute::Alignment:
+ // here so we get a warning if new attributes are added.
case Attribute::AllocSize:
case Attribute::ArgMemOnly:
case Attribute::Builtin:
- case Attribute::ByVal:
case Attribute::Convergent:
- case Attribute::Dereferenceable:
- case Attribute::DereferenceableOrNull:
- case Attribute::ElementType:
- case Attribute::InAlloca:
- case Attribute::InReg:
case Attribute::InaccessibleMemOnly:
case Attribute::InaccessibleMemOrArgMemOnly:
case Attribute::JumpTable:
case Attribute::Naked:
- case Attribute::Nest:
- case Attribute::NoAlias:
case Attribute::NoBuiltin:
- case Attribute::NoCapture:
case Attribute::NoMerge:
case Attribute::NoReturn:
case Attribute::NoSync:
- case Attribute::NoUndef:
- case Attribute::None:
- case Attribute::NonNull:
- case Attribute::Preallocated:
case Attribute::ReadNone:
case Attribute::ReadOnly:
- case Attribute::Returned:
case Attribute::ReturnsTwice:
- case Attribute::SExt:
case Attribute::Speculatable:
case Attribute::StackAlignment:
- case Attribute::StructRet:
- case Attribute::SwiftError:
- case Attribute::SwiftSelf:
- case Attribute::SwiftAsync:
case Attribute::WillReturn:
case Attribute::WriteOnly:
- case Attribute::ZExt:
- case Attribute::ImmArg:
- case Attribute::ByRef:
- case Attribute::EndAttrKinds:
- case Attribute::EmptyKey:
- case Attribute::TombstoneKey:
continue;
// Those attributes should be safe to propagate to the extracted function.
case Attribute::AlwaysInline:
@@ -980,30 +961,62 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
case Attribute::MustProgress:
case Attribute::NoProfile:
break;
+ // These attributes cannot be applied to functions.
+ case Attribute::Alignment:
+ case Attribute::ByVal:
+ case Attribute::Dereferenceable:
+ case Attribute::DereferenceableOrNull:
+ case Attribute::ElementType:
+ case Attribute::InAlloca:
+ case Attribute::InReg:
+ case Attribute::Nest:
+ case Attribute::NoAlias:
+ case Attribute::NoCapture:
+ case Attribute::NoUndef:
+ case Attribute::NonNull:
+ case Attribute::Preallocated:
+ case Attribute::Returned:
+ case Attribute::SExt:
+ case Attribute::StructRet:
+ case Attribute::SwiftError:
+ case Attribute::SwiftSelf:
+ case Attribute::SwiftAsync:
+ case Attribute::ZExt:
+ case Attribute::ImmArg:
+ case Attribute::ByRef:
+ // These are not really attributes.
+ case Attribute::None:
+ case Attribute::EndAttrKinds:
+ case Attribute::EmptyKey:
+ case Attribute::TombstoneKey:
+ llvm_unreachable("Not a function attribute");
}
newFunction->addFnAttr(Attr);
}
newFunction->getBasicBlockList().push_back(newRootNode);
- // Create an iterator to name all of the arguments we inserted.
- Function::arg_iterator AI = newFunction->arg_begin();
+ // Create scalar and aggregate iterators to name all of the arguments we
+ // inserted.
+ Function::arg_iterator ScalarAI = newFunction->arg_begin();
+ Function::arg_iterator AggAI = std::next(ScalarAI, NumScalarParams);
// Rewrite all users of the inputs in the extracted region to use the
// arguments (or appropriate addressing into struct) instead.
- for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+ for (unsigned i = 0, e = inputs.size(), aggIdx = 0; i != e; ++i) {
Value *RewriteVal;
- if (AggregateArgs) {
+ if (AggregateArgs && StructValues.contains(inputs[i])) {
Value *Idx[2];
Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), aggIdx);
Instruction *TI = newFunction->begin()->getTerminator();
GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI);
- RewriteVal = new LoadInst(StructTy->getElementType(i), GEP,
+ StructTy, &*AggAI, Idx, "gep_" + inputs[i]->getName(), TI);
+ RewriteVal = new LoadInst(StructTy->getElementType(aggIdx), GEP,
"loadgep_" + inputs[i]->getName(), TI);
+ ++aggIdx;
} else
- RewriteVal = &*AI++;
+ RewriteVal = &*ScalarAI++;
std::vector<User *> Users(inputs[i]->user_begin(), inputs[i]->user_end());
for (User *use : Users)
@@ -1013,12 +1026,14 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
}
// Set names for input and output arguments.
- if (!AggregateArgs) {
- AI = newFunction->arg_begin();
- for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI)
- AI->setName(inputs[i]->getName());
- for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI)
- AI->setName(outputs[i]->getName()+".out");
+ if (NumScalarParams) {
+ ScalarAI = newFunction->arg_begin();
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++ScalarAI)
+ if (!StructValues.contains(inputs[i]))
+ ScalarAI->setName(inputs[i]->getName());
+ for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++ScalarAI)
+ if (!StructValues.contains(outputs[i]))
+ ScalarAI->setName(outputs[i]->getName() + ".out");
}
// Rewrite branches to basic blocks outside of the loop to new dummy blocks
@@ -1126,7 +1141,8 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
ValueSet &outputs) {
// Emit a call to the new function, passing in: *pointer to struct (if
// aggregating parameters), or plain inputs and allocated memory for outputs
- std::vector<Value *> params, StructValues, ReloadOutputs, Reloads;
+ std::vector<Value *> params, ReloadOutputs, Reloads;
+ ValueSet StructValues;
Module *M = newFunction->getParent();
LLVMContext &Context = M->getContext();
@@ -1134,23 +1150,24 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
CallInst *call = nullptr;
// Add inputs as params, or to be filled into the struct
- unsigned ArgNo = 0;
+ unsigned ScalarInputArgNo = 0;
SmallVector<unsigned, 1> SwiftErrorArgs;
for (Value *input : inputs) {
- if (AggregateArgs)
- StructValues.push_back(input);
+ if (AggregateArgs && !ExcludeArgsFromAggregate.contains(input))
+ StructValues.insert(input);
else {
params.push_back(input);
if (input->isSwiftError())
- SwiftErrorArgs.push_back(ArgNo);
+ SwiftErrorArgs.push_back(ScalarInputArgNo);
}
- ++ArgNo;
+ ++ScalarInputArgNo;
}
// Create allocas for the outputs
+ unsigned ScalarOutputArgNo = 0;
for (Value *output : outputs) {
- if (AggregateArgs) {
- StructValues.push_back(output);
+ if (AggregateArgs && !ExcludeArgsFromAggregate.contains(output)) {
+ StructValues.insert(output);
} else {
AllocaInst *alloca =
new AllocaInst(output->getType(), DL.getAllocaAddrSpace(),
@@ -1158,12 +1175,14 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
&codeReplacer->getParent()->front().front());
ReloadOutputs.push_back(alloca);
params.push_back(alloca);
+ ++ScalarOutputArgNo;
}
}
StructType *StructArgTy = nullptr;
AllocaInst *Struct = nullptr;
- if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+ unsigned NumAggregatedInputs = 0;
+ if (AggregateArgs && !StructValues.empty()) {
std::vector<Type *> ArgTypes;
for (Value *V : StructValues)
ArgTypes.push_back(V->getType());
@@ -1175,14 +1194,18 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
&codeReplacer->getParent()->front().front());
params.push_back(Struct);
- for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
- Value *Idx[2];
- Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
- GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName());
- codeReplacer->getInstList().push_back(GEP);
- new StoreInst(StructValues[i], GEP, codeReplacer);
+ // Store aggregated inputs in the struct.
+ for (unsigned i = 0, e = StructValues.size(); i != e; ++i) {
+ if (inputs.contains(StructValues[i])) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName());
+ codeReplacer->getInstList().push_back(GEP);
+ new StoreInst(StructValues[i], GEP, codeReplacer);
+ NumAggregatedInputs++;
+ }
}
}
@@ -1205,24 +1228,24 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
newFunction->addParamAttr(SwiftErrArgNo, Attribute::SwiftError);
}
- Function::arg_iterator OutputArgBegin = newFunction->arg_begin();
- unsigned FirstOut = inputs.size();
- if (!AggregateArgs)
- std::advance(OutputArgBegin, inputs.size());
-
- // Reload the outputs passed in by reference.
- for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
+ // Reload the outputs passed in by reference: use the struct if the output is
+ // in the aggregate, otherwise reload from the scalar argument.
+ for (unsigned i = 0, e = outputs.size(), scalarIdx = 0,
+ aggIdx = NumAggregatedInputs;
+ i != e; ++i) {
Value *Output = nullptr;
- if (AggregateArgs) {
+ if (AggregateArgs && StructValues.contains(outputs[i])) {
Value *Idx[2];
Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), aggIdx);
GetElementPtrInst *GEP = GetElementPtrInst::Create(
StructArgTy, Struct, Idx, "gep_reload_" + outputs[i]->getName());
codeReplacer->getInstList().push_back(GEP);
Output = GEP;
+ ++aggIdx;
} else {
- Output = ReloadOutputs[i];
+ Output = ReloadOutputs[scalarIdx];
+ ++scalarIdx;
}
LoadInst *load = new LoadInst(outputs[i]->getType(), Output,
outputs[i]->getName() + ".reload",
@@ -1304,8 +1327,13 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
// Store the arguments right after the definition of output value.
// This should be done after creating exit stubs, to ensure that the invoke
// result restore will be placed in the outlined function.
- Function::arg_iterator OAI = OutputArgBegin;
- for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
+ Function::arg_iterator ScalarOutputArgBegin = newFunction->arg_begin();
+ std::advance(ScalarOutputArgBegin, ScalarInputArgNo);
+ Function::arg_iterator AggOutputArgBegin = newFunction->arg_begin();
+ std::advance(AggOutputArgBegin, ScalarInputArgNo + ScalarOutputArgNo);
+
+ for (unsigned i = 0, e = outputs.size(), aggIdx = NumAggregatedInputs; i != e;
+ ++i) {
auto *OutI = dyn_cast<Instruction>(outputs[i]);
if (!OutI)
continue;
@@ -1325,23 +1353,27 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
assert((InsertBefore->getFunction() == newFunction ||
Blocks.count(InsertBefore->getParent())) &&
"InsertPt should be in new function");
- assert(OAI != newFunction->arg_end() &&
- "Number of output arguments should match "
- "the amount of defined values");
- if (AggregateArgs) {
+ if (AggregateArgs && StructValues.contains(outputs[i])) {
+ assert(AggOutputArgBegin != newFunction->arg_end() &&
+ "Number of aggregate output arguments should match "
+ "the number of defined values");
Value *Idx[2];
Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), aggIdx);
GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructArgTy, &*OAI, Idx, "gep_" + outputs[i]->getName(),
+ StructArgTy, &*AggOutputArgBegin, Idx, "gep_" + outputs[i]->getName(),
InsertBefore);
new StoreInst(outputs[i], GEP, InsertBefore);
+ ++aggIdx;
// Since there should be only one struct argument aggregating
- // all the output values, we shouldn't increment OAI, which always
- // points to the struct argument, in this case.
+ // all the output values, we shouldn't increment AggOutputArgBegin, which
+ // always points to the struct argument, in this case.
} else {
- new StoreInst(outputs[i], &*OAI, InsertBefore);
- ++OAI;
+ assert(ScalarOutputArgBegin != newFunction->arg_end() &&
+ "Number of scalar output arguments should match "
+ "the number of defined values");
+ new StoreInst(outputs[i], &*ScalarOutputArgBegin, InsertBefore);
+ ++ScalarOutputArgBegin;
}
}
@@ -1840,3 +1872,7 @@ bool CodeExtractor::verifyAssumptionCache(const Function &OldFunc,
}
return false;
}
+
+void CodeExtractor::excludeArgFromAggregate(Value *Arg) {
+ ExcludeArgsFromAggregate.insert(Arg);
+}
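
Editor's note: with the new `ExcludeArgsFromAggregate` set, a client can keep selected inputs/outputs as plain scalar parameters even when aggregate argument passing is enabled. A hedged usage sketch; the helper name and the minimal constructor arguments are illustrative (real callers typically also pass BFI/BPI/AC), and the surrounding setup of the region is assumed to exist in the caller.

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

using namespace llvm;

// Extract a single-entry region with aggregated arguments, but keep one
// value as a plain scalar parameter instead of a struct field.
static Function *extractKeepingScalar(ArrayRef<BasicBlock *> Region,
                                      DominatorTree &DT, Value *KeepScalar) {
  CodeExtractor CE(Region, &DT, /*AggregateArgs=*/true);
  CE.excludeArgFromAggregate(KeepScalar); // new API added by this patch
  CodeExtractorAnalysisCache CEAC(*Region.front()->getParent());
  return CE.extractCodeRegion(CEAC);
}
```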
diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp
index 91630d876fc8..e73287c060ae 100644
--- a/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -122,129 +122,114 @@ isSimpleEnoughValueToCommit(Constant *C,
return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL);
}
-/// Return true if this constant is simple enough for us to understand. In
-/// particular, if it is a cast to anything other than from one pointer type to
-/// another pointer type, we punt. We basically just support direct accesses to
-/// globals and GEP's of globals. This should be kept up to date with
-/// CommitValueTo.
-static bool isSimpleEnoughPointerToCommit(Constant *C, const DataLayout &DL) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
- // Do not allow weak/*_odr/linkonce linkage or external globals.
- return GV->hasUniqueInitializer();
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
- // Handle a constantexpr gep.
- if (CE->getOpcode() == Instruction::GetElementPtr &&
- isa<GlobalVariable>(CE->getOperand(0)) &&
- cast<GEPOperator>(CE)->isInBounds()) {
- GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
- // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
- // external globals.
- if (!GV->hasUniqueInitializer())
- return false;
+void Evaluator::MutableValue::clear() {
+ if (auto *Agg = Val.dyn_cast<MutableAggregate *>())
+ delete Agg;
+ Val = nullptr;
+}
- // The first index must be zero.
- ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin()));
- if (!CI || !CI->isZero()) return false;
+Constant *Evaluator::MutableValue::read(Type *Ty, APInt Offset,
+ const DataLayout &DL) const {
+ TypeSize TySize = DL.getTypeStoreSize(Ty);
+ const MutableValue *V = this;
+ while (const auto *Agg = V->Val.dyn_cast<MutableAggregate *>()) {
+ Type *AggTy = Agg->Ty;
+ Optional<APInt> Index = DL.getGEPIndexForOffset(AggTy, Offset);
+ if (!Index || Index->uge(Agg->Elements.size()) ||
+ !TypeSize::isKnownLE(TySize, DL.getTypeStoreSize(AggTy)))
+ return nullptr;
+
+ V = &Agg->Elements[Index->getZExtValue()];
+ }
- // The remaining indices must be compile-time known integers within the
- // notional bounds of the corresponding static array types.
- if (!CE->isGEPWithNoNotionalOverIndexing())
- return false;
+ return ConstantFoldLoadFromConst(V->Val.get<Constant *>(), Ty, Offset, DL);
+}
- return ConstantFoldLoadThroughGEPConstantExpr(
- GV->getInitializer(), CE,
- cast<GEPOperator>(CE)->getResultElementType(), DL);
- } else if (CE->getOpcode() == Instruction::BitCast &&
- isa<GlobalVariable>(CE->getOperand(0))) {
- // A constantexpr bitcast from a pointer to another pointer is a no-op,
- // and we know how to evaluate it by moving the bitcast from the pointer
- // operand to the value operand.
- // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
- // external globals.
- return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
- }
- }
+bool Evaluator::MutableValue::makeMutable() {
+ Constant *C = Val.get<Constant *>();
+ Type *Ty = C->getType();
+ unsigned NumElements;
+ if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
+ NumElements = VT->getNumElements();
+ } else if (auto *AT = dyn_cast<ArrayType>(Ty))
+ NumElements = AT->getNumElements();
+ else if (auto *ST = dyn_cast<StructType>(Ty))
+ NumElements = ST->getNumElements();
+ else
+ return false;
- return false;
+ MutableAggregate *MA = new MutableAggregate(Ty);
+ MA->Elements.reserve(NumElements);
+ for (unsigned I = 0; I < NumElements; ++I)
+ MA->Elements.push_back(C->getAggregateElement(I));
+ Val = MA;
+ return true;
}
-/// Apply \p TryLoad to Ptr. If this returns \p nullptr, introspect the
-/// pointer's type and walk down through the initial elements to obtain
-/// additional pointers to try. Returns the first non-null return value from
-/// \p TryLoad, or \p nullptr if the type can't be introspected further.
-static Constant *
-evaluateBitcastFromPtr(Constant *Ptr, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- std::function<Constant *(Constant *)> TryLoad) {
- Constant *Val;
- while (!(Val = TryLoad(Ptr))) {
- // If Ty is a non-opaque struct, we can convert the pointer to the struct
- // into a pointer to its first member.
- // FIXME: This could be extended to support arrays as well.
- Type *Ty = cast<PointerType>(Ptr->getType())->getElementType();
- if (!isa<StructType>(Ty) || cast<StructType>(Ty)->isOpaque())
- break;
-
- IntegerType *IdxTy = IntegerType::get(Ty->getContext(), 32);
- Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
- Constant *const IdxList[] = {IdxZero, IdxZero};
-
- Ptr = ConstantExpr::getGetElementPtr(Ty, Ptr, IdxList);
- Ptr = ConstantFoldConstant(Ptr, DL, TLI);
+bool Evaluator::MutableValue::write(Constant *V, APInt Offset,
+ const DataLayout &DL) {
+ Type *Ty = V->getType();
+ TypeSize TySize = DL.getTypeStoreSize(Ty);
+ MutableValue *MV = this;
+ while (Offset != 0 ||
+ !CastInst::isBitOrNoopPointerCastable(Ty, MV->getType(), DL)) {
+ if (MV->Val.is<Constant *>() && !MV->makeMutable())
+ return false;
+
+ MutableAggregate *Agg = MV->Val.get<MutableAggregate *>();
+ Type *AggTy = Agg->Ty;
+ Optional<APInt> Index = DL.getGEPIndexForOffset(AggTy, Offset);
+ if (!Index || Index->uge(Agg->Elements.size()) ||
+ !TypeSize::isKnownLE(TySize, DL.getTypeStoreSize(AggTy)))
+ return false;
+
+ MV = &Agg->Elements[Index->getZExtValue()];
}
- return Val;
+
+ Type *MVType = MV->getType();
+ MV->clear();
+ if (Ty->isIntegerTy() && MVType->isPointerTy())
+ MV->Val = ConstantExpr::getIntToPtr(V, MVType);
+ else if (Ty->isPointerTy() && MVType->isIntegerTy())
+ MV->Val = ConstantExpr::getPtrToInt(V, MVType);
+ else if (Ty != MVType)
+ MV->Val = ConstantExpr::getBitCast(V, MVType);
+ else
+ MV->Val = V;
+ return true;
}
-static Constant *getInitializer(Constant *C) {
- auto *GV = dyn_cast<GlobalVariable>(C);
- return GV && GV->hasDefinitiveInitializer() ? GV->getInitializer() : nullptr;
+Constant *Evaluator::MutableAggregate::toConstant() const {
+ SmallVector<Constant *, 32> Consts;
+ for (const MutableValue &MV : Elements)
+ Consts.push_back(MV.toConstant());
+
+ if (auto *ST = dyn_cast<StructType>(Ty))
+ return ConstantStruct::get(ST, Consts);
+ if (auto *AT = dyn_cast<ArrayType>(Ty))
+ return ConstantArray::get(AT, Consts);
+ assert(isa<FixedVectorType>(Ty) && "Must be vector");
+ return ConstantVector::get(Consts);
}
/// Return the value that would be computed by a load from P after the stores
/// reflected by 'memory' have been performed. If we can't decide, return null.
Constant *Evaluator::ComputeLoadResult(Constant *P, Type *Ty) {
- // If this memory location has been recently stored, use the stored value: it
- // is the most up-to-date.
- auto TryFindMemLoc = [this](Constant *Ptr) {
- return MutatedMemory.lookup(Ptr);
- };
-
- if (Constant *Val = TryFindMemLoc(P))
- return Val;
-
- // Access it.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
- if (GV->hasDefinitiveInitializer())
- return GV->getInitializer();
+ APInt Offset(DL.getIndexTypeSizeInBits(P->getType()), 0);
+ P = cast<Constant>(P->stripAndAccumulateConstantOffsets(
+ DL, Offset, /* AllowNonInbounds */ true));
+ Offset = Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(P->getType()));
+ auto *GV = dyn_cast<GlobalVariable>(P);
+ if (!GV)
return nullptr;
- }
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) {
- switch (CE->getOpcode()) {
- // Handle a constantexpr getelementptr.
- case Instruction::GetElementPtr:
- if (auto *I = getInitializer(CE->getOperand(0)))
- return ConstantFoldLoadThroughGEPConstantExpr(I, CE, Ty, DL);
- break;
- // Handle a constantexpr bitcast.
- case Instruction::BitCast:
- // We're evaluating a load through a pointer that was bitcast to a
- // different type. See if the "from" pointer has recently been stored.
- // If it hasn't, we may still be able to find a stored pointer by
- // introspecting the type.
- Constant *Val =
- evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, TryFindMemLoc);
- if (!Val)
- Val = getInitializer(CE->getOperand(0));
- if (Val)
- return ConstantFoldLoadThroughBitcast(
- Val, P->getType()->getPointerElementType(), DL);
- break;
- }
- }
+ auto It = MutatedMemory.find(GV);
+ if (It != MutatedMemory.end())
+ return It->second.read(Ty, Offset, DL);
- return nullptr; // don't know how to evaluate.
+ if (!GV->hasDefinitiveInitializer())
+ return nullptr;
+ return ConstantFoldLoadFromConst(GV->getInitializer(), Ty, Offset, DL);
}
static Function *getFunction(Constant *C) {
@@ -260,17 +245,10 @@ static Function *getFunction(Constant *C) {
Function *
Evaluator::getCalleeWithFormalArgs(CallBase &CB,
SmallVectorImpl<Constant *> &Formals) {
- auto *V = CB.getCalledOperand();
+ auto *V = CB.getCalledOperand()->stripPointerCasts();
if (auto *Fn = getFunction(getVal(V)))
return getFormalParams(CB, Fn, Formals) ? Fn : nullptr;
-
- auto *CE = dyn_cast<ConstantExpr>(V);
- if (!CE || CE->getOpcode() != Instruction::BitCast ||
- !getFormalParams(CB, getFunction(CE->getOperand(0)), Formals))
- return nullptr;
-
- return dyn_cast<Function>(
- ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL));
+ return nullptr;
}
bool Evaluator::getFormalParams(CallBase &CB, Function *F,
@@ -299,17 +277,13 @@ bool Evaluator::getFormalParams(CallBase &CB, Function *F,
/// If the call expression contains a bitcast, we may need to cast the
/// evaluated return value to the type of the call expression.
-Constant *Evaluator::castCallResultIfNeeded(Value *CallExpr, Constant *RV) {
- ConstantExpr *CE = dyn_cast<ConstantExpr>(CallExpr);
- if (!RV || !CE || CE->getOpcode() != Instruction::BitCast)
+Constant *Evaluator::castCallResultIfNeeded(Type *ReturnType, Constant *RV) {
+ if (!RV || RV->getType() == ReturnType)
return RV;
- if (auto *FT =
- dyn_cast<FunctionType>(CE->getType()->getPointerElementType())) {
- RV = ConstantFoldLoadThroughBitcast(RV, FT->getReturnType(), DL);
- if (!RV)
- LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n");
- }
+ RV = ConstantFoldLoadThroughBitcast(RV, ReturnType, DL);
+ if (!RV)
+ LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n");
return RV;
}
@@ -337,68 +311,30 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
Ptr = FoldedPtr;
LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n");
}
- // Conservatively, avoid aggregate types. This is because we don't
- // want to worry about them partially overlapping other stores.
- if (!SI->getValueOperand()->getType()->isSingleValueType() ||
- !isSimpleEnoughPointerToCommit(Ptr, DL)) {
- // If this is too complex for us to commit, reject it.
- LLVM_DEBUG(
- dbgs() << "Pointer is too complex for us to evaluate store.");
+
+ APInt Offset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+ Ptr = cast<Constant>(Ptr->stripAndAccumulateConstantOffsets(
+ DL, Offset, /* AllowNonInbounds */ true));
+ Offset = Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(Ptr->getType()));
+ auto *GV = dyn_cast<GlobalVariable>(Ptr);
+ if (!GV || !GV->hasUniqueInitializer()) {
+ LLVM_DEBUG(dbgs() << "Store is not to global with unique initializer: "
+ << *Ptr << "\n");
return false;
}
- Constant *Val = getVal(SI->getOperand(0));
-
// If this might be too difficult for the backend to handle (e.g. the addr
// of one global variable divided by another) then we can't commit it.
+ Constant *Val = getVal(SI->getOperand(0));
if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) {
LLVM_DEBUG(dbgs() << "Store value is too complex to evaluate store. "
<< *Val << "\n");
return false;
}
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
- if (CE->getOpcode() == Instruction::BitCast) {
- LLVM_DEBUG(dbgs()
- << "Attempting to resolve bitcast on constant ptr.\n");
- // If we're evaluating a store through a bitcast, then we need
- // to pull the bitcast off the pointer type and push it onto the
- // stored value. In order to push the bitcast onto the stored value,
- // a bitcast from the pointer's element type to Val's type must be
- // legal. If it's not, we can try introspecting the type to find a
- // legal conversion.
-
- auto TryCastValTy = [&](Constant *P) -> Constant * {
- // The conversion is illegal if the store is wider than the
- // pointee proposed by `evaluateBitcastFromPtr`, since that would
- // drop stores to other struct elements when the caller attempts to
- // look through a struct's 0th element.
- Type *NewTy = cast<PointerType>(P->getType())->getElementType();
- Type *STy = Val->getType();
- if (DL.getTypeSizeInBits(NewTy) < DL.getTypeSizeInBits(STy))
- return nullptr;
-
- if (Constant *FV = ConstantFoldLoadThroughBitcast(Val, NewTy, DL)) {
- Ptr = P;
- return FV;
- }
- return nullptr;
- };
-
- Constant *NewVal =
- evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, TryCastValTy);
- if (!NewVal) {
- LLVM_DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
- "evaluate.\n");
- return false;
- }
-
- Val = NewVal;
- LLVM_DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
- }
- }
-
- MutatedMemory[Ptr] = Val;
+ auto Res = MutatedMemory.try_emplace(GV, GV->getInitializer());
+ if (!Res.first->second.write(Val, Offset, DL))
+ return false;
} else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
InstResult = ConstantExpr::get(BO->getOpcode(),
getVal(BO->getOperand(0)),
@@ -593,7 +529,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
if (Callee->isDeclaration()) {
// If this is a function we can constant fold, do it.
if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) {
- InstResult = castCallResultIfNeeded(CB.getCalledOperand(), C);
+ InstResult = castCallResultIfNeeded(CB.getType(), C);
if (!InstResult)
return false;
LLVM_DEBUG(dbgs() << "Constant folded function call. Result: "
@@ -617,7 +553,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
return false;
}
ValueStack.pop_back();
- InstResult = castCallResultIfNeeded(CB.getCalledOperand(), RetVal);
+ InstResult = castCallResultIfNeeded(CB.getType(), RetVal);
if (RetVal && !InstResult)
return false;
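
Editor's note: the Evaluator now models memory as (global variable, constant byte offset) pairs backed by a `MutableValue` tree, with loads going through `ConstantFoldLoadFromConst` at the accumulated offset. A hedged sketch of the "strip to a global plus offset" step that the load and store paths above share; `stripToGlobal` is an illustrative name.

```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"

using namespace llvm;

// Strip a constant pointer down to (GlobalVariable, constant byte offset),
// the shape the new load/store paths work on. Returns null if the base is
// not a global variable.
static GlobalVariable *stripToGlobal(Constant *Ptr, const DataLayout &DL,
                                     APInt &Offset) {
  Offset = APInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
  Ptr = cast<Constant>(Ptr->stripAndAccumulateConstantOffsets(
      DL, Offset, /*AllowNonInbounds=*/true));
  // Re-normalize to the index width of the stripped pointer's address space.
  Offset = Offset.sextOrTrunc(DL.getIndexTypeSizeInBits(Ptr->getType()));
  return dyn_cast<GlobalVariable>(Ptr);
}
```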
diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
index 9bfc73e4ba6c..f8ec8c6ad426 100644
--- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -66,8 +66,6 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
for (const Use &U : V->uses()) {
const User *UR = U.getUser();
if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) {
- GS.HasNonInstructionUser = true;
-
// If the result of the constantexpr isn't pointer type, then we won't
// know to expect it in various places. Just reject early.
if (!isa<PointerType>(CE->getType()))
@@ -105,9 +103,7 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
// value, not an aggregate), keep more specific information about
// stores.
if (GS.StoredType != GlobalStatus::Stored) {
- const Value *Ptr = SI->getPointerOperand();
- if (isa<ConstantExpr>(Ptr))
- Ptr = Ptr->stripPointerCasts();
+ const Value *Ptr = SI->getPointerOperand()->stripPointerCasts();
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
Value *StoredVal = SI->getOperand(0);
@@ -174,12 +170,10 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
return true; // Any other non-load instruction might take address!
}
} else if (const Constant *C = dyn_cast<Constant>(UR)) {
- GS.HasNonInstructionUser = true;
// We might have a dead and dangling constant hanging off of here.
if (!isSafeToDestroyConstant(C))
return true;
} else {
- GS.HasNonInstructionUser = true;
// Otherwise must be some other user.
return true;
}
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 997667810580..c9f872f5b7e1 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1185,10 +1185,10 @@ static bool MayContainThrowingOrExitingCall(Instruction *Begin,
static AttrBuilder IdentifyValidAttributes(CallBase &CB) {
- AttrBuilder AB(CB.getAttributes(), AttributeList::ReturnIndex);
- if (AB.empty())
+ AttrBuilder AB(CB.getContext(), CB.getAttributes().getRetAttrs());
+ if (!AB.hasAttributes())
return AB;
- AttrBuilder Valid;
+ AttrBuilder Valid(CB.getContext());
// Only allow these white listed attributes to be propagated back to the
// callee. This is because other attributes may only be valid on the call
// itself, i.e. attributes such as signext and zeroext.
@@ -1208,7 +1208,7 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
return;
AttrBuilder Valid = IdentifyValidAttributes(CB);
- if (Valid.empty())
+ if (!Valid.hasAttributes())
return;
auto *CalledFunction = CB.getCalledFunction();
auto &Context = CalledFunction->getContext();
@@ -1667,7 +1667,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind,
Module *Mod = CB.getModule();
assert(objcarc::isRetainOrClaimRV(RVCallKind) && "unexpected ARC function");
bool IsRetainRV = RVCallKind == objcarc::ARCInstKind::RetainRV,
- IsClaimRV = !IsRetainRV;
+ IsUnsafeClaimRV = !IsRetainRV;
for (auto *RI : Returns) {
Value *RetOpnd = objcarc::GetRCIdentityRoot(RI->getOperand(0));
@@ -1694,7 +1694,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind,
// and erase the autoreleaseRV call.
// - If retainRV is attached to the call, just erase the autoreleaseRV
// call.
- if (IsClaimRV) {
+ if (IsUnsafeClaimRV) {
Builder.SetInsertPoint(II);
Function *IFn =
Intrinsic::getDeclaration(Mod, Intrinsic::objc_release);
diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index 668626fef933..72b864dc3e48 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -339,8 +339,10 @@ bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
#ifdef EXPENSIVE_CHECKS
// Verify all sub-loops are in LCSSA form already.
- for (Loop *SubLoop: L)
+ for (Loop *SubLoop: L) {
+ (void)SubLoop; // Silence unused variable warning.
assert(SubLoop->isRecursivelyLCSSAForm(DT, *LI) && "Subloop not in LCSSA!");
+ }
#endif
SmallVector<BasicBlock *, 8> ExitBlocks;
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index ecad79b68185..9f33d2f82732 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -492,7 +492,7 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
}
}
- if (isAllocLikeFn(I, TLI))
+ if (isAllocationFn(I, TLI) && isAllocRemovable(cast<CallBase>(I), TLI))
return true;
if (CallInst *CI = isFreeCall(I, TLI))
@@ -2189,8 +2189,8 @@ CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) {
return NewCall;
}
-/// changeToCall - Convert the specified invoke into a normal call.
-void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
+// changeToCall - Convert the specified invoke into a normal call.
+CallInst *llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
CallInst *NewCall = createCallMatchingInvoke(II);
NewCall->takeName(II);
NewCall->insertBefore(II);
@@ -2207,6 +2207,7 @@ void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
II->eraseFromParent();
if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}});
+ return NewCall;
}
BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
@@ -3147,11 +3148,6 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
if (!ITy->isIntOrIntVectorTy() || ITy->getScalarSizeInBits() > 128)
return false; // Can't do integer/elements > 128 bits.
- Type *DemandedTy = ITy;
- if (I->hasOneUse())
- if (auto *Trunc = dyn_cast<TruncInst>(I->user_back()))
- DemandedTy = Trunc->getType();
-
// Try to find all the pieces corresponding to the bswap.
bool FoundRoot = false;
std::map<Value *, Optional<BitPart>> BPS;
@@ -3165,6 +3161,7 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
"Illegal bit provenance index");
// If the upper bits are zero, then attempt to perform as a truncated op.
+ Type *DemandedTy = ITy;
if (BitProvenance.back() == BitPart::Unset) {
while (!BitProvenance.empty() && BitProvenance.back() == BitPart::Unset)
BitProvenance = BitProvenance.drop_back();
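
Editor's note: `changeToCall` now returns the `CallInst` it creates, so callers can keep operating on the replacement instruction. A hedged usage sketch; `demoteInvoke` is an illustrative name and `II` is assumed to be an invoke the caller already knows is convertible.

```cpp
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

// changeToCall hands back the CallInst it creates (the invoke is erased
// inside), so the caller can keep working on the replacement instruction.
static void demoteInvoke(InvokeInst *II, DomTreeUpdater *DTU) {
  CallInst *NewCall = changeToCall(II, DTU);
  NewCall->setTailCall(false); // e.g. adjust call-site properties afterwards
}
```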
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 69fd110dc3c2..92333408aaef 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -359,7 +359,7 @@ static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
// Return the number of iterations we want to peel off.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::PeelingPreferences &PP,
- unsigned &TripCount, DominatorTree &DT,
+ unsigned TripCount, DominatorTree &DT,
ScalarEvolution &SE, unsigned Threshold) {
assert(LoopSize > 0 && "Zero loop size is not allowed!");
// Save the PP.PeelCount value set by the target in
@@ -370,7 +370,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
return;
// Only try to peel innermost loops by default.
- // The constraint can be relaxed by the target in TTI.getUnrollingPreferences
+ // The constraint can be relaxed by the target in TTI.getPeelingPreferences
// or by the flag -unroll-allow-loop-nests-peeling.
if (!PP.AllowLoopNestsPeeling && !L->isInnermost())
return;
@@ -407,8 +407,8 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
SmallDenseMap<PHINode *, Optional<unsigned> > IterationsToInvariance;
// Now go through all Phis to calculate the number of iterations they
// need to become invariants.
- // Start the max computation with the UP.PeelCount value set by the target
- // in TTI.getUnrollingPreferences or by the flag -unroll-peel-count.
+ // Start the max computation with the PP.PeelCount value set by the target
+ // in TTI.getPeelingPreferences or by the flag -unroll-peel-count.
unsigned DesiredPeelCount = TargetPeelCount;
BasicBlock *BackEdge = L->getLoopLatch();
assert(BackEdge && "Loop is not in simplified form?");
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index b0c622b98d5e..9ca1f4f44b97 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -99,6 +99,17 @@ UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
#endif
);
+static cl::opt<bool>
+UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden,
+ cl::desc("Verify loopinfo after unrolling"),
+#ifdef EXPENSIVE_CHECKS
+ cl::init(true)
+#else
+ cl::init(false)
+#endif
+ );
+
+
/// Check if unrolling created a situation where we need to insert phi nodes to
/// preserve LCSSA form.
/// \param Blocks is a vector of basic blocks representing unrolled loop.
@@ -764,6 +775,9 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// Apply updates to the DomTree.
DT = &DTU.getDomTree();
+ assert(!UnrollVerifyDomtree ||
+ DT->verify(DominatorTree::VerificationLevel::Fast));
+
// At this point, the code is well formed. We now simplify the unrolled loop,
// doing constant propagation and dead code elimination as we go.
simplifyLoopAfterUnroll(L, !CompletelyUnroll && ULO.Count > 1, LI, SE, DT, AC,
@@ -777,6 +791,10 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
if (CompletelyUnroll)
LI->erase(L);
+ // LoopInfo should now be valid, confirm that.
+ if (UnrollVerifyLoopInfo)
+ LI->verify(*DT);
+
// After complete unrolling most of the blocks should be contained in OuterL.
// However, some of them might happen to be out of OuterL (e.g. if they
// precede a loop exit). In this case we might need to insert PHI nodes in
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 93157bd87c34..95db2fe8d310 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -22,6 +22,7 @@
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -1567,7 +1568,9 @@ Value *llvm::addRuntimeChecks(
auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, Exp);
LLVMContext &Ctx = Loc->getContext();
- IRBuilder<> ChkBuilder(Loc);
+ IRBuilder<InstSimplifyFolder> ChkBuilder(Ctx,
+ Loc->getModule()->getDataLayout());
+ ChkBuilder.SetInsertPoint(Loc);
// Our instructions might fold to a constant.
Value *MemoryRuntimeCheck = nullptr;
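
Editor's note: runtime-check emission now uses an `IRBuilder` parameterized with `InstSimplifyFolder`, so trivially foldable checks simplify on creation instead of being cleaned up afterwards (see also the LoopVersioning.cpp hunk below). A hedged construction sketch; `emitOrOfChecks` is an illustrative name.

```cpp
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Emit `A | B` through an IRBuilder that folds via InstSimplify, so trivially
// true/false runtime checks never materialize as instructions.
static Value *emitOrOfChecks(Instruction *InsertPt, Value *A, Value *B) {
  const DataLayout &DL = InsertPt->getModule()->getDataLayout();
  IRBuilder<InstSimplifyFolder> Builder(InsertPt->getContext(),
                                        InstSimplifyFolder(DL));
  Builder.SetInsertPoint(InsertPt);
  return Builder.CreateOr(A, B, "checks.or");
}
```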
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index 771b7d25b0f2..f0bf625fa18e 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -15,6 +15,7 @@
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -70,17 +71,14 @@ void LoopVersioning::versionLoop(
"scev.check");
SCEVRuntimeCheck =
Exp.expandCodeForPredicate(&Preds, RuntimeCheckBB->getTerminator());
- auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck);
-
- // Discard the SCEV runtime check if it is always true.
- if (CI && CI->isZero())
- SCEVRuntimeCheck = nullptr;
+ IRBuilder<InstSimplifyFolder> Builder(
+ RuntimeCheckBB->getContext(),
+ InstSimplifyFolder(RuntimeCheckBB->getModule()->getDataLayout()));
if (MemRuntimeCheck && SCEVRuntimeCheck) {
- RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck,
- SCEVRuntimeCheck, "lver.safe");
- if (auto *I = dyn_cast<Instruction>(RuntimeCheck))
- I->insertBefore(RuntimeCheckBB->getTerminator());
+ Builder.SetInsertPoint(RuntimeCheckBB->getTerminator());
+ RuntimeCheck =
+ Builder.CreateOr(MemRuntimeCheck, SCEVRuntimeCheck, "lver.safe");
} else
RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck;
@@ -109,8 +107,9 @@ void LoopVersioning::versionLoop(
// Insert the conditional branch based on the result of the memchecks.
Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
- BranchInst::Create(NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm);
+ Builder.SetInsertPoint(OrigTerm);
+ Builder.CreateCondBr(RuntimeCheck, NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader());
OrigTerm->eraseFromParent();
// The loops merge in the original exit block. This is now dominated by the
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 8dc4702993c3..3d75dd57456d 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -297,7 +297,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
Function *F = OrigBB->getParent();
const DataLayout &DL = F->getParent()->getDataLayout();
- Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType();
+ Type *EltTy = SrcAddr->getType()->getPointerElementType();
// Create a comparison of src and dst, based on which we jump to either
// the forward-copy part of the function (if src >= dst) or the backwards-copy
diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
index bb5ff59cba4b..7c9ab7f6ca2c 100644
--- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -178,66 +178,30 @@ llvm::getOrCreateSanitizerCtorAndInitFunctions(
}
void llvm::filterDeadComdatFunctions(
- Module &M, SmallVectorImpl<Function *> &DeadComdatFunctions) {
- // Build a map from the comdat to the number of entries in that comdat we
- // think are dead. If this fully covers the comdat group, then the entire
- // group is dead. If we find another entry in the comdat group though, we'll
- // have to preserve the whole group.
- SmallDenseMap<Comdat *, int, 16> ComdatEntriesCovered;
+ SmallVectorImpl<Function *> &DeadComdatFunctions) {
+ SmallPtrSet<Function *, 32> MaybeDeadFunctions;
+ SmallPtrSet<Comdat *, 32> MaybeDeadComdats;
for (Function *F : DeadComdatFunctions) {
- Comdat *C = F->getComdat();
- assert(C && "Expected all input GVs to be in a comdat!");
- ComdatEntriesCovered[C] += 1;
+ MaybeDeadFunctions.insert(F);
+ if (Comdat *C = F->getComdat())
+ MaybeDeadComdats.insert(C);
}
- auto CheckComdat = [&](Comdat &C) {
- auto CI = ComdatEntriesCovered.find(&C);
- if (CI == ComdatEntriesCovered.end())
- return;
-
- // If this could have been covered by a dead entry, just subtract one to
- // account for it.
- if (CI->second > 0) {
- CI->second -= 1;
- return;
- }
-
- // If we've already accounted for all the entries that were dead, the
- // entire comdat is alive so remove it from the map.
- ComdatEntriesCovered.erase(CI);
- };
-
- auto CheckAllComdats = [&] {
- for (Function &F : M.functions())
- if (Comdat *C = F.getComdat()) {
- CheckComdat(*C);
- if (ComdatEntriesCovered.empty())
- return;
- }
- for (GlobalVariable &GV : M.globals())
- if (Comdat *C = GV.getComdat()) {
- CheckComdat(*C);
- if (ComdatEntriesCovered.empty())
- return;
- }
- for (GlobalAlias &GA : M.aliases())
- if (Comdat *C = GA.getComdat()) {
- CheckComdat(*C);
- if (ComdatEntriesCovered.empty())
- return;
- }
- };
- CheckAllComdats();
-
- if (ComdatEntriesCovered.empty()) {
- DeadComdatFunctions.clear();
- return;
+ // Find comdats for which all users are dead now.
+ SmallPtrSet<Comdat *, 32> DeadComdats;
+ for (Comdat *C : MaybeDeadComdats) {
+ auto IsUserDead = [&](GlobalObject *GO) {
+ auto *F = dyn_cast<Function>(GO);
+ return F && MaybeDeadFunctions.contains(F);
+ };
+ if (all_of(C->getUsers(), IsUserDead))
+ DeadComdats.insert(C);
}
- // Remove the entries that were not covering.
- erase_if(DeadComdatFunctions, [&](GlobalValue *GV) {
- return ComdatEntriesCovered.find(GV->getComdat()) ==
- ComdatEntriesCovered.end();
+ // Only keep functions which have no comdat or a dead comdat.
+ erase_if(DeadComdatFunctions, [&](Function *F) {
+ Comdat *C = F->getComdat();
+ return C && !DeadComdats.contains(C);
});
}
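
Editor's note: the rewrite replaces the whole-module comdat scan with a direct query of each comdat's users: a comdat is dead exactly when every global object in it is one of the candidate dead functions. A hedged sketch of that predicate in isolation; `comdatIsDead` and `MaybeDead` are illustrative names.

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalObject.h"

using namespace llvm;

// A comdat is dead exactly when every global object it contains is one of the
// candidate dead functions; any other member keeps the whole group alive.
static bool comdatIsDead(Comdat &C,
                         const SmallPtrSetImpl<Function *> &MaybeDead) {
  return all_of(C.getUsers(), [&](GlobalObject *GO) {
    auto *F = dyn_cast<Function>(GO);
    return F && MaybeDead.contains(F);
  });
}
```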
diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
index 2f2dff6b5f0b..961adf2570a7 100644
--- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
+++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
@@ -14,6 +14,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/SampleProfileInference.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/Support/Debug.h"
#include <queue>
#include <set>
@@ -144,7 +145,7 @@ public:
/// A cost of decreasing the entry block's count by one.
static constexpr int64_t AuxCostDecEntry = 10;
/// A cost of taking an unlikely jump.
- static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20;
+ static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 30;
private:
/// Check for existence of an augmenting path with a positive capacity.
@@ -236,7 +237,7 @@ private:
}
}
- /// An node in a flow network.
+ /// A node in a flow network.
struct Node {
/// The cost of the cheapest path from the source to the current node.
int64_t Distance;
@@ -303,13 +304,10 @@ public:
rebalanceUnknownSubgraphs();
}
- /// The probability for the first successor of a unknown subgraph
- static constexpr double UnknownFirstSuccProbability = 0.5;
-
private:
void joinIsolatedComponents() {
// Find blocks that are reachable from the source
- auto Visited = std::vector<bool>(NumBlocks(), false);
+ auto Visited = BitVector(NumBlocks(), false);
findReachable(Func.Entry, Visited);
// Iterate over all non-reachable blocks and adjust their weights
@@ -334,7 +332,7 @@ private:
/// Run BFS from a given block along the jumps with a positive flow and mark
/// all reachable blocks.
- void findReachable(uint64_t Src, std::vector<bool> &Visited) {
+ void findReachable(uint64_t Src, BitVector &Visited) {
if (Visited[Src])
return;
std::queue<uint64_t> Queue;
@@ -452,44 +450,70 @@ private:
uint64_t NumBlocks() const { return Func.Blocks.size(); }
- /// Rebalance unknown subgraphs so as each branch splits with probabilities
- /// UnknownFirstSuccProbability and 1 - UnknownFirstSuccProbability
+ /// Rebalance unknown subgraphs so that the flow is split evenly across the
+ /// outgoing branches of every block of the subgraph. The method iterates over
+ /// blocks with known weight and identifies unknown subgraphs rooted at the
+ /// blocks. Then it verifies if flow rebalancing is feasible and applies it.
void rebalanceUnknownSubgraphs() {
- assert(UnknownFirstSuccProbability >= 0.0 &&
- UnknownFirstSuccProbability <= 1.0 &&
- "the share of the unknown successor should be between 0 and 1");
- // Try to find unknown subgraphs from each non-unknown block
+ // Try to find unknown subgraphs from each block
for (uint64_t I = 0; I < Func.Blocks.size(); I++) {
auto SrcBlock = &Func.Blocks[I];
- // Do not attempt to find unknown successors from a unknown or a
- // zero-flow block
- if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0)
+ // Verify if rebalancing rooted at SrcBlock is feasible
+ if (!canRebalanceAtRoot(SrcBlock))
continue;
- std::vector<FlowBlock *> UnknownSuccs;
+ // Find an unknown subgraph starting at SrcBlock. Along the way,
+ // fill in known destinations and intermediate unknown blocks.
+ std::vector<FlowBlock *> UnknownBlocks;
+ std::vector<FlowBlock *> KnownDstBlocks;
+ findUnknownSubgraph(SrcBlock, KnownDstBlocks, UnknownBlocks);
+
+ // Verify if rebalancing of the subgraph is feasible. If the search is
+ // successful, find the unique destination block (which can be null)
FlowBlock *DstBlock = nullptr;
- // Find a unknown subgraphs starting at block SrcBlock
- if (!findUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs))
+ if (!canRebalanceSubgraph(SrcBlock, KnownDstBlocks, UnknownBlocks,
+ DstBlock))
continue;
- // At the moment, we do not rebalance subgraphs containing cycles among
- // unknown blocks
- if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownSuccs))
+
+ // We cannot rebalance subgraphs containing cycles among unknown blocks
+ if (!isAcyclicSubgraph(SrcBlock, DstBlock, UnknownBlocks))
continue;
// Rebalance the flow
- rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownSuccs);
+ rebalanceUnknownSubgraph(SrcBlock, DstBlock, UnknownBlocks);
}
}
- /// Find a unknown subgraph starting at block SrcBlock.
- /// If the search is successful, the method sets DstBlock and UnknownSuccs.
- bool findUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *&DstBlock,
- std::vector<FlowBlock *> &UnknownSuccs) {
+ /// Verify if rebalancing rooted at a given block is possible.
+ bool canRebalanceAtRoot(const FlowBlock *SrcBlock) {
+ // Do not attempt to find unknown subgraphs from an unknown or a
+ // zero-flow block
+ if (SrcBlock->UnknownWeight || SrcBlock->Flow == 0)
+ return false;
+
+ // Do not attempt to process subgraphs from a block w/o unknown successors
+ bool HasUnknownSuccs = false;
+ for (auto Jump : SrcBlock->SuccJumps) {
+ if (Func.Blocks[Jump->Target].UnknownWeight) {
+ HasUnknownSuccs = true;
+ break;
+ }
+ }
+ if (!HasUnknownSuccs)
+ return false;
+
+ return true;
+ }
+
+ /// Find an unknown subgraph starting at block SrcBlock. The method sets
+ /// identified destinations, KnownDstBlocks, and intermediate UnknownBlocks.
+ void findUnknownSubgraph(const FlowBlock *SrcBlock,
+ std::vector<FlowBlock *> &KnownDstBlocks,
+ std::vector<FlowBlock *> &UnknownBlocks) {
// Run BFS from SrcBlock and make sure all paths are going through unknown
// blocks and end at a non-unknown DstBlock
- auto Visited = std::vector<bool>(NumBlocks(), false);
+ auto Visited = BitVector(NumBlocks(), false);
std::queue<uint64_t> Queue;
- DstBlock = nullptr;
Queue.push(SrcBlock->Index);
Visited[SrcBlock->Index] = true;
@@ -498,52 +522,105 @@ private:
Queue.pop();
// Process blocks reachable from Block
for (auto Jump : Block.SuccJumps) {
+ // If Jump can be ignored, skip it
+ if (ignoreJump(SrcBlock, nullptr, Jump))
+ continue;
+
uint64_t Dst = Jump->Target;
+ // If Dst has been visited, skip Jump
if (Visited[Dst])
continue;
+ // Process block Dst
Visited[Dst] = true;
if (!Func.Blocks[Dst].UnknownWeight) {
- // If we see non-unique non-unknown block reachable from SrcBlock,
- // stop processing and skip rebalancing
- FlowBlock *CandidateDstBlock = &Func.Blocks[Dst];
- if (DstBlock != nullptr && DstBlock != CandidateDstBlock)
- return false;
- DstBlock = CandidateDstBlock;
+ KnownDstBlocks.push_back(&Func.Blocks[Dst]);
} else {
Queue.push(Dst);
- UnknownSuccs.push_back(&Func.Blocks[Dst]);
+ UnknownBlocks.push_back(&Func.Blocks[Dst]);
}
}
}
+ }
+ /// Verify if rebalancing of the subgraph is feasible. If the checks are
+ /// successful, set the unique destination block, DstBlock (can be null).
+ bool canRebalanceSubgraph(const FlowBlock *SrcBlock,
+ const std::vector<FlowBlock *> &KnownDstBlocks,
+ const std::vector<FlowBlock *> &UnknownBlocks,
+ FlowBlock *&DstBlock) {
// If the list of unknown blocks is empty, we don't need rebalancing
- if (UnknownSuccs.empty())
+ if (UnknownBlocks.empty())
return false;
- // If all reachable nodes from SrcBlock are unknown, skip rebalancing
- if (DstBlock == nullptr)
+
+ // If there are multiple known sinks, we can't rebalance
+ if (KnownDstBlocks.size() > 1)
return false;
- // If any of the unknown blocks is an exit block, skip rebalancing
- for (auto Block : UnknownSuccs) {
- if (Block->isExit())
+ DstBlock = KnownDstBlocks.empty() ? nullptr : KnownDstBlocks.front();
+
+ // Verify sinks of the subgraph
+ for (auto Block : UnknownBlocks) {
+ if (Block->SuccJumps.empty()) {
+ // If there are multiple (known and unknown) sinks, we can't rebalance
+ if (DstBlock != nullptr)
+ return false;
+ continue;
+ }
+ size_t NumIgnoredJumps = 0;
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ NumIgnoredJumps++;
+ }
+ // If there is a non-sink block in UnknownBlocks with all jumps ignored,
+ // then we can't rebalance
+ if (NumIgnoredJumps == Block->SuccJumps.size())
return false;
}
return true;
}
+ /// Decide whether the Jump is ignored while processing an unknown subgraph
+ /// rooted at basic block SrcBlock with the destination block, DstBlock.
+ bool ignoreJump(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+ const FlowJump *Jump) {
+ // Ignore unlikely jumps with zero flow
+ if (Jump->IsUnlikely && Jump->Flow == 0)
+ return true;
+
+ auto JumpSource = &Func.Blocks[Jump->Source];
+ auto JumpTarget = &Func.Blocks[Jump->Target];
+
+ // Do not ignore jumps coming into DstBlock
+ if (DstBlock != nullptr && JumpTarget == DstBlock)
+ return false;
+
+ // Ignore jumps out of SrcBlock to known blocks
+ if (!JumpTarget->UnknownWeight && JumpSource == SrcBlock)
+ return true;
+
+ // Ignore jumps to known blocks with zero flow
+ if (!JumpTarget->UnknownWeight && JumpTarget->Flow == 0)
+ return true;
+
+ return false;
+ }
+
/// Verify if the given unknown subgraph is acyclic, and if yes, reorder
- /// UnknownSuccs in the topological order (so that all jumps are "forward").
- bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
- std::vector<FlowBlock *> &UnknownSuccs) {
+ /// UnknownBlocks in the topological order (so that all jumps are "forward").
+ bool isAcyclicSubgraph(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+ std::vector<FlowBlock *> &UnknownBlocks) {
// Extract local in-degrees in the considered subgraph
auto LocalInDegree = std::vector<uint64_t>(NumBlocks(), 0);
- for (auto Jump : SrcBlock->SuccJumps) {
- LocalInDegree[Jump->Target]++;
- }
- for (uint64_t I = 0; I < UnknownSuccs.size(); I++) {
- for (auto Jump : UnknownSuccs[I]->SuccJumps) {
+ auto fillInDegree = [&](const FlowBlock *Block) {
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
LocalInDegree[Jump->Target]++;
}
+ };
+ fillInDegree(SrcBlock);
+ for (auto Block : UnknownBlocks) {
+ fillInDegree(Block);
}
// A loop containing SrcBlock
if (LocalInDegree[SrcBlock->Index] > 0)
@@ -553,15 +630,20 @@ private:
std::queue<uint64_t> Queue;
Queue.push(SrcBlock->Index);
while (!Queue.empty()) {
- auto &Block = Func.Blocks[Queue.front()];
+ FlowBlock *Block = &Func.Blocks[Queue.front()];
Queue.pop();
- // Stop propagation once we reach DstBlock
- if (Block.Index == DstBlock->Index)
+ // Stop propagation once we reach DstBlock, if any
+ if (DstBlock != nullptr && Block == DstBlock)
break;
- AcyclicOrder.push_back(&Block);
+ // Keep an acyclic order of unknown blocks
+ if (Block->UnknownWeight && Block != SrcBlock)
+ AcyclicOrder.push_back(Block);
+
// Add to the queue all successors with zero local in-degree
- for (auto Jump : Block.SuccJumps) {
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
uint64_t Dst = Jump->Target;
LocalInDegree[Dst]--;
if (LocalInDegree[Dst] == 0) {
@@ -572,42 +654,69 @@ private:
// If there is a cycle in the subgraph, AcyclicOrder contains only a subset
// of all blocks
- if (UnknownSuccs.size() + 1 != AcyclicOrder.size())
+ if (UnknownBlocks.size() != AcyclicOrder.size())
return false;
- UnknownSuccs = AcyclicOrder;
+ UnknownBlocks = AcyclicOrder;
return true;
}
- /// Rebalance a given subgraph.
- void rebalanceUnknownSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock,
- std::vector<FlowBlock *> &UnknownSuccs) {
+ /// Rebalance a given subgraph rooted at SrcBlock, ending at DstBlock and
+ /// having UnknownBlocks as intermediate blocks.
+ void rebalanceUnknownSubgraph(const FlowBlock *SrcBlock,
+ const FlowBlock *DstBlock,
+ const std::vector<FlowBlock *> &UnknownBlocks) {
assert(SrcBlock->Flow > 0 && "zero-flow block in unknown subgraph");
- assert(UnknownSuccs.front() == SrcBlock && "incorrect order of unknowns");
- for (auto Block : UnknownSuccs) {
+ // Distribute flow from the source block
+ uint64_t BlockFlow = 0;
+ // SrcBlock's flow is the sum of outgoing flows along non-ignored jumps
+ for (auto Jump : SrcBlock->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
+ BlockFlow += Jump->Flow;
+ }
+ rebalanceBlock(SrcBlock, DstBlock, SrcBlock, BlockFlow);
+
+ // Distribute flow from the remaining blocks
+ for (auto Block : UnknownBlocks) {
+ assert(Block->UnknownWeight && "incorrect unknown subgraph");
+ uint64_t BlockFlow = 0;
// Block's flow is the sum of incoming flows
- uint64_t TotalFlow = 0;
- if (Block == SrcBlock) {
- TotalFlow = Block->Flow;
- } else {
- for (auto Jump : Block->PredJumps) {
- TotalFlow += Jump->Flow;
- }
- Block->Flow = TotalFlow;
+ for (auto Jump : Block->PredJumps) {
+ BlockFlow += Jump->Flow;
}
+ Block->Flow = BlockFlow;
+ rebalanceBlock(SrcBlock, DstBlock, Block, BlockFlow);
+ }
+ }
- // Process all successor jumps and update corresponding flow values
- for (uint64_t I = 0; I < Block->SuccJumps.size(); I++) {
- auto Jump = Block->SuccJumps[I];
- if (I + 1 == Block->SuccJumps.size()) {
- Jump->Flow = TotalFlow;
- continue;
- }
- uint64_t Flow = uint64_t(TotalFlow * UnknownFirstSuccProbability);
- Jump->Flow = Flow;
- TotalFlow -= Flow;
- }
+ /// Redistribute flow for a block in a subgraph rooted at SrcBlock,
+ /// and ending at DstBlock.
+ void rebalanceBlock(const FlowBlock *SrcBlock, const FlowBlock *DstBlock,
+ const FlowBlock *Block, uint64_t BlockFlow) {
+ // Process all successor jumps and update corresponding flow values
+ size_t BlockDegree = 0;
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
+ BlockDegree++;
+ }
+ // If all successor jumps of the block are ignored, skip it
+ if (DstBlock == nullptr && BlockDegree == 0)
+ return;
+ assert(BlockDegree > 0 && "all outgoing jumps are ignored");
+
+ // Each of the Block's successors gets the following amount of flow.
+ // Rounding the value up so that all flow is propagated
+ uint64_t SuccFlow = (BlockFlow + BlockDegree - 1) / BlockDegree;
+ for (auto Jump : Block->SuccJumps) {
+ if (ignoreJump(SrcBlock, DstBlock, Jump))
+ continue;
+ uint64_t Flow = std::min(SuccFlow, BlockFlow);
+ Jump->Flow = Flow;
+ BlockFlow -= Flow;
}
+ assert(BlockFlow == 0 && "not all flow is propagated");
}
/// A constant indicating an arbitrary exit block of a function.
@@ -799,7 +908,7 @@ void verifyWeights(const FlowFunction &Func) {
// Run BFS from the source along edges with positive flow
std::queue<uint64_t> Queue;
- auto Visited = std::vector<bool>(NumBlocks, false);
+ auto Visited = BitVector(NumBlocks, false);
Queue.push(Func.Entry);
Visited[Func.Entry] = true;
while (!Queue.empty()) {
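
A standalone sketch of the rounded-up even split that rebalanceBlock applies above: BlockFlow units are spread over the non-ignored outgoing jumps, each share rounded up so that the running remainder absorbs the slack and the total is preserved exactly. The function name, signature, and example values are illustrative only, not part of the patch.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Split BlockFlow as evenly as possible across Degree successors.
std::vector<uint64_t> splitFlowEvenly(uint64_t BlockFlow, size_t Degree) {
  assert(Degree > 0 && "need at least one outgoing jump");
  // Round the per-successor share up so that all flow is propagated.
  uint64_t SuccFlow = (BlockFlow + Degree - 1) / Degree;
  std::vector<uint64_t> Shares;
  for (size_t I = 0; I < Degree; ++I) {
    uint64_t Flow = std::min(SuccFlow, BlockFlow);
    Shares.push_back(Flow);
    BlockFlow -= Flow;
  }
  assert(BlockFlow == 0 && "not all flow is propagated");
  return Shares;
}
// For example, splitFlowEvenly(10, 3) yields {4, 4, 2}.
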
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index c840ee85795f..5363a851fc27 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -173,7 +173,7 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) {
auto *PtrTy = cast<PointerType>(Ty);
if (DL.isNonIntegralPointerType(PtrTy)) {
auto *Int8PtrTy = Builder.getInt8PtrTy(PtrTy->getAddressSpace());
- assert(DL.getTypeAllocSize(Int8PtrTy->getElementType()) == 1 &&
+ assert(DL.getTypeAllocSize(Builder.getInt8Ty()) == 1 &&
"alloc size of i8 must by 1 byte for the GEP to be correct");
auto *GEP = Builder.CreateGEP(
Builder.getInt8Ty(), Constant::getNullValue(Int8PtrTy), V, "uglygep");
@@ -471,7 +471,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
// indexes into the array implied by the pointer operand; the rest of
// the indices index into the element or field type selected by the
// preceding index.
- Type *ElTy = PTy->getElementType();
+ Type *ElTy = PTy->getNonOpaquePointerElementType();
for (;;) {
// If the scale size is not 0, attempt to factor out a scale for
// array indexing.
@@ -640,8 +640,8 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
Value *Casted = V;
if (V->getType() != PTy)
Casted = InsertNoopCastOfTo(Casted, PTy);
- Value *GEP = Builder.CreateGEP(PTy->getElementType(), Casted, GepIndices,
- "scevgep");
+ Value *GEP = Builder.CreateGEP(PTy->getNonOpaquePointerElementType(),
+ Casted, GepIndices, "scevgep");
Ops.push_back(SE.getUnknown(GEP));
}
@@ -1671,7 +1671,7 @@ Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) {
return Builder.CreateSExt(V, Ty);
}
-Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
+Value *SCEVExpander::expandSMaxExpr(const SCEVNAryExpr *S) {
Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
Type *Ty = LHS->getType();
for (int i = S->getNumOperands()-2; i >= 0; --i) {
@@ -1700,7 +1700,7 @@ Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
return LHS;
}
-Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
+Value *SCEVExpander::expandUMaxExpr(const SCEVNAryExpr *S) {
Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
Type *Ty = LHS->getType();
for (int i = S->getNumOperands()-2; i >= 0; --i) {
@@ -1729,7 +1729,7 @@ Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
return LHS;
}
-Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) {
+Value *SCEVExpander::expandSMinExpr(const SCEVNAryExpr *S) {
Value *LHS = expand(S->getOperand(S->getNumOperands() - 1));
Type *Ty = LHS->getType();
for (int i = S->getNumOperands() - 2; i >= 0; --i) {
@@ -1758,7 +1758,7 @@ Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) {
return LHS;
}
-Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) {
+Value *SCEVExpander::expandUMinExpr(const SCEVNAryExpr *S) {
Value *LHS = expand(S->getOperand(S->getNumOperands() - 1));
Type *Ty = LHS->getType();
for (int i = S->getNumOperands() - 2; i >= 0; --i) {
@@ -1787,6 +1787,40 @@ Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) {
return LHS;
}
+Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
+ return expandSMaxExpr(S);
+}
+
+Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
+ return expandUMaxExpr(S);
+}
+
+Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) {
+ return expandSMinExpr(S);
+}
+
+Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) {
+ return expandUMinExpr(S);
+}
+
+Value *SCEVExpander::visitSequentialUMinExpr(const SCEVSequentialUMinExpr *S) {
+ SmallVector<Value *> Ops;
+ for (const SCEV *Op : S->operands())
+ Ops.emplace_back(expand(Op));
+
+ Value *SaturationPoint =
+ MinMaxIntrinsic::getSaturationPoint(Intrinsic::umin, S->getType());
+
+ SmallVector<Value *> OpIsZero;
+ for (Value *Op : ArrayRef<Value *>(Ops).drop_back())
+ OpIsZero.emplace_back(Builder.CreateICmpEQ(Op, SaturationPoint));
+
+ Value *AnyOpIsZero = Builder.CreateLogicalOr(OpIsZero);
+
+ Value *NaiveUMin = expandUMinExpr(S);
+ return Builder.CreateSelect(AnyOpIsZero, SaturationPoint, NaiveUMin);
+}
+
Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty,
Instruction *IP, bool Root) {
setInsertPoint(IP);
@@ -1809,8 +1843,8 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) {
// instruction.
Instruction *Tmp;
if (Inst->getType()->isIntegerTy())
- Tmp =
- cast<Instruction>(Builder.CreateAdd(Inst, Inst, "tmp.lcssa.user"));
+ Tmp = cast<Instruction>(Builder.CreateIntToPtr(
+ Inst, Inst->getType()->getPointerTo(), "tmp.lcssa.user"));
else {
assert(Inst->getType()->isPointerTy());
Tmp = cast<Instruction>(Builder.CreatePtrToInt(
@@ -1947,22 +1981,14 @@ Value *SCEVExpander::expand(const SCEV *S) {
if (VO.second) {
if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) {
- Type *Ety = Vty->getPointerElementType();
int64_t Offset = VO.second->getSExtValue();
- int64_t ESize = SE.getTypeSizeInBits(Ety);
- if ((Offset * 8) % ESize == 0) {
- ConstantInt *Idx =
- ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize);
- V = Builder.CreateGEP(Ety, V, Idx, "scevgep");
- } else {
- ConstantInt *Idx =
- ConstantInt::getSigned(VO.second->getType(), -Offset);
- unsigned AS = Vty->getAddressSpace();
- V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS));
- V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx,
- "uglygep");
- V = Builder.CreateBitCast(V, Vty);
- }
+ ConstantInt *Idx =
+ ConstantInt::getSigned(VO.second->getType(), -Offset);
+ unsigned AS = Vty->getAddressSpace();
+ V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS));
+ V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx,
+ "uglygep");
+ V = Builder.CreateBitCast(V, Vty);
} else {
V = Builder.CreateSub(V, VO.second);
}
@@ -2271,10 +2297,27 @@ template<typename T> static InstructionCost costAndCollectOperands(
case scSMaxExpr:
case scUMaxExpr:
case scSMinExpr:
- case scUMinExpr: {
+ case scUMinExpr:
+ case scSequentialUMinExpr: {
// FIXME: should this ask the cost for Intrinsic's?
+ // The reduction tree.
Cost += CmpSelCost(Instruction::ICmp, S->getNumOperands() - 1, 0, 1);
Cost += CmpSelCost(Instruction::Select, S->getNumOperands() - 1, 0, 2);
+ switch (S->getSCEVType()) {
+ case scSequentialUMinExpr: {
+ // The safety net against poison.
+ // FIXME: this is broken.
+ Cost += CmpSelCost(Instruction::ICmp, S->getNumOperands() - 1, 0, 0);
+ Cost += ArithCost(Instruction::Or,
+ S->getNumOperands() > 2 ? S->getNumOperands() - 2 : 0);
+ Cost += CmpSelCost(Instruction::Select, 1, 0, 1);
+ break;
+ }
+ default:
+ assert(!isa<SCEVSequentialMinMaxExpr>(S) &&
+ "Unhandled SCEV expression type?");
+ break;
+ }
break;
}
case scAddRecExpr: {
@@ -2362,7 +2405,7 @@ bool SCEVExpander::isHighCostExpansionHelper(
case scConstant: {
// Only evalulate the costs of constants when optimizing for size.
if (CostKind != TargetTransformInfo::TCK_CodeSize)
- return 0;
+ return false;
const APInt &Imm = cast<SCEVConstant>(S)->getAPInt();
Type *Ty = S->getType();
Cost += TTI.getIntImmCostInst(
@@ -2399,7 +2442,8 @@ bool SCEVExpander::isHighCostExpansionHelper(
case scUMaxExpr:
case scSMaxExpr:
case scUMinExpr:
- case scSMinExpr: {
+ case scSMinExpr:
+ case scSequentialUMinExpr: {
assert(cast<SCEVNAryExpr>(S)->getNumOperands() > 1 &&
"Nary expr should have more than 1 operand.");
// The simple nary expr will require one less op (or pair of ops)
@@ -2490,49 +2534,73 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
Value *StepCompare = Builder.CreateICmp(ICmpInst::ICMP_SLT, StepValue, Zero);
Value *AbsStep = Builder.CreateSelect(StepCompare, NegStepValue, StepValue);
- // Get the backedge taken count and truncate or extended to the AR type.
- Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
-
// Compute |Step| * Backedge
- Value *MulV, *OfMul;
- if (Step->isOne()) {
- // Special-case Step of one. Potentially-costly `umul_with_overflow` isn't
- // needed, there is never an overflow, so to avoid artificially inflating
- // the cost of the check, directly emit the optimized IR.
- MulV = TruncTripCount;
- OfMul = ConstantInt::getFalse(MulV->getContext());
- } else {
- auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
- Intrinsic::umul_with_overflow, Ty);
- CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
- MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
- OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
- }
-
// Compute:
- // Start + |Step| * Backedge < Start
- // Start - |Step| * Backedge > Start
- Value *Add = nullptr, *Sub = nullptr;
- if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARTy)) {
- StartValue = InsertNoopCastOfTo(
- StartValue, Builder.getInt8PtrTy(ARPtrTy->getAddressSpace()));
- Value *NegMulV = Builder.CreateNeg(MulV);
- Add = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, MulV);
- Sub = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, NegMulV);
- } else {
- Add = Builder.CreateAdd(StartValue, MulV);
- Sub = Builder.CreateSub(StartValue, MulV);
- }
-
- Value *EndCompareGT = Builder.CreateICmp(
- Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue);
+ // 1. Start + |Step| * Backedge < Start
+ // 2. Start - |Step| * Backedge > Start
+ //
+ // And select either 1. or 2. depending on whether step is positive or
+ // negative. If Step is known to be positive or negative, only create
+ // either 1. or 2.
+ auto ComputeEndCheck = [&]() -> Value * {
+ // Checking <u 0 is always false.
+ if (!Signed && Start->isZero() && SE.isKnownPositive(Step))
+ return ConstantInt::getFalse(Loc->getContext());
+
+ // Get the backedge taken count and truncate or extend it to the AR type.
+ Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
+
+ Value *MulV, *OfMul;
+ if (Step->isOne()) {
+ // Special-case Step of one. Potentially-costly `umul_with_overflow` isn't
+ // needed, there is never an overflow, so to avoid artificially inflating
+ // the cost of the check, directly emit the optimized IR.
+ MulV = TruncTripCount;
+ OfMul = ConstantInt::getFalse(MulV->getContext());
+ } else {
+ auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
+ Intrinsic::umul_with_overflow, Ty);
+ CallInst *Mul =
+ Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
+ MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
+ OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
+ }
- Value *EndCompareLT = Builder.CreateICmp(
- Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue);
+ Value *Add = nullptr, *Sub = nullptr;
+ bool NeedPosCheck = !SE.isKnownNegative(Step);
+ bool NeedNegCheck = !SE.isKnownPositive(Step);
+
+ if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARTy)) {
+ StartValue = InsertNoopCastOfTo(
+ StartValue, Builder.getInt8PtrTy(ARPtrTy->getAddressSpace()));
+ Value *NegMulV = Builder.CreateNeg(MulV);
+ if (NeedPosCheck)
+ Add = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, MulV);
+ if (NeedNegCheck)
+ Sub = Builder.CreateGEP(Builder.getInt8Ty(), StartValue, NegMulV);
+ } else {
+ if (NeedPosCheck)
+ Add = Builder.CreateAdd(StartValue, MulV);
+ if (NeedNegCheck)
+ Sub = Builder.CreateSub(StartValue, MulV);
+ }
- // Select the answer based on the sign of Step.
- Value *EndCheck =
- Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT);
+ Value *EndCompareLT = nullptr;
+ Value *EndCompareGT = nullptr;
+ Value *EndCheck = nullptr;
+ if (NeedPosCheck)
+ EndCheck = EndCompareLT = Builder.CreateICmp(
+ Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue);
+ if (NeedNegCheck)
+ EndCheck = EndCompareGT = Builder.CreateICmp(
+ Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue);
+ if (NeedPosCheck && NeedNegCheck) {
+ // Select the answer based on the sign of Step.
+ EndCheck = Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT);
+ }
+ return Builder.CreateOr(EndCheck, OfMul);
+ };
+ Value *EndCheck = ComputeEndCheck();
// If the backedge taken count type is larger than the AR type,
// check that we don't drop any bits by truncating it. If we are
@@ -2548,7 +2616,7 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
EndCheck = Builder.CreateOr(EndCheck, BackedgeCheck);
}
- return Builder.CreateOr(EndCheck, OfMul);
+ return EndCheck;
}
Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred,
@@ -2578,17 +2646,16 @@ Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred,
Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union,
Instruction *IP) {
- auto *BoolType = IntegerType::get(IP->getContext(), 1);
- Value *Check = ConstantInt::getNullValue(BoolType);
-
// Loop over all checks in this set.
+ SmallVector<Value *> Checks;
for (auto Pred : Union->getPredicates()) {
- auto *NextCheck = expandCodeForPredicate(Pred, IP);
+ Checks.push_back(expandCodeForPredicate(Pred, IP));
Builder.SetInsertPoint(IP);
- Check = Builder.CreateOr(Check, NextCheck);
}
- return Check;
+ if (Checks.empty())
+ return ConstantInt::getFalse(IP->getContext());
+ return Builder.CreateOr(Checks);
}
Value *SCEVExpander::fixupLCSSAFormFor(Instruction *User, unsigned OpIdx) {
@@ -2720,13 +2787,8 @@ void SCEVExpanderCleaner::cleanup() {
// Remove sets with value handles.
Expander.clear();
- // Sort so that earlier instructions do not dominate later instructions.
- stable_sort(InsertedInstructions, [this](Instruction *A, Instruction *B) {
- return DT.dominates(B, A);
- });
// Remove all inserted instructions.
- for (Instruction *I : InsertedInstructions) {
-
+ for (Instruction *I : reverse(InsertedInstructions)) {
#ifndef NDEBUG
assert(all_of(I->users(),
[&InsertedSet](Value *U) {
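
The new visitSequentialUMinExpr above guards a plain umin reduction with a check that any earlier operand already equals the saturation point (0 for umin), so later operands cannot reintroduce poison. Below is a scalar sketch of that semantics, independent of the IRBuilder API; the function name and the use of uint64_t are illustrative assumptions.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <initializer_list>

// Sequential unsigned min: if any operand before the last is already at the
// saturation point (0), the result is 0 without depending on later operands.
uint64_t sequentialUMin(std::initializer_list<uint64_t> Ops) {
  const uint64_t SaturationPoint = 0;
  bool AnyOpIsZero = false;
  uint64_t NaiveUMin = UINT64_MAX;
  size_t Remaining = Ops.size();
  for (uint64_t Op : Ops) {
    if (--Remaining > 0) // mirrors drop_back(): the last operand is not checked
      AnyOpIsZero |= (Op == SaturationPoint);
    NaiveUMin = std::min(NaiveUMin, Op);
  }
  return AnyOpIsZero ? SaturationPoint : NaiveUMin;
}
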
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 1046998c26de..335ac03ccb52 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2052,109 +2052,119 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB,
if (ScanIdx == 0)
return false;
- // Okay, we *could* sink last ScanIdx instructions. But how many can we
- // actually sink before encountering instruction that is unprofitable to sink?
- auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
- unsigned NumPHIdValues = 0;
- for (auto *I : *LRI)
- for (auto *V : PHIOperands[I]) {
- if (!InstructionsToSink.contains(V))
- ++NumPHIdValues;
- // FIXME: this check is overly optimistic. We may end up not sinking
- // said instruction, due to the very same profitability check.
- // See @creating_too_many_phis in sink-common-code.ll.
- }
- LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
- unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
- if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
+ bool followedByDeoptOrUnreachable = IsBlockFollowedByDeoptOrUnreachable(BB);
+
+ if (!followedByDeoptOrUnreachable) {
+ // Okay, we *could* sink last ScanIdx instructions. But how many can we
+ // actually sink before encountering instruction that is unprofitable to
+ // sink?
+ auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
+ unsigned NumPHIdValues = 0;
+ for (auto *I : *LRI)
+ for (auto *V : PHIOperands[I]) {
+ if (!InstructionsToSink.contains(V))
+ ++NumPHIdValues;
+ // FIXME: this check is overly optimistic. We may end up not sinking
+ // said instruction, due to the very same profitability check.
+ // See @creating_too_many_phis in sink-common-code.ll.
+ }
+ LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
+ unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
+ if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
NumPHIInsts++;
- return NumPHIInsts <= 1;
- };
+ return NumPHIInsts <= 1;
+ };
- // We've determined that we are going to sink last ScanIdx instructions,
- // and recorded them in InstructionsToSink. Now, some instructions may be
- // unprofitable to sink. But that determination depends on the instructions
- // that we are going to sink.
-
- // First, forward scan: find the first instruction unprofitable to sink,
- // recording all the ones that are profitable to sink.
- // FIXME: would it be better, after we detect that not all are profitable.
- // to either record the profitable ones, or erase the unprofitable ones?
- // Maybe we need to choose (at runtime) the one that will touch least instrs?
- LRI.reset();
- int Idx = 0;
- SmallPtrSet<Value *, 4> InstructionsProfitableToSink;
- while (Idx < ScanIdx) {
- if (!ProfitableToSinkInstruction(LRI)) {
- // Too many PHIs would be created.
- LLVM_DEBUG(
- dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
- break;
+ // We've determined that we are going to sink last ScanIdx instructions,
+ // and recorded them in InstructionsToSink. Now, some instructions may be
+ // unprofitable to sink. But that determination depends on the instructions
+ // that we are going to sink.
+
+ // First, forward scan: find the first instruction unprofitable to sink,
+ // recording all the ones that are profitable to sink.
+ // FIXME: would it be better, after we detect that not all are profitable.
+ // to either record the profitable ones, or erase the unprofitable ones?
+ // Maybe we need to choose (at runtime) the one that will touch least
+ // instrs?
+ LRI.reset();
+ int Idx = 0;
+ SmallPtrSet<Value *, 4> InstructionsProfitableToSink;
+ while (Idx < ScanIdx) {
+ if (!ProfitableToSinkInstruction(LRI)) {
+ // Too many PHIs would be created.
+ LLVM_DEBUG(
+ dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
+ break;
+ }
+ InstructionsProfitableToSink.insert((*LRI).begin(), (*LRI).end());
+ --LRI;
+ ++Idx;
}
- InstructionsProfitableToSink.insert((*LRI).begin(), (*LRI).end());
- --LRI;
- ++Idx;
- }
- // If no instructions can be sunk, early-return.
- if (Idx == 0)
- return false;
+ // If no instructions can be sunk, early-return.
+ if (Idx == 0)
+ return false;
- // Did we determine that (only) some instructions are unprofitable to sink?
- if (Idx < ScanIdx) {
- // Okay, some instructions are unprofitable.
- ScanIdx = Idx;
- InstructionsToSink = InstructionsProfitableToSink;
-
- // But, that may make other instructions unprofitable, too.
- // So, do a backward scan, do any earlier instructions become unprofitable?
- assert(!ProfitableToSinkInstruction(LRI) &&
- "We already know that the last instruction is unprofitable to sink");
- ++LRI;
- --Idx;
- while (Idx >= 0) {
- // If we detect that an instruction becomes unprofitable to sink,
- // all earlier instructions won't be sunk either,
- // so preemptively keep InstructionsProfitableToSink in sync.
- // FIXME: is this the most performant approach?
- for (auto *I : *LRI)
- InstructionsProfitableToSink.erase(I);
- if (!ProfitableToSinkInstruction(LRI)) {
- // Everything starting with this instruction won't be sunk.
- ScanIdx = Idx;
- InstructionsToSink = InstructionsProfitableToSink;
- }
+ // Did we determine that (only) some instructions are unprofitable to sink?
+ if (Idx < ScanIdx) {
+ // Okay, some instructions are unprofitable.
+ ScanIdx = Idx;
+ InstructionsToSink = InstructionsProfitableToSink;
+
+ // But, that may make other instructions unprofitable, too.
+ // So, do a backward scan, do any earlier instructions become
+ // unprofitable?
+ assert(
+ !ProfitableToSinkInstruction(LRI) &&
+ "We already know that the last instruction is unprofitable to sink");
++LRI;
--Idx;
+ while (Idx >= 0) {
+ // If we detect that an instruction becomes unprofitable to sink,
+ // all earlier instructions won't be sunk either,
+ // so preemptively keep InstructionsProfitableToSink in sync.
+ // FIXME: is this the most performant approach?
+ for (auto *I : *LRI)
+ InstructionsProfitableToSink.erase(I);
+ if (!ProfitableToSinkInstruction(LRI)) {
+ // Everything starting with this instruction won't be sunk.
+ ScanIdx = Idx;
+ InstructionsToSink = InstructionsProfitableToSink;
+ }
+ ++LRI;
+ --Idx;
+ }
}
- }
- // If no instructions can be sunk, early-return.
- if (ScanIdx == 0)
- return false;
+ // If no instructions can be sunk, early-return.
+ if (ScanIdx == 0)
+ return false;
+ }
bool Changed = false;
if (HaveNonUnconditionalPredecessors) {
- // It is always legal to sink common instructions from unconditional
- // predecessors. However, if not all predecessors are unconditional,
- // this transformation might be pessimizing. So as a rule of thumb,
- // don't do it unless we'd sink at least one non-speculatable instruction.
- // See https://bugs.llvm.org/show_bug.cgi?id=30244
- LRI.reset();
- int Idx = 0;
- bool Profitable = false;
- while (Idx < ScanIdx) {
- if (!isSafeToSpeculativelyExecute((*LRI)[0])) {
- Profitable = true;
- break;
+ if (!followedByDeoptOrUnreachable) {
+ // It is always legal to sink common instructions from unconditional
+ // predecessors. However, if not all predecessors are unconditional,
+ // this transformation might be pessimizing. So as a rule of thumb,
+ // don't do it unless we'd sink at least one non-speculatable instruction.
+ // See https://bugs.llvm.org/show_bug.cgi?id=30244
+ LRI.reset();
+ int Idx = 0;
+ bool Profitable = false;
+ while (Idx < ScanIdx) {
+ if (!isSafeToSpeculativelyExecute((*LRI)[0])) {
+ Profitable = true;
+ break;
+ }
+ --LRI;
+ ++Idx;
}
- --LRI;
- ++Idx;
+ if (!Profitable)
+ return false;
}
- if (!Profitable)
- return false;
LLVM_DEBUG(dbgs() << "SINK: Splitting edge\n");
// We have a conditional edge and we're going to sink some instructions.
@@ -4935,14 +4945,13 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
AssumptionCache *AC,
const DataLayout &DL) {
Value *Cond = SI->getCondition();
- unsigned Bits = Cond->getType()->getIntegerBitWidth();
KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI);
// We can also eliminate cases by determining that their values are outside of
// the limited range of the condition based on how many significant (non-sign)
// bits are in the condition value.
- unsigned ExtraSignBits = ComputeNumSignBits(Cond, DL, 0, AC, SI) - 1;
- unsigned MaxSignificantBitsInCond = Bits - ExtraSignBits;
+ unsigned MaxSignificantBitsInCond =
+ ComputeMaxSignificantBits(Cond, DL, 0, AC, SI);
// Gather dead cases.
SmallVector<ConstantInt *, 8> DeadCases;
@@ -4973,8 +4982,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
bool HasDefault =
!isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
const unsigned NumUnknownBits =
- Bits - (Known.Zero | Known.One).countPopulation();
- assert(NumUnknownBits <= Bits);
+ Known.getBitWidth() - (Known.Zero | Known.One).countPopulation();
+ assert(NumUnknownBits <= Known.getBitWidth());
if (HasDefault && DeadCases.empty() &&
NumUnknownBits < 64 /* avoid overflow */ &&
SI->getNumCases() == (1ULL << NumUnknownBits)) {
@@ -5796,10 +5805,9 @@ static void reuseTableCompare(
for (auto ValuePair : Values) {
Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
ValuePair.second, CmpOp1, true);
- if (!CaseConst || CaseConst == DefaultConst || isa<UndefValue>(CaseConst))
+ if (!CaseConst || CaseConst == DefaultConst ||
+ (CaseConst != TrueConst && CaseConst != FalseConst))
return;
- assert((CaseConst == TrueConst || CaseConst == FalseConst) &&
- "Expect true or false as compare result.");
}
// Check if the branch instruction dominates the phi node. It's a simple
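
The eliminateDeadSwitchCases hunk above replaces the hand-computed "Bits - (NumSignBits - 1)" with ComputeMaxSignificantBits, which is defined by the same identity. A small sketch of that identity on a plain 64-bit constant; the helper names and bit-twiddling here are illustrative and not ValueTracking's implementation.

#include <cstdint>

// Count leading copies of the sign bit (at least 1, the sign bit itself).
unsigned numSignBits(int64_t V, unsigned BitWidth = 64) {
  uint64_t U = static_cast<uint64_t>(V);
  uint64_t SignBit = (U >> (BitWidth - 1)) & 1;
  unsigned N = 1;
  while (N < BitWidth && ((U >> (BitWidth - 1 - N)) & 1) == SignBit)
    ++N;
  return N;
}

// Equivalent to the removed "Bits - (ComputeNumSignBits(...) - 1)" formula.
unsigned maxSignificantBits(int64_t V, unsigned BitWidth = 64) {
  return BitWidth - (numSignBits(V, BitWidth) - 1);
}
// For example, V = 5 (binary ...0101) has 61 sign bits, hence 4 significant bits.
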
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 02727a3dbf9c..e02d02a05752 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -602,7 +602,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
Align MemSetAlign =
CI->getAttributes().getParamAttrs(0).getAlignment().valueOrOne();
CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, MemSetAlign);
- AttrBuilder ArgAttrs(CI->getAttributes().getParamAttrs(0));
+ AttrBuilder ArgAttrs(CI->getContext(), CI->getAttributes().getParamAttrs(0));
NewCI->setAttributes(NewCI->getAttributes().addParamAttributes(
CI->getContext(), 0, ArgAttrs));
copyFlags(*CI, NewCI);
@@ -2515,8 +2515,9 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
} else if (Value *V = emitStpCpy(Dest, CI->getArgOperand(2), B, TLI)) {
// sprintf(dest, "%s", str) -> stpcpy(dest, str) - dest
// Handle mismatched pointer types (goes away with typeless pointers?).
- V = B.CreatePointerCast(V, Dest->getType());
- Value *PtrDiff = B.CreatePtrDiff(V, Dest);
+ V = B.CreatePointerCast(V, B.getInt8PtrTy());
+ Dest = B.CreatePointerCast(Dest, B.getInt8PtrTy());
+ Value *PtrDiff = B.CreatePtrDiff(B.getInt8Ty(), V, Dest);
return B.CreateIntCast(PtrDiff, CI->getType(), false);
}
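
The sprintf hunk above folds sprintf(dest, "%s", str) into stpcpy(dest, str) - dest, now computing the pointer difference with both operands cast to i8*. A sketch of the scalar equivalence being exploited; stpcpyLocal is a local stand-in for the POSIX stpcpy so the snippet stays self-contained, and the function names are illustrative.

// A tiny local stand-in for the POSIX stpcpy, so the sketch is self-contained.
static char *stpcpyLocal(char *Dst, const char *Src) {
  while ((*Dst = *Src) != '\0') {
    ++Dst;
    ++Src;
  }
  return Dst; // points at the terminating NUL, as stpcpy does
}

// sprintf(Dest, "%s", Str) returns the number of characters written, which is
// exactly the pointer difference computed by the transformed code.
int sprintfPercentS(char *Dest, const char *Str) {
  char *End = stpcpyLocal(Dest, Str);
  return static_cast<int>(End - Dest);
}
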
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index b822db938af8..8947303674ee 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -398,13 +398,17 @@ Value *Mapper::mapValue(const Value *V) {
SmallVector<ValueAsMetadata *, 4> MappedArgs;
for (auto *VAM : AL->getArgs()) {
// Map both Local and Constant VAMs here; they will both ultimately
- // be mapped via mapValue (apart from constants when we have no
- // module level changes, which have an identity mapping).
+ // be mapped via mapValue. The exceptions are constants when we have no
+ // module level changes and locals when they have no existing mapped
+ // value and RF_IgnoreMissingLocals is set; these have identity
+ // mappings.
if ((Flags & RF_NoModuleLevelChanges) && isa<ConstantAsMetadata>(VAM)) {
MappedArgs.push_back(VAM);
} else if (Value *LV = mapValue(VAM->getValue())) {
MappedArgs.push_back(
LV == VAM->getValue() ? VAM : ValueAsMetadata::get(LV));
+ } else if ((Flags & RF_IgnoreMissingLocals) && isa<LocalAsMetadata>(VAM)) {
+ MappedArgs.push_back(VAM);
} else {
// If we cannot map the value, set the argument as undef.
MappedArgs.push_back(ValueAsMetadata::get(
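
The ValueMapper hunk above adds a third outcome for metadata argument values: locals with no existing mapping keep their identity when RF_IgnoreMissingLocals is set, instead of being turned into undef. A simplified sketch of the resulting policy, using plain strings and a map in place of the ValueMapper machinery; the names, flag values, and "undef" string are illustrative.

#include <string>
#include <unordered_map>

enum MapFlags { NoModuleLevelChanges = 1, IgnoreMissingLocals = 2 };

std::string mapMetadataArg(const std::string &V, bool IsConstant, bool IsLocal,
                           unsigned Flags,
                           const std::unordered_map<std::string, std::string> &VM) {
  if ((Flags & NoModuleLevelChanges) && IsConstant)
    return V;            // constants keep identity when no module-level changes
  if (auto It = VM.find(V); It != VM.end())
    return It->second;   // value has an existing mapping
  if ((Flags & IgnoreMissingLocals) && IsLocal)
    return V;            // new case: unmapped locals keep identity
  return "undef";        // otherwise the argument becomes undef
}
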
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 5a4a2f0924f6..97c2acb7d4c7 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -698,8 +698,9 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
ChainInstrs.push_back(&I);
continue;
}
- if (I.mayThrow()) {
- LLVM_DEBUG(dbgs() << "LSV: Found may-throw operation: " << I << '\n');
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
+ LLVM_DEBUG(dbgs() << "LSV: Found instruction may not transfer execution: "
+ << I << '\n');
break;
}
if (I.mayReadOrWriteMemory())
@@ -853,13 +854,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
(VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
- // Make sure all the users of a vector are constant-index extracts.
- if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
- const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
- return EEI && isa<ConstantInt>(EEI->getOperand(1));
- }))
- continue;
-
// Save the load locations.
const ChainID ID = getChainID(Ptr);
LoadRefs[ID].push_back(LI);
@@ -900,12 +894,6 @@ Vectorizer::collectInstructions(BasicBlock *BB) {
(VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
- if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
- const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
- return EEI && isa<ConstantInt>(EEI->getOperand(1));
- }))
- continue;
-
// Save store location.
const ChainID ID = getChainID(Ptr);
StoreRefs[ID].push_back(SI);
@@ -1289,52 +1277,32 @@ bool Vectorizer::vectorizeLoadChain(
Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
propagateMetadata(LI, Chain);
- if (VecLoadTy) {
- SmallVector<Instruction *, 16> InstrsToErase;
-
- unsigned VecWidth = VecLoadTy->getNumElements();
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- for (auto Use : Chain[I]->users()) {
- // All users of vector loads are ExtractElement instructions with
- // constant indices, otherwise we would have bailed before now.
- Instruction *UI = cast<Instruction>(Use);
- unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
- unsigned NewIdx = Idx + I * VecWidth;
- Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
- UI->getName());
- if (V->getType() != UI->getType())
- V = Builder.CreateBitCast(V, UI->getType());
-
- // Replace the old instruction.
- UI->replaceAllUsesWith(V);
- InstrsToErase.push_back(UI);
- }
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ Value *CV = Chain[I];
+ Value *V;
+ if (VecLoadTy) {
+ // Extract a subvector using shufflevector.
+ unsigned VecWidth = VecLoadTy->getNumElements();
+ auto Mask =
+ llvm::to_vector<8>(llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth));
+ V = Builder.CreateShuffleVector(LI, Mask, CV->getName());
+ } else {
+ V = Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
}
- // Bitcast might not be an Instruction, if the value being loaded is a
- // constant. In that case, no need to reorder anything.
- if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
- reorder(BitcastInst);
-
- for (auto I : InstrsToErase)
- I->eraseFromParent();
- } else {
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- Value *CV = Chain[I];
- Value *V =
- Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
- if (V->getType() != CV->getType()) {
- V = Builder.CreateBitOrPointerCast(V, CV->getType());
- }
-
- // Replace the old instruction.
- CV->replaceAllUsesWith(V);
+ if (V->getType() != CV->getType()) {
+ V = Builder.CreateBitOrPointerCast(V, CV->getType());
}
- if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
- reorder(BitcastInst);
+ // Replace the old instruction.
+ CV->replaceAllUsesWith(V);
}
+ // Bitcast might not be an Instruction, if the value being loaded is a
+ // constant. In that case, no need to reorder anything.
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
+
eraseInstructions(Chain);
++NumVectorInstructions;
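
In the vectorizeLoadChain rewrite above, per-element extracts for vector-typed chain members are replaced by a single shufflevector whose mask selects the contiguous lane range of chain element I. A sketch of that mask construction; the helper name is an assumption, and the patch builds the same sequence with llvm::seq.

#include <vector>

// Build the mask [I*VecWidth, (I+1)*VecWidth) selecting chain element I's lanes.
std::vector<int> subvectorMask(unsigned I, unsigned VecWidth) {
  std::vector<int> Mask;
  for (unsigned Lane = I * VecWidth; Lane < (I + 1) * VecWidth; ++Lane)
    Mask.push_back(static_cast<int>(Lane));
  return Mask;
}
// For example, subvectorMask(1, 4) == {4, 5, 6, 7}, the same sequence obtained
// from llvm::seq<int>(I * VecWidth, (I + 1) * VecWidth).
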
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4747f34fcc62..d11f4146b590 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -470,10 +470,11 @@ public:
/// on, while the old loop will be used as the scalar remainder. Control flow
/// is generated around the vectorized (and scalar epilogue) loops consisting
/// of various checks and bypasses. Return the pre-header block of the new
- /// loop.
- /// In the case of epilogue vectorization, this function is overriden to
- /// handle the more complex control flow around the loops.
- virtual BasicBlock *createVectorizedLoopSkeleton();
+ /// loop and the start value for the canonical induction, if it is != 0. The
+ /// latter is the case when vectorizing the epilogue loop. In the case of
+ /// epilogue vectorization, this function is overridden to handle the more
+ /// complex control flow around the loops.
+ virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
/// Widen a single call instruction within the innermost loop.
void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
@@ -507,10 +508,10 @@ public:
/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
- /// the corresponding type.
- void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
- Value *Start, TruncInst *Trunc, VPValue *Def,
- VPTransformState &State);
+ /// the corresponding type. \p CanonicalIV is the scalar value generated for
+ /// the canonical induction variable.
+ void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
+ VPTransformState &State, Value *CanonicalIV);
/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
@@ -556,6 +557,10 @@ public:
/// vector of instructions.
void addMetadata(ArrayRef<Value *> To, Instruction *From);
+ // Returns the resume value (bc.merge.rdx) for a reduction as
+ // generated by fixReduction.
+ PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
+
protected:
friend class LoopVectorizationPlanner;
@@ -573,16 +578,18 @@ protected:
Value *CountRoundDown, Value *EndValue,
BasicBlock *MiddleBlock);
- /// Create a new induction variable inside L.
- PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
- Value *Step, Instruction *DL);
+ /// Introduce a conditional branch (on true, condition to be set later) at the
+ /// end of the header=latch connecting it to itself (across the backedge) and
+ /// to the exit block of \p L.
+ void createHeaderBranch(Loop *L);
/// Handle all cross-iteration phis in the header.
void fixCrossIterationPHIs(VPTransformState &State);
/// Create the exit value of first order recurrences in the middle block and
/// update their users.
- void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
+ void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
+ VPTransformState &State);
/// Create code for the loop exit value of the reduction.
void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
@@ -606,14 +613,6 @@ protected:
/// represented as.
void truncateToMinimalBitwidths(VPTransformState &State);
- /// This function adds
- /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
- /// to each vector element of Val. The sequence starts at StartIndex.
- /// \p Opcode is relevant for FP induction variable.
- virtual Value *
- getStepVector(Value *Val, Value *StartIdx, Value *Step,
- Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
-
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step, and
/// \p EntryVal is the value from the original loop that maps to the steps.
@@ -640,9 +639,6 @@ protected:
/// Returns true if we should generate a scalar version of \p IV.
bool needsScalarInduction(Instruction *IV) const;
- /// Generate a shuffle sequence that will reverse the vector Vec.
- virtual Value *reverseVector(Value *Vec);
-
/// Returns (and creates if needed) the original loop trip count.
Value *getOrCreateTripCount(Loop *NewLoop);
@@ -685,14 +681,13 @@ protected:
Loop *createVectorLoopSkeleton(StringRef Prefix);
/// Create new phi nodes for the induction variables to resume iteration count
- /// in the scalar epilogue, from where the vectorized loop left off (given by
- /// \p VectorTripCount).
+ /// in the scalar epilogue, from where the vectorized loop left off.
/// In cases where the loop skeleton is more complicated (eg. epilogue
/// vectorization) and the resume values can come from an additional bypass
/// block, the \p AdditionalBypass pair provides information about the bypass
/// block and the end value on the edge from bypass to this loop.
void createInductionResumeValues(
- Loop *L, Value *VectorTripCount,
+ Loop *L,
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
/// Complete the loop skeleton by adding debug MDs, creating appropriate
@@ -795,12 +790,6 @@ protected:
/// A list of all bypass blocks. The first block is the entry of the loop.
SmallVector<BasicBlock *, 4> LoopBypassBlocks;
- /// The new Induction variable which was added to the new block.
- PHINode *Induction = nullptr;
-
- /// The induction variable of the old basic block.
- PHINode *OldInduction = nullptr;
-
/// Store instructions that were predicated.
SmallVector<Instruction *, 4> PredicatedInstructions;
@@ -838,6 +827,11 @@ protected:
/// Structure to hold information about generated runtime checks, responsible
/// for cleaning the checks, if vectorization turns out unprofitable.
GeneratedRTChecks &RTChecks;
+
+ // Holds the resume values for reductions in the loops, used to set the
+ // correct start value of reduction PHIs when vectorizing the epilogue.
+ SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
+ ReductionResumeValues;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@@ -856,10 +850,6 @@ public:
private:
Value *getBroadcastInstrs(Value *V) override;
- Value *getStepVector(
- Value *Val, Value *StartIdx, Value *Step,
- Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
- Value *reverseVector(Value *Vec) override;
};
/// Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -909,14 +899,16 @@ public:
// Override this function to handle the more complex control flow around the
// three loops.
- BasicBlock *createVectorizedLoopSkeleton() final override {
+ std::pair<BasicBlock *, Value *>
+ createVectorizedLoopSkeleton() final override {
return createEpilogueVectorizedLoopSkeleton();
}
/// The interface for creating a vectorized skeleton using one of two
/// different strategies, each corresponding to one execution of the vplan
/// as described above.
- virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
+ virtual std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton() = 0;
/// Holds and updates state information required to vectorize the main loop
/// and its epilogue in two separate passes. This setup helps us avoid
@@ -944,7 +936,8 @@ public:
EPI, LVL, CM, BFI, PSI, Check) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *main loop* strategy (ie the first pass of vplan execution).
- BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
+ std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton() final override;
protected:
/// Emits an iteration count bypass check once for the main loop (when \p
@@ -973,7 +966,8 @@ public:
EPI, LVL, CM, BFI, PSI, Checks) {}
/// Implements the interface for creating a vectorized skeleton using the
/// *epilogue loop* strategy (ie the second pass of vplan execution).
- BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
+ std::pair<BasicBlock *, Value *>
+ createEpilogueVectorizedLoopSkeleton() final override;
protected:
/// Emits an iteration count bypass check after the main vector loop has
@@ -1069,16 +1063,16 @@ static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
+namespace llvm {
+
/// Return a value for Step multiplied by VF.
-static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
- int64_t Step) {
+Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
+ int64_t Step) {
assert(Ty->isIntegerTy() && "Expected an integer step");
Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
-namespace llvm {
-
/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
@@ -1163,7 +1157,8 @@ void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
// will lead to gather/scatter instructions, which don't need to be
// handled.
if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
- isa<VPInterleaveRecipe>(CurRec))
+ isa<VPInterleaveRecipe>(CurRec) ||
+ isa<VPCanonicalIVPHIRecipe>(CurRec))
continue;
// This recipe contributes to the address computation of a widen
@@ -1232,6 +1227,14 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
}
}
+PHINode *InnerLoopVectorizer::getReductionResumeValue(
+ const RecurrenceDescriptor &RdxDesc) {
+ auto It = ReductionResumeValues.find(&RdxDesc);
+ assert(It != ReductionResumeValues.end() &&
+ "Expected to find a resume value for the reduction.");
+ return It->second;
+}
+
namespace llvm {
// Loop vectorization cost-model hints how the scalar epilogue loop should be
@@ -1556,13 +1559,16 @@ public:
/// Returns true if the target machine can represent \p V as a masked gather
/// or scatter operation.
- bool isLegalGatherOrScatter(Value *V) {
+ bool isLegalGatherOrScatter(Value *V,
+ ElementCount VF = ElementCount::getFixed(1)) {
bool LI = isa<LoadInst>(V);
bool SI = isa<StoreInst>(V);
if (!LI && !SI)
return false;
auto *Ty = getLoadStoreType(V);
Align Align = getLoadStoreAlignment(V);
+ if (VF.isVector())
+ Ty = VectorType::get(Ty, VF);
return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
(SI && TTI.isLegalMaskedScatter(Ty, Align));
}
@@ -1577,16 +1583,17 @@ public:
}
/// Returns true if \p I is an instruction that will be scalarized with
- /// predication. Such instructions include conditional stores and
- /// instructions that may divide by zero.
- /// If a non-zero VF has been calculated, we check if I will be scalarized
- /// predication for that VF.
- bool isScalarWithPredication(Instruction *I) const;
+ /// predication when vectorizing \p I with vectorization factor \p VF. Such
+ /// instructions include conditional stores and instructions that may divide
+ /// by zero.
+ bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
// Returns true if \p I is an instruction that will be predicated either
// through scalar predication or masked load/store or masked gather/scatter.
+ // \p VF is the vectorization factor that will be used to vectorize \p I.
// Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
+ bool isPredicatedInst(Instruction *I, ElementCount VF,
+ bool IsKnownUniform = false) {
// When we know the load is uniform and the original scalar loop was not
// predicated we don't need to mark it as a predicated instruction. Any
// vectorised blocks created when tail-folding are something artificial we
@@ -1602,7 +1609,7 @@ public:
// instructions.
if (isa<LoadInst>(I) || isa<StoreInst>(I))
return Legal->isMaskRequired(I);
- return isScalarWithPredication(I);
+ return isScalarWithPredication(I, VF);
}
/// Returns true if \p I is a memory instruction with consecutive memory
@@ -1794,7 +1801,7 @@ private:
/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
- bool useEmulatedMaskMemRefHack(Instruction *I);
+ bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
/// Map of scalar integer values to the smallest bitwidth they can be legally
/// represented as. The vector equivalents of these values should be truncated
@@ -2078,8 +2085,8 @@ public:
/// Remove the created SCEV & memory runtime check blocks & instructions, if
/// unused.
~GeneratedRTChecks() {
- SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
- SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
+ SCEVExpanderCleaner SCEVCleaner(SCEVExp);
+ SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
if (!SCEVCheckCond)
SCEVCleaner.markResultUsed();
@@ -2335,6 +2342,60 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
return Shuf;
}
+/// This function adds
+/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
+/// to each vector element of Val. The sequence starts at StartIndex.
+/// \p Opcode is relevant for FP induction variable.
+static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp, ElementCount VF,
+ IRBuilder<> &Builder) {
+ assert(VF.isVector() && "only vector VFs are supported");
+
+ // Create and check the types.
+ auto *ValVTy = cast<VectorType>(Val->getType());
+ ElementCount VLen = ValVTy->getElementCount();
+
+ Type *STy = Val->getType()->getScalarType();
+ assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
+ "Induction Step must be an integer or FP");
+ assert(Step->getType() == STy && "Step has wrong type");
+
+ SmallVector<Constant *, 8> Indices;
+
+ // Create a vector of consecutive numbers from zero to VF.
+ VectorType *InitVecValVTy = ValVTy;
+ Type *InitVecValSTy = STy;
+ if (STy->isFloatingPointTy()) {
+ InitVecValSTy =
+ IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
+ InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
+ }
+ Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
+
+ // Splat the StartIdx
+ Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
+
+ if (STy->isIntegerTy()) {
+ InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ assert(Step->getType() == Val->getType() && "Invalid step vec");
+ // FIXME: The newly created binary instructions should contain nsw/nuw
+ // flags, which can be found from the original scalar operations.
+ Step = Builder.CreateMul(InitVec, Step);
+ return Builder.CreateAdd(Val, Step, "induction");
+ }
+
+ // Floating point induction.
+ assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
+ "Binary Opcode should be specified for FP induction");
+ InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
+ InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
+
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ Value *MulOp = Builder.CreateFMul(InitVec, Step);
+ return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
+}
+
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Value *Start,
Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
@@ -2355,8 +2416,8 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
- Value *SteppedStart =
- getStepVector(SplatStart, Zero, Step, II.getInductionOpcode());
+ Value *SteppedStart = getStepVector(
+ SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder);
// We create vector phi nodes for both integer and floating-point induction
// variables. Here, we determine the kind of arithmetic we will perform.
@@ -2411,8 +2472,7 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// placement of all induction updates.
auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
- auto *ICmp = cast<Instruction>(Br->getCondition());
- LastInduction->moveBefore(ICmp);
+ LastInduction->moveBefore(Br);
LastInduction->setName("vec.ind.next");
VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
@@ -2434,15 +2494,15 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
return llvm::any_of(IV->users(), isScalarInst);
}
-void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
- const InductionDescriptor &ID,
- Value *Start, TruncInst *Trunc,
- VPValue *Def,
- VPTransformState &State) {
+void InnerLoopVectorizer::widenIntOrFpInduction(
+ PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
+ Value *CanonicalIV) {
+ Value *Start = Def->getStartValue()->getLiveInIRValue();
+ const InductionDescriptor &ID = Def->getInductionDescriptor();
+ TruncInst *Trunc = Def->getTruncInst();
IRBuilder<> &Builder = State.Builder;
- assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
- "Primary induction variable must have an integer type");
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+ assert(!State.VF.isZero() && "VF must be non-zero");
// The value from the original loop to which we are mapping the new induction
// variable.
@@ -2468,12 +2528,13 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
// induction variable and step. Otherwise, derive these values from the
// induction descriptor.
auto CreateScalarIV = [&](Value *&Step) -> Value * {
- Value *ScalarIV = Induction;
- if (IV != OldInduction) {
- ScalarIV = IV->getType()->isIntegerTy()
- ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
- : Builder.CreateCast(Instruction::SIToFP, Induction,
- IV->getType());
+ Value *ScalarIV = CanonicalIV;
+ Type *NeededType = IV->getType();
+ if (!Def->isCanonical() || ScalarIV->getType() != NeededType) {
+ ScalarIV =
+ NeededType->isIntegerTy()
+ ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType)
+ : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType);
ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
State.CFG.PrevBB);
ScalarIV->setName("offset.idx");
@@ -2493,7 +2554,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
- assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
Value *StartIdx;
if (Step->getType()->isFloatingPointTy())
StartIdx =
@@ -2502,7 +2562,8 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);
Value *EntryPart =
- getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode());
+ getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(),
+ State.VF, State.Builder);
State.set(Def, EntryPart, Part);
if (Trunc)
addMetadata(EntryPart, Trunc);
@@ -2516,9 +2577,31 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
// Now do the actual transformations, and start with creating the step value.
Value *Step = CreateStepValue(ID.getStep());
- if (State.VF.isZero() || State.VF.isScalar()) {
+ if (State.VF.isScalar()) {
Value *ScalarIV = CreateScalarIV(Step);
- CreateSplatIV(ScalarIV, Step);
+ Type *ScalarTy = IntegerType::get(ScalarIV->getContext(),
+ Step->getType()->getScalarSizeInBits());
+
+ Instruction::BinaryOps IncOp = ID.getInductionOpcode();
+ if (IncOp == Instruction::BinaryOpsEnd)
+ IncOp = Instruction::Add;
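+ // For illustration: with an integer induction, UF = 2 and step S, the parts
+ // created below are ScalarIV + 0 * S and ScalarIV + 1 * S.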
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *StartIdx = ConstantInt::get(ScalarTy, Part);
+ Instruction::BinaryOps MulOp = Instruction::Mul;
+ if (Step->getType()->isFloatingPointTy()) {
+ StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType());
+ MulOp = Instruction::FMul;
+ }
+
+ Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
+ Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction");
+ State.set(Def, EntryPart, Part);
+ if (Trunc) {
+ assert(!Step->getType()->isFloatingPointTy() &&
+ "fp inductions shouldn't be truncated");
+ addMetadata(EntryPart, Trunc);
+ }
+ }
return;
}
@@ -2554,54 +2637,6 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
}
-Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
- Value *Step,
- Instruction::BinaryOps BinOp) {
- // Create and check the types.
- auto *ValVTy = cast<VectorType>(Val->getType());
- ElementCount VLen = ValVTy->getElementCount();
-
- Type *STy = Val->getType()->getScalarType();
- assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
- "Induction Step must be an integer or FP");
- assert(Step->getType() == STy && "Step has wrong type");
-
- SmallVector<Constant *, 8> Indices;
-
- // Create a vector of consecutive numbers from zero to VF.
- VectorType *InitVecValVTy = ValVTy;
- Type *InitVecValSTy = STy;
- if (STy->isFloatingPointTy()) {
- InitVecValSTy =
- IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
- InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
- }
- Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
-
- // Splat the StartIdx
- Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
-
- if (STy->isIntegerTy()) {
- InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
- Step = Builder.CreateVectorSplat(VLen, Step);
- assert(Step->getType() == Val->getType() && "Invalid step vec");
- // FIXME: The newly created binary instructions should contain nsw/nuw flags,
- // which can be found from the original scalar operations.
- Step = Builder.CreateMul(InitVec, Step);
- return Builder.CreateAdd(Val, Step, "induction");
- }
-
- // Floating point induction.
- assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
- "Binary Opcode should be specified for FP induction");
- InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
- InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
-
- Step = Builder.CreateVectorSplat(VLen, Step);
- Value *MulOp = Builder.CreateFMul(InitVec, Step);
- return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
-}
-
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
Instruction *EntryVal,
const InductionDescriptor &ID,
@@ -2691,11 +2726,6 @@ void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
State.set(Def, VectorValue, Instance.Part);
}
-Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
- assert(Vec->getType()->isVectorTy() && "Invalid type");
- return Builder.CreateVectorReverse(Vec, "reverse");
-}
-
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
@@ -2858,7 +2888,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
}
if (Group->isReverse())
- StridedVec = reverseVector(StridedVec);
+ StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
State.set(VPDefs[J], StridedVec, Part);
}
@@ -2894,7 +2924,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
Value *StoredVec = State.get(StoredValues[i], Part);
if (Group->isReverse())
- StoredVec = reverseVector(StoredVec);
+ StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
// If this member has different type, cast it to a unified type.
@@ -2993,43 +3023,21 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
PredicatedInstructions.push_back(Cloned);
}
-PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
- Value *End, Value *Step,
- Instruction *DL) {
+void InnerLoopVectorizer::createHeaderBranch(Loop *L) {
BasicBlock *Header = L->getHeader();
- BasicBlock *Latch = L->getLoopLatch();
- // As we're just creating this loop, it's possible no latch exists
- // yet. If so, use the header as this will be a single block loop.
- if (!Latch)
- Latch = Header;
-
- IRBuilder<> B(&*Header->getFirstInsertionPt());
- Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
- setDebugLocFromInst(OldInst, &B);
- auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
+ assert(!L->getLoopLatch() && "loop should not have a latch at this point");
- B.SetInsertPoint(Latch->getTerminator());
+ IRBuilder<> B(Header->getTerminator());
+ Instruction *OldInst =
+ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
setDebugLocFromInst(OldInst, &B);
- // Create i+1 and fill the PHINode.
- //
- // If the tail is not folded, we know that End - Start >= Step (either
- // statically or through the minimum iteration checks). We also know that both
- // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
- // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
- // overflows and we can mark the induction increment as NUW.
- Value *Next = B.CreateAdd(Induction, Step, "index.next",
- /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
- Induction->addIncoming(Start, L->getLoopPreheader());
- Induction->addIncoming(Next, Latch);
- // Create the compare.
- Value *ICmp = B.CreateICmpEQ(Next, End);
- B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
+ // Branch from the header to the exit block and back to the header itself,
+ // replacing the old terminator.
+ B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header);
// Now we have two terminators. Remove the old one from the block.
- Latch->getTerminator()->eraseFromParent();
-
- return Induction;
+ Header->getTerminator()->eraseFromParent();
}
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
@@ -3099,10 +3107,9 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
if (Cost->foldTailByMasking()) {
assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
"VF*UF must be a power of 2 when folding tail by masking");
- assert(!VF.isScalable() &&
- "Tail folding not yet supported for scalable vectors");
+ Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
TC = Builder.CreateAdd(
- TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
+ TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
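+ // For illustration: with an original trip count of 10 and a runtime
+ // VF * UF of 8, n.rnd.up = 10 + (8 - 1) = 17, which guarantees the rounded
+ // vector trip count covers every original iteration when folding the tail.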
}
// Now we need to generate the expression for the part of the loop that the
@@ -3436,12 +3443,13 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
}
void InnerLoopVectorizer::createInductionResumeValues(
- Loop *L, Value *VectorTripCount,
- std::pair<BasicBlock *, Value *> AdditionalBypass) {
- assert(VectorTripCount && L && "Expected valid arguments");
+ Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) {
assert(((AdditionalBypass.first && AdditionalBypass.second) ||
(!AdditionalBypass.first && !AdditionalBypass.second)) &&
"Inconsistent information about additional bypass.");
+
+ Value *VectorTripCount = getOrCreateVectorTripCount(L);
+ assert(VectorTripCount && L && "Expected valid arguments");
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
// PHIs that are left in the scalar version of the loop.
@@ -3449,6 +3457,7 @@ void InnerLoopVectorizer::createInductionResumeValues(
// iteration in the vectorized loop.
// If we come from a bypass edge then we need to start from the original
// start value.
+ Instruction *OldInduction = Legal->getPrimaryInduction();
for (auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
InductionDescriptor II = InductionEntry.second;
@@ -3546,25 +3555,6 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
"Inconsistent vector loop preheader");
Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
- Optional<MDNode *> VectorizedLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
- LLVMLoopVectorizeFollowupVectorized});
- if (VectorizedLoopID.hasValue()) {
- L->setLoopID(VectorizedLoopID.getValue());
-
- // Do not setAlreadyVectorized if loop attributes have been defined
- // explicitly.
- return LoopVectorPreHeader;
- }
-
- // Keep all loop hints from the original loop on the vector loop (we'll
- // replace the vectorizer-specific hints below).
- if (MDNode *LID = OrigLoop->getLoopID())
- L->setLoopID(LID);
-
- LoopVectorizeHints Hints(L, true, *ORE, TTI);
- Hints.setAlreadyVectorized();
-
#ifdef EXPENSIVE_CHECKS
assert(DT->verify(DominatorTree::VerificationLevel::Fast));
LI->verify(*DT);
@@ -3573,7 +3563,8 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
return LoopVectorPreHeader;
}
-BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
+std::pair<BasicBlock *, Value *>
+InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
@@ -3638,33 +3629,12 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// faster.
emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
- // Some loops have a single integer induction variable, while other loops
- // don't. One example is c++ iterators that often have multiple pointer
- // induction variables. In the code below we also support a case where we
- // don't have a single induction variable.
- //
- // We try to obtain an induction variable from the original loop as hard
- // as possible. However if we don't find one that:
- // - is an integer
- // - counts from zero, stepping by one
- // - is the size of the widest induction variable type
- // then we create a new one.
- OldInduction = Legal->getPrimaryInduction();
- Type *IdxTy = Legal->getWidestInductionType();
- Value *StartIdx = ConstantInt::get(IdxTy, 0);
- // The loop step is equal to the vectorization factor (num of SIMD elements)
- // times the unroll factor (num of SIMD instructions).
- Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
- Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
- Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
- Induction =
- createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
- getDebugLocFromInstOrOperands(OldInduction));
+ createHeaderBranch(Lp);
// Emit phis for the new starting index of the scalar loop.
- createInductionResumeValues(Lp, CountRoundDown);
+ createInductionResumeValues(Lp);
- return completeLoopSkeleton(Lp, OrigLoopID);
+ return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
}
// Fix up external users of the induction variable. At this point, we are
@@ -4088,8 +4058,8 @@ void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
}
}
-void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
- VPTransformState &State) {
+void InnerLoopVectorizer::fixFirstOrderRecurrence(
+ VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
// This is the second phase of vectorizing first-order recurrences. An
// overview of the transformation is described below. Suppose we have the
// following loop.
@@ -4334,13 +4304,29 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
: Builder.CreateZExt(ReducedPartRdx, PhiTy);
}
+ PHINode *ResumePhi =
+ dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
+
// Create a phi node that merges control-flow from the backedge-taken check
// block and the middle block.
PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
LoopScalarPreHeader->getTerminator());
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
- BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
- BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
+ // If we are fixing reductions in the epilogue loop then we should already
+ // have created a bc.merge.rdx Phi after the main vector body. Ensure that
+ // we carry over the incoming values correctly.
+ for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
+ if (Incoming == LoopMiddleBlock)
+ BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
+ else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
+ BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
+ Incoming);
+ else
+ BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
+ }
+
+ // Set the resume value for this reduction
+ ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
// Now, we need to fix the users of the reduction variable
// inside and outside of the scalar remainder loop.
@@ -4557,6 +4543,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
InductionDescriptor II = Legal->getInductionVars().lookup(P);
const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+ auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV();
+ PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
+
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
// which can be found from the original scalar operations.
switch (II.getKind()) {
@@ -4572,7 +4561,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
if (Cost->isScalarAfterVectorization(P, State.VF)) {
// This is the normalized GEP that starts counting at zero.
Value *PtrInd =
- Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
+ Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType());
// Determine the number of scalars we need to generate for each unroll
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
@@ -4602,10 +4591,10 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Type *PhiType = II.getStep()->getType();
// Build a pointer phi
- Value *ScalarStartValue = II.getStartValue();
+ Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
Type *ScStValueType = ScalarStartValue->getType();
PHINode *NewPointerPhi =
- PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
+ PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
// A pointer induction, performed by using a gep
@@ -4916,7 +4905,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
-bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
+bool LoopVectorizationCostModel::isScalarWithPredication(
+ Instruction *I, ElementCount VF) const {
if (!blockNeedsPredicationForAnyReason(I->getParent()))
return false;
switch(I->getOpcode()) {
@@ -4928,11 +4918,14 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
return false;
auto *Ptr = getLoadStorePointerOperand(I);
auto *Ty = getLoadStoreType(I);
+ Type *VTy = Ty;
+ if (VF.isVector())
+ VTy = VectorType::get(Ty, VF);
const Align Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
- TTI.isLegalMaskedGather(Ty, Alignment))
+ TTI.isLegalMaskedGather(VTy, Alignment))
: !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
- TTI.isLegalMaskedScatter(Ty, Alignment));
+ TTI.isLegalMaskedScatter(VTy, Alignment));
}
case Instruction::UDiv:
case Instruction::SDiv:
@@ -5005,7 +4998,7 @@ bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
// If the instruction is a store located in a predicated block, it will be
// scalarized.
- if (isScalarWithPredication(I))
+ if (isScalarWithPredication(I, VF))
return false;
// If the instruction's allocated size doesn't equal it's type size, it
@@ -5056,7 +5049,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
<< *I << "\n");
return;
}
- if (isScalarWithPredication(I)) {
+ if (isScalarWithPredication(I, VF)) {
LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
<< *I << "\n");
return;
@@ -5531,10 +5524,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
}
}
- // For scalable vectors, don't use tail folding as this is currently not yet
- // supported. The code is likely to have ended up here if the tripcount is
- // low, in which case it makes sense not to use scalable vectors.
- if (MaxFactors.ScalableVF.isVector())
+ // For scalable vectors, don't use tail folding for low trip counts or when
+ // optimizing for code size, unless the user has explicitly requested it.
+ if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
+ ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
+ MaxFactors.ScalableVF.isVector())
MaxFactors.ScalableVF = ElementCount::getScalable(0);
// If we don't know the precise trip count, or if the trip count that we
@@ -5849,10 +5844,8 @@ bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
const Loop &L, ElementCount VF) const {
// Cross iteration phis such as reductions need special handling and are
// currently unsupported.
- if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
- return Legal->isFirstOrderRecurrence(&Phi) ||
- Legal->isReductionVariable(&Phi);
- }))
+ if (any_of(L.getHeader()->phis(),
+ [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
return false;
// Phis with uses outside of the loop require special handling and are
@@ -5978,11 +5971,29 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
unsigned MinWidth = -1U;
unsigned MaxWidth = 8;
const DataLayout &DL = TheFunction->getParent()->getDataLayout();
- for (Type *T : ElementTypesInLoop) {
- MinWidth = std::min<unsigned>(
- MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
- MaxWidth = std::max<unsigned>(
- MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ // For in-loop reductions, no element types are added to ElementTypesInLoop
+ // if there are no loads/stores in the loop. In this case, check through the
+ // reduction variables to determine the maximum width.
+ if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
+ // Reset MaxWidth so that we can find the smallest type used by recurrences
+ // in the loop.
+ MaxWidth = -1U;
+ for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
+ const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
+ // When finding the min width used by the recurrence we need to account
+ // for casts on the input operands of the recurrence.
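+ // For example (illustrative): an i32 reduction whose inputs are extended
+ // from i8 contributes min(8, 32) = 8 here.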
+ MaxWidth = std::min<unsigned>(
+ MaxWidth, std::min<unsigned>(
+ RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
+ RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
+ }
+ } else {
+ for (Type *T : ElementTypesInLoop) {
+ MinWidth = std::min<unsigned>(
+ MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ MaxWidth = std::max<unsigned>(
+ MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
+ }
}
return {MinWidth, MaxWidth};
}
@@ -6022,18 +6033,8 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
if (auto *ST = dyn_cast<StoreInst>(&I))
T = ST->getValueOperand()->getType();
- // Ignore loaded pointer types and stored pointer types that are not
- // vectorizable.
- //
- // FIXME: The check here attempts to predict whether a load or store will
- // be vectorized. We only know this for certain after a VF has
- // been selected. Here, we assume that if an access can be
- // vectorized, it will be. We should also look at extending this
- // optimization to non-pointer types.
- //
- if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
- !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
- continue;
+ assert(T->isSized() &&
+ "Expected the load/store/recurrence type to be sized");
ElementTypesInLoop.insert(T);
}
@@ -6475,7 +6476,8 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
return RUs;
}
-bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
+ ElementCount VF) {
// TODO: Cost model for emulated masked load/store is completely
// broken. This hack guides the cost model to use an artificially
// high enough value to practically disable vectorization with such
@@ -6484,8 +6486,7 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// Limited number of Masked Store/Scatter emulation was allowed.
- assert(isPredicatedInst(I) &&
- "Expecting a scalar emulated instruction");
+ assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
return isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
NumPredStores > NumberOfStoresToPredicate);
@@ -6512,13 +6513,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
if (!blockNeedsPredicationForAnyReason(BB))
continue;
for (Instruction &I : *BB)
- if (isScalarWithPredication(&I)) {
+ if (isScalarWithPredication(&I, VF)) {
ScalarCostsTy ScalarCosts;
// Do not apply discount if scalable, because that would lead to
// invalid scalarization costs.
// Do not apply discount logic if hacked cost is needed
// for emulated masked memrefs.
- if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
+ if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
// Remember that BB will remain after vectorization.
@@ -6554,7 +6555,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// If the instruction is scalar with predication, it will be analyzed
// separately. We ignore it within the context of PredInst.
- if (isScalarWithPredication(I))
+ if (isScalarWithPredication(I, VF))
return false;
// If any of the instruction's operands are uniform after vectorization,
@@ -6601,7 +6602,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
- if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
ScalarCost += TTI.getScalarizationOverhead(
cast<VectorType>(ToVectorTy(I->getType(), VF)),
APInt::getAllOnes(VF.getFixedValue()), true, false);
@@ -6764,7 +6765,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// If we have a predicated load/store, it will need extra i1 extracts and
// conditional branches, but may not be executed for each vector lane. Scale
// the cost by the probability of executing the predicated block.
- if (isPredicatedInst(I)) {
+ if (isPredicatedInst(I, VF)) {
Cost /= getReciprocalPredBlockProb();
// Add the cost of an i1 extract and a branch
@@ -6775,7 +6776,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
/*Insert=*/false, /*Extract=*/true);
Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
- if (useEmulatedMaskMemRefHack(I))
+ if (useEmulatedMaskMemRefHack(I, VF))
// Artificially setting to a high enough value to practically disable
// vectorization with such operations.
Cost = 3000000;
@@ -7182,7 +7183,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// predicated uniform stores. Today they are treated as any other
// predicated store (see added test cases in
// invariant-store-vectorization.ll).
- if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+ if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
NumPredStores++;
if (Legal->isUniformMemOp(I)) {
@@ -7192,7 +7193,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
InstructionCost Cost;
if (isa<StoreInst>(&I) && VF.isScalable() &&
- isLegalGatherOrScatter(&I)) {
+ isLegalGatherOrScatter(&I, VF)) {
Cost = getGatherScatterCost(&I, VF);
setWideningDecision(&I, VF, CM_GatherScatter, Cost);
} else {
@@ -7234,7 +7235,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
}
InstructionCost GatherScatterCost =
- isLegalGatherOrScatter(&I)
+ isLegalGatherOrScatter(&I, VF)
? getGatherScatterCost(&I, VF) * NumAccesses
: InstructionCost::getInvalid();
@@ -7437,7 +7438,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
// vector lane. Get the scalarization cost and scale this amount by the
// probability of executing the predicated block. If the instruction is not
// predicated, we fall through to the next case.
- if (VF.isVector() && isScalarWithPredication(I)) {
+ if (VF.isVector() && isScalarWithPredication(I, VF)) {
InstructionCost Cost = 0;
// These instructions have a non-void type, so account for the phi nodes
@@ -7941,6 +7942,40 @@ VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
llvm_unreachable("No plan found!");
}
+static void AddRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const auto *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata =
+ S && S->getString().startswith("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
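+ // For illustration, this adds a loop-metadata operand roughly of the form
+ //   !{!"llvm.loop.unroll.runtime.disable"}
+ // alongside any hints the loop already carries.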
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+}
+
void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
VPlan &BestVPlan,
InnerLoopVectorizer &ILV,
@@ -7952,9 +7987,9 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
// 1. Create a new empty loop. Unlink the old loop and connect the new one.
VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
- State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
- State.TripCount = ILV.getOrCreateTripCount(nullptr);
- State.CanonicalIV = ILV.Induction;
+ Value *CanonicalIVStartValue;
+ std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
+ ILV.createVectorizedLoopSkeleton();
ILV.collectPoisonGeneratingRecipes(State);
ILV.printDebugTracesAtStart();
@@ -7968,8 +8003,35 @@ void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
//===------------------------------------------------===//
// 2. Copy and widen instructions from the old loop into the new loop.
+ BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
+ ILV.getOrCreateVectorTripCount(nullptr),
+ CanonicalIVStartValue, State);
BestVPlan.execute(&State);
+ // If the original loop has follow-up metadata, apply it to the vector loop;
+ // otherwise carry the original hints over (see below).
+ MDNode *OrigLoopID = OrigLoop->getLoopID();
+
+ Optional<MDNode *> VectorizedLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupVectorized});
+
+ Loop *L = LI->getLoopFor(State.CFG.PrevBB);
+ if (VectorizedLoopID.hasValue())
+ L->setLoopID(VectorizedLoopID.getValue());
+ else {
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ if (MDNode *LID = OrigLoop->getLoopID())
+ L->setLoopID(LID);
+
+ LoopVectorizeHints Hints(L, true, *ORE);
+ Hints.setAlreadyVectorized();
+ }
+ // Disable runtime unrolling when vectorizing the epilogue loop.
+ if (CanonicalIVStartValue)
+ AddRuntimeUnrollDisableMetaData(L);
+
// 3. Fix the vectorized code: take care of header phi's, live-outs,
// predication, updating analyses.
ILV.fixVectorizedLoop(State);
@@ -8032,66 +8094,16 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
}
}
-Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
-
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
-Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
- Value *Step,
- Instruction::BinaryOps BinOp) {
- // When unrolling and the VF is 1, we only need to add a simple scalar.
- Type *Ty = Val->getType();
- assert(!Ty->isVectorTy() && "Val must be a scalar");
-
- if (Ty->isFloatingPointTy()) {
- // Floating-point operations inherit FMF via the builder's flags.
- Value *MulOp = Builder.CreateFMul(StartIdx, Step);
- return Builder.CreateBinOp(BinOp, Val, MulOp);
- }
- return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
-}
-
-static void AddRuntimeUnrollDisableMetaData(Loop *L) {
- SmallVector<Metadata *, 4> MDs;
- // Reserve first location for self reference to the LoopID metadata node.
- MDs.push_back(nullptr);
- bool IsUnrollMetadata = false;
- MDNode *LoopID = L->getLoopID();
- if (LoopID) {
- // First find existing loop unrolling disable metadata.
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (MD) {
- const auto *S = dyn_cast<MDString>(MD->getOperand(0));
- IsUnrollMetadata =
- S && S->getString().startswith("llvm.loop.unroll.disable");
- }
- MDs.push_back(LoopID->getOperand(i));
- }
- }
-
- if (!IsUnrollMetadata) {
- // Add runtime unroll disable metadata.
- LLVMContext &Context = L->getHeader()->getContext();
- SmallVector<Metadata *, 1> DisableOperands;
- DisableOperands.push_back(
- MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
- MDNode *DisableNode = MDNode::get(Context, DisableOperands);
- MDs.push_back(DisableNode);
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- L->setLoopID(NewLoopID);
- }
-}
-
//===--------------------------------------------------------------------===//
// EpilogueVectorizerMainLoop
//===--------------------------------------------------------------------===//
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
+std::pair<BasicBlock *, Value *>
+EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
MDNode *OrigLoopID = OrigLoop->getLoopID();
Loop *Lp = createVectorLoopSkeleton("");
@@ -8120,24 +8132,16 @@ BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
// Generate the induction variable.
- OldInduction = Legal->getPrimaryInduction();
- Type *IdxTy = Legal->getWidestInductionType();
- Value *StartIdx = ConstantInt::get(IdxTy, 0);
-
- IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
- Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
EPI.VectorTripCount = CountRoundDown;
- Induction =
- createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
- getDebugLocFromInstOrOperands(OldInduction));
+ createHeaderBranch(Lp);
// Skip induction resume value creation here because they will be created in
// the second pass. If we created them here, they wouldn't be used anyway,
// because the vplan in the second pass still contains the inductions from the
// original loop.
- return completeLoopSkeleton(Lp, OrigLoopID);
+ return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
}
void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -8219,7 +8223,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
-BasicBlock *
+std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
MDNode *OrigLoopID = OrigLoop->getLoopID();
Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
@@ -8275,6 +8279,25 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
+ // The vec.epilog.iter.check block may contain Phi nodes from reductions which
+ // merge control-flow from the latch block and the middle block. Update the
+ // incoming values here and move the Phi into the preheader.
+ SmallVector<PHINode *, 4> PhisInBlock;
+ for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
+ PhisInBlock.push_back(&Phi);
+
+ for (PHINode *Phi : PhisInBlock) {
+ Phi->replaceIncomingBlockWith(
+ VecEpilogueIterationCountCheck->getSinglePredecessor(),
+ VecEpilogueIterationCountCheck);
+ Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
+ if (EPI.SCEVSafetyCheck)
+ Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
+ if (EPI.MemSafetyCheck)
+ Phi->removeIncomingValue(EPI.MemSafetyCheck);
+ Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
+ }
+
// Generate a resume induction for the vector epilogue and put it in the
// vector epilogue preheader
Type *IdxTy = Legal->getWidestInductionType();
@@ -8285,13 +8308,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
EPI.MainLoopIterationCountCheck);
// Generate the induction variable.
- OldInduction = Legal->getPrimaryInduction();
- Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
- Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
- Value *StartIdx = EPResumeVal;
- Induction =
- createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
- getDebugLocFromInstOrOperands(OldInduction));
+ createHeaderBranch(Lp);
// Generate induction resume values. These variables save the new starting
// indexes for the scalar loop. They are used to test if there are any tail
@@ -8300,12 +8317,10 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
// check, then the resume value for the induction variable comes from
// the trip count of the main vector loop, hence passing the AdditionalBypass
// argument.
- createInductionResumeValues(Lp, CountRoundDown,
- {VecEpilogueIterationCountCheck,
- EPI.VectorTripCount} /* AdditionalBypass */);
+ createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
+ EPI.VectorTripCount} /* AdditionalBypass */);
- AddRuntimeUnrollDisableMetaData(Lp);
- return completeLoopSkeleton(Lp, OrigLoopID);
+ return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
}
BasicBlock *
@@ -8447,33 +8462,22 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
// Introduce the early-exit compare IV <= BTC to form header block mask.
- // This is used instead of IV < TC because TC may wrap, unlike BTC.
- // Start by constructing the desired canonical IV in the header block.
- VPValue *IV = nullptr;
- if (Legal->getPrimaryInduction())
- IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
- else {
- VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
- auto *IVRecipe = new VPWidenCanonicalIVRecipe();
- HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi());
- IV = IVRecipe;
- }
+ // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
+ // constructing the desired canonical IV in the header block as its first
+ // non-phi instruction.
+ assert(CM.foldTailByMasking() && "must fold the tail");
+ VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
+ auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
+ auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
+ HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
- // Create the block in mask as the first non-phi instruction in the block.
VPBuilder::InsertPointGuard Guard(Builder);
- auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
- Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
-
- VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
- bool TailFolded = !CM.isScalarEpilogueAllowed();
-
- if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
- // While ActiveLaneMask is a binary op that consumes the loop tripcount
- // as a second argument, we only pass the IV here and extract the
- // tripcount from the transform state where codegen of the VP instructions
- // happen.
- BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
+ Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
+ if (CM.TTI.emitGetActiveLaneMask()) {
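+ // For illustration: lane i of the resulting mask is active iff IV + i < TC
+ // (the semantics of llvm.get.active.lane.mask).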
+ VPValue *TC = Plan->getOrCreateTripCount();
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
} else {
+ VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
}
return BlockMaskCache[BB] = BlockMask;
@@ -8621,7 +8625,9 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
VFRange &Range) const {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
+ [this, CI](ElementCount VF) {
+ return CM.isScalarWithPredication(CI, VF);
+ },
Range);
if (IsPredicated)
@@ -8661,7 +8667,8 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
// scalarization is profitable or it is predicated.
auto WillScalarize = [this, I](ElementCount VF) -> bool {
return CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
+ CM.isProfitableToScalarize(I, VF) ||
+ CM.isScalarWithPredication(I, VF);
};
return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
Range);
@@ -8719,7 +8726,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
void VPRecipeBuilder::fixHeaderPhis() {
BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
- for (VPWidenPHIRecipe *R : PhisToFix) {
+ for (VPHeaderPHIRecipe *R : PhisToFix) {
auto *PN = cast<PHINode>(R->getUnderlyingValue());
VPRecipeBase *IncR =
getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
@@ -8735,7 +8742,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
+ [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
Range);
// Even if the instruction is not marked as uniform, there are certain
@@ -8861,7 +8868,7 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
return toVPRecipeResult(Recipe);
- VPWidenPHIRecipe *PhiRecipe = nullptr;
+ VPHeaderPHIRecipe *PhiRecipe = nullptr;
if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
VPValue *StartV = Operands[0];
if (Legal->isReductionVariable(Phi)) {
@@ -8882,11 +8889,14 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
PhisToFix.push_back(PhiRecipe);
} else {
- // TODO: record start and backedge value for remaining pointer induction
- // phis.
+ // TODO: record backedge value for remaining pointer induction phis.
assert(Phi->getType()->isPointerTy() &&
"only pointer phis should be handled here");
- PhiRecipe = new VPWidenPHIRecipe(Phi);
+ assert(Legal->getInductionVars().count(Phi) &&
+ "Not an induction variable");
+ InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
+ VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
+ PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
}
return toVPRecipeResult(PhiRecipe);
@@ -8966,6 +8976,40 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
}
}
+// Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
+// CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
+// BranchOnCount VPInstruction to the latch.
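+// For illustration only, when executed these recipes correspond roughly to
+// the familiar vector-loop skeleton:
+//   %index = phi i64 [ 0, %preheader ], [ %index.next, %vector.body ]
+//   ...
+//   %index.next = add i64 %index, VF * UF   ; nuw when the tail is not folded
+//   %cmp = icmp eq i64 %index.next, %n.vec
+//   br i1 %cmp, label %middle.block, label %vector.body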
+static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
+ bool HasNUW, bool IsVPlanNative) {
+ Value *StartIdx = ConstantInt::get(IdxTy, 0);
+ auto *StartV = Plan.getOrAddVPValue(StartIdx);
+
+ auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
+ VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
+ if (IsVPlanNative)
+ Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
+ Header->insert(CanonicalIVPHI, Header->begin());
+
+ auto *CanonicalIVIncrement =
+ new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
+ : VPInstruction::CanonicalIVIncrement,
+ {CanonicalIVPHI}, DL);
+ CanonicalIVPHI->addOperand(CanonicalIVIncrement);
+
+ VPBasicBlock *EB = TopRegion->getExitBasicBlock();
+ if (IsVPlanNative) {
+ EB = cast<VPBasicBlock>(EB->getSinglePredecessor());
+ EB->setCondBit(nullptr);
+ }
+ EB->appendRecipe(CanonicalIVIncrement);
+
+ auto *BranchOnCount =
+ new VPInstruction(VPInstruction::BranchOnCount,
+ {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+ EB->appendRecipe(BranchOnCount);
+}
+
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
const MapVector<Instruction *, Instruction *> &SinkAfter) {
@@ -9033,6 +9077,12 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
auto Plan = std::make_unique<VPlan>(TopRegion);
+ Instruction *DLInst =
+ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
+ DLInst ? DLInst->getDebugLoc() : DebugLoc(),
+ !CM.foldTailByMasking(), false);
+
// Scan the body of the loop in a topological order to visit each basic block
// after having visited its predecessor basic blocks.
LoopBlocksDFS DFS(OrigLoop);
@@ -9194,6 +9244,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
+ VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
VPlanTransforms::removeRedundantInductionCasts(*Plan);
// Now that sink-after is done, move induction recipes for optimized truncates
@@ -9325,6 +9376,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
OrigLoop, Plan,
[this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
DeadInstructions, *PSE.getSE());
+
+ addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
+ true, true);
return Plan;
}
@@ -9414,16 +9468,19 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
}
// If tail is folded by masking, introduce selects between the phi
- // and the live-out instruction of each reduction, at the end of the latch.
+ // and the live-out instruction of each reduction, at the beginning of the
+ // dedicated latch block.
if (CM.foldTailByMasking()) {
+ Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
if (!PhiR || PhiR->isInLoop())
continue;
- Builder.setInsertPoint(LatchVPBB);
VPValue *Cond =
RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
VPValue *Red = PhiR->getBackedgeValue();
+ assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
+ "reduction recipe must be defined before latch");
Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
}
}
@@ -9682,9 +9739,8 @@ void VPWidenGEPRecipe::execute(VPTransformState &State) {
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Int or FP induction being replicated.");
- State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
- getStartValue()->getLiveInIRValue(),
- getTruncInst(), getVPValue(0), State);
+ auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
+ State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV);
}
void VPWidenPHIRecipe::execute(VPTransformState &State) {
@@ -10013,7 +10069,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
}
- State.set(getVPSingleValue(), NewLI, Part);
+ State.set(this, NewLI, Part);
}
}
@@ -10561,6 +10617,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Checks);
VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
+
+ // Ensure that the start values for any VPReductionPHIRecipes are
+ // updated before vectorising the epilogue loop.
+ VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock();
+ for (VPRecipeBase &R : Header->phis()) {
+ if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
+ if (auto *Resume = MainILV.getReductionResumeValue(
+ ReductionPhi->getRecurrenceDescriptor())) {
+ VPValue *StartVal = new VPValue(Resume);
+ BestEpiPlan.addExternalDef(StartVal);
+ ReductionPhi->setOperand(0, StartVal);
+ }
+ }
+ }
+
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
DT);
++LoopsEpilogueVectorized;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 37ae13666f7a..99c265fc5101 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -435,7 +435,7 @@ struct InstructionsState {
}
/// Some of the instructions in the list have alternate opcodes.
- bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+ bool isAltShuffle() const { return AltOp != MainOp; }
bool isOpcodeOrAlt(Instruction *I) const {
unsigned CheckedOpcode = I->getOpcode();
@@ -581,7 +581,7 @@ static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
}
/// \returns the AA location that is being access by the instruction.
-static MemoryLocation getLocation(Instruction *I, AAResults *AA) {
+static MemoryLocation getLocation(Instruction *I) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return MemoryLocation::get(SI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
@@ -1417,7 +1417,11 @@ public:
HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
} else if (NumFreeOpsHash.NumOfAPOs == Min &&
NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
- ++HashMap[NumFreeOpsHash.Hash].first;
+ auto It = HashMap.find(NumFreeOpsHash.Hash);
+ if (It == HashMap.end())
+ HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
+ else
+ ++It->second.first;
}
}
// Select the lane with the minimum counter.
@@ -2019,9 +2023,7 @@ private:
}
/// Some of the instructions in the list have alternate opcodes.
- bool isAltShuffle() const {
- return getOpcode() != getAltOpcode();
- }
+ bool isAltShuffle() const { return MainOp != AltOp; }
bool isOpcodeOrAlt(Instruction *I) const {
unsigned CheckedOpcode = I->getOpcode();
@@ -2519,12 +2521,11 @@ private:
SD->IsScheduled = true;
LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
- ScheduleData *BundleMember = SD;
- while (BundleMember) {
- if (BundleMember->Inst != BundleMember->OpValue) {
- BundleMember = BundleMember->NextInBundle;
+ for (ScheduleData *BundleMember = SD; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ if (BundleMember->Inst != BundleMember->OpValue)
continue;
- }
+
// Handle the def-use chain dependencies.
// Decrement the unscheduled counter and insert to ready list if ready.
@@ -2589,7 +2590,6 @@ private:
<< "SLP: gets ready (mem): " << *DepBundle << "\n");
}
}
- BundleMember = BundleMember->NextInBundle;
}
}
@@ -2618,6 +2618,10 @@ private:
}
}
+ /// Build a bundle from the ScheduleData nodes corresponding to the
+ /// scalar instruction for each lane.
+ ScheduleData *buildBundle(ArrayRef<Value *> VL);
+
/// Checks if a bundle of instructions can be scheduled, i.e. has no
/// cyclic dependencies. This is only a dry-run, no instructions are
/// actually moved at this stage.
@@ -3040,7 +3044,7 @@ Optional<BoUpSLP::OrdersType> BoUpSLP::getReorderingData(const TreeEntry &TE,
void BoUpSLP::reorderTopToBottom() {
// Maps VF to the graph nodes.
- DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
+ DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
// ExtractElement gather nodes which can be vectorized and need to handle
// their ordering.
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
@@ -3051,6 +3055,29 @@ void BoUpSLP::reorderTopToBottom() {
const std::unique_ptr<TreeEntry> &TE) {
if (Optional<OrdersType> CurrentOrder =
getReorderingData(*TE.get(), /*TopToBottom=*/true)) {
+ // Do not include the ordering of nodes used in alternate-opcode
+ // vectorization; it is better to reorder them during the bottom-to-top
+ // stage. Following their order here would reorder the whole graph, even
+ // though it is only profitable to reorder the subgraph rooted at the
+ // alternate-opcode vectorization node. Such nodes already end up with a
+ // shuffle instruction, and it is enough to adjust that shuffle rather than
+ // rotate the scalars of the whole graph.
+ unsigned Cnt = 0;
+ const TreeEntry *UserTE = TE.get();
+ while (UserTE && Cnt < RecursionMaxDepth) {
+ if (UserTE->UserTreeIndices.size() != 1)
+ break;
+ if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
+ return EI.UserTE->State == TreeEntry::Vectorize &&
+ EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
+ }))
+ return;
+ if (UserTE->UserTreeIndices.empty())
+ UserTE = nullptr;
+ else
+ UserTE = UserTE->UserTreeIndices.back().UserTE;
+ ++Cnt;
+ }
VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
if (TE->State != TreeEntry::Vectorize)
GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
@@ -3066,7 +3093,7 @@ void BoUpSLP::reorderTopToBottom() {
// Try to find the most profitable order. We just are looking for the most
// used order and reorder scalar elements in the nodes according to this
// mostly used order.
- const SmallPtrSetImpl<TreeEntry *> &OrderedEntries = It->getSecond();
+ ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
// All operands are reordered and used only in this node - propagate the
// most used order to the user node.
MapVector<OrdersType, unsigned,
@@ -4459,6 +4486,8 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
CurrentOrder.clear();
return false;
}
+ if (ShouldKeepOrder)
+ CurrentOrder.clear();
return ShouldKeepOrder;
}
@@ -7202,6 +7231,33 @@ void BoUpSLP::optimizeGatherSequence() {
GatherShuffleSeq.clear();
}
+BoUpSLP::ScheduleData *
+BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
+ ScheduleData *Bundle = nullptr;
+ ScheduleData *PrevInBundle = nullptr;
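+ // For illustration: for VL = {%a, %b, %c} the loop below chains
+ // SD(%a) -> SD(%b) -> SD(%c) via NextInBundle, points every member's
+ // FirstInBundle at SD(%a), and accumulates unscheduled deps on SD(%a).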
+ for (Value *V : VL) {
+ ScheduleData *BundleMember = getScheduleData(V);
+ assert(BundleMember &&
+ "no ScheduleData for bundle member "
+ "(maybe not in same basic block)");
+ assert(BundleMember->isSchedulingEntity() &&
+ "bundle member already part of other bundle");
+ if (PrevInBundle) {
+ PrevInBundle->NextInBundle = BundleMember;
+ } else {
+ Bundle = BundleMember;
+ }
+ BundleMember->UnscheduledDepsInBundle = 0;
+ Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
+
+ // Group the instructions to a bundle.
+ BundleMember->FirstInBundle = Bundle;
+ PrevInBundle = BundleMember;
+ }
+ assert(Bundle && "Failed to find schedule bundle");
+ return Bundle;
+}
+
// Groups the instructions to a bundle (which is then a single scheduling entity)
// and schedules instructions until the bundle gets ready.
Optional<BoUpSLP::ScheduleData *>
@@ -7214,12 +7270,9 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// Initialize the instruction bundle.
Instruction *OldScheduleEnd = ScheduleEnd;
- ScheduleData *PrevInBundle = nullptr;
- ScheduleData *Bundle = nullptr;
- bool ReSchedule = false;
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
- auto &&TryScheduleBundle = [this, OldScheduleEnd, SLP](bool ReSchedule,
+ auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
ScheduleData *Bundle) {
// The scheduling region got new instructions at the lower end (or it is a
// new region for the first bundle). This makes it necessary to
@@ -7263,39 +7316,28 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
// Otherwise the compiler may crash trying to incorrectly calculate
// dependencies and emit instruction in the wrong order at the actual
// scheduling.
- TryScheduleBundle(/*ReSchedule=*/false, nullptr);
+ TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
return None;
}
}
+ bool ReSchedule = false;
for (Value *V : VL) {
ScheduleData *BundleMember = getScheduleData(V);
assert(BundleMember &&
"no ScheduleData for bundle member (maybe not in same basic block)");
- if (BundleMember->IsScheduled) {
- // A bundle member was scheduled as single instruction before and now
- // needs to be scheduled as part of the bundle. We just get rid of the
- // existing schedule.
- LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
- << " was already scheduled\n");
- ReSchedule = true;
- }
- assert(BundleMember->isSchedulingEntity() &&
- "bundle member already part of other bundle");
- if (PrevInBundle) {
- PrevInBundle->NextInBundle = BundleMember;
- } else {
- Bundle = BundleMember;
- }
- BundleMember->UnscheduledDepsInBundle = 0;
- Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
-
- // Group the instructions to a bundle.
- BundleMember->FirstInBundle = Bundle;
- PrevInBundle = BundleMember;
+ if (!BundleMember->IsScheduled)
+ continue;
+ // A bundle member was scheduled as a single instruction before and now
+ // needs to be scheduled as part of the bundle. We just get rid of the
+ // existing schedule.
+ LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
+ << " was already scheduled\n");
+ ReSchedule = true;
}
- assert(Bundle && "Failed to find schedule bundle");
- TryScheduleBundle(ReSchedule, Bundle);
+
+ auto *Bundle = buildBundle(VL);
+ TryScheduleBundleImpl(ReSchedule, Bundle);
if (!Bundle->isReady()) {
cancelScheduling(VL, S.OpValue);
return None;
@@ -7464,20 +7506,33 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
while (!WorkList.empty()) {
ScheduleData *SD = WorkList.pop_back_val();
-
- ScheduleData *BundleMember = SD;
- while (BundleMember) {
+ for (ScheduleData *BundleMember = SD; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
assert(isInSchedulingRegion(BundleMember));
- if (!BundleMember->hasValidDependencies()) {
-
- LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
- << "\n");
- BundleMember->Dependencies = 0;
- BundleMember->resetUnscheduledDeps();
+ if (BundleMember->hasValidDependencies())
+ continue;
- // Handle def-use chain dependencies.
- if (BundleMember->OpValue != BundleMember->Inst) {
- ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
+ LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
+ << "\n");
+ BundleMember->Dependencies = 0;
+ BundleMember->resetUnscheduledDeps();
+
+ // Handle def-use chain dependencies.
+ if (BundleMember->OpValue != BundleMember->Inst) {
+ ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+ for (User *U : BundleMember->Inst->users()) {
+ assert(isa<Instruction>(U) &&
+ "user of instruction must be instruction");
+ ScheduleData *UseSD = getScheduleData(U);
if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
BundleMember->Dependencies++;
ScheduleData *DestBundle = UseSD->FirstInBundle;
@@ -7486,89 +7541,69 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
if (!DestBundle->hasValidDependencies())
WorkList.push_back(DestBundle);
}
- } else {
- for (User *U : BundleMember->Inst->users()) {
- if (isa<Instruction>(U)) {
- ScheduleData *UseSD = getScheduleData(U);
- if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = UseSD->FirstInBundle;
- if (!DestBundle->IsScheduled)
- BundleMember->incrementUnscheduledDeps(1);
- if (!DestBundle->hasValidDependencies())
- WorkList.push_back(DestBundle);
- }
- } else {
- // I'm not sure if this can ever happen. But we need to be safe.
- // This lets the instruction/bundle never be scheduled and
- // eventually disable vectorization.
- BundleMember->Dependencies++;
- BundleMember->incrementUnscheduledDeps(1);
- }
- }
}
+ }
- // Handle the memory dependencies.
- ScheduleData *DepDest = BundleMember->NextLoadStore;
- if (DepDest) {
- Instruction *SrcInst = BundleMember->Inst;
- MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
- bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
- unsigned numAliased = 0;
- unsigned DistToSrc = 1;
-
- while (DepDest) {
- assert(isInSchedulingRegion(DepDest));
-
- // We have two limits to reduce the complexity:
- // 1) AliasedCheckLimit: It's a small limit to reduce calls to
- // SLP->isAliased (which is the expensive part in this loop).
- // 2) MaxMemDepDistance: It's for very large blocks and it aborts
- // the whole loop (even if the loop is fast, it's quadratic).
- // It's important for the loop break condition (see below) to
- // check this limit even between two read-only instructions.
- if (DistToSrc >= MaxMemDepDistance ||
- ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
- (numAliased >= AliasedCheckLimit ||
- SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
-
- // We increment the counter only if the locations are aliased
- // (instead of counting all alias checks). This gives a better
- // balance between reduced runtime and accurate dependencies.
- numAliased++;
-
- DepDest->MemoryDependencies.push_back(BundleMember);
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = DepDest->FirstInBundle;
- if (!DestBundle->IsScheduled) {
- BundleMember->incrementUnscheduledDeps(1);
- }
- if (!DestBundle->hasValidDependencies()) {
- WorkList.push_back(DestBundle);
- }
- }
- DepDest = DepDest->NextLoadStore;
-
- // Example, explaining the loop break condition: Let's assume our
- // starting instruction is i0 and MaxMemDepDistance = 3.
- //
- // +--------v--v--v
- // i0,i1,i2,i3,i4,i5,i6,i7,i8
- // +--------^--^--^
- //
- // MaxMemDepDistance let us stop alias-checking at i3 and we add
- // dependencies from i0 to i3,i4,.. (even if they are not aliased).
- // Previously we already added dependencies from i3 to i6,i7,i8
- // (because of MaxMemDepDistance). As we added a dependency from
- // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
- // and we can abort this loop at i6.
- if (DistToSrc >= 2 * MaxMemDepDistance)
- break;
- DistToSrc++;
+ // Handle the memory dependencies (if any).
+ ScheduleData *DepDest = BundleMember->NextLoadStore;
+ if (!DepDest)
+ continue;
+ Instruction *SrcInst = BundleMember->Inst;
+ assert(SrcInst->mayReadOrWriteMemory() &&
+ "NextLoadStore list for non memory effecting bundle?");
+ MemoryLocation SrcLoc = getLocation(SrcInst);
+ bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+ unsigned numAliased = 0;
+ unsigned DistToSrc = 1;
+
+ for (; DepDest; DepDest = DepDest->NextLoadStore) {
+ assert(isInSchedulingRegion(DepDest));
+
+ // We have two limits to reduce the complexity:
+ // 1) AliasedCheckLimit: It's a small limit to reduce calls to
+ // SLP->isAliased (which is the expensive part in this loop).
+ // 2) MaxMemDepDistance: It's for very large blocks and it aborts
+ // the whole loop (even if the loop is fast, it's quadratic).
+ // It's important for the loop break condition (see below) to
+ // check this limit even between two read-only instructions.
+ if (DistToSrc >= MaxMemDepDistance ||
+ ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+ (numAliased >= AliasedCheckLimit ||
+ SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+
+ // We increment the counter only if the locations are aliased
+ // (instead of counting all alias checks). This gives a better
+ // balance between reduced runtime and accurate dependencies.
+ numAliased++;
+
+ DepDest->MemoryDependencies.push_back(BundleMember);
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
+ if (!DestBundle->IsScheduled) {
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ if (!DestBundle->hasValidDependencies()) {
+ WorkList.push_back(DestBundle);
}
}
+
+ // An example explaining the loop break condition: let's assume our
+ // starting instruction is i0 and MaxMemDepDistance = 3.
+ //
+ // +--------v--v--v
+ // i0,i1,i2,i3,i4,i5,i6,i7,i8
+ // +--------^--^--^
+ //
+ // MaxMemDepDistance lets us stop alias-checking at i3 and we add
+ // dependencies from i0 to i3,i4,.. (even if they are not aliased).
+ // Previously we already added dependencies from i3 to i6,i7,i8
+ // (because of MaxMemDepDistance). As we added a dependency from
+ // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
+ // and we can abort this loop at i6.
+ if (DistToSrc >= 2 * MaxMemDepDistance)
+ break;
+ DistToSrc++;
}
- BundleMember = BundleMember->NextInBundle;
}
if (InsertInReadyList && SD->isReady()) {
ReadyInsts.push_back(SD);
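
The memory-dependency scan above is deliberately bounded: alias queries are skipped once AliasedCheckLimit aliased pairs were recorded or the distance from the source reaches MaxMemDepDistance, and the whole scan stops at twice that distance because the remaining dependencies follow transitively. A stand-alone sketch of that windowed scan over a hypothetical list of memory operations (toy alias callback, not LLVM's AliasAnalysis):

#include <vector>

namespace {
struct MemOp {
  bool MayWrite = false;
  std::vector<const MemOp *> DependsOn; // recorded dependencies
};

// Windowed dependency scan from Src over the memory operations that follow it.
// AliasCheck stands in for the (expensive) alias query.
void addMemoryDeps(MemOp &Src, std::vector<MemOp> &Following,
                   bool (*AliasCheck)(const MemOp &, const MemOp &),
                   unsigned AliasedCheckLimit, unsigned MaxMemDepDistance) {
  unsigned NumAliased = 0, DistToSrc = 1;
  for (MemOp &Dep : Following) {
    // Past MaxMemDepDistance we stop querying the alias callback and add the
    // dependency unconditionally; the same happens once enough aliased pairs
    // were seen or the callback says the locations alias.
    if (DistToSrc >= MaxMemDepDistance ||
        ((Src.MayWrite || Dep.MayWrite) &&
         (NumAliased >= AliasedCheckLimit || AliasCheck(Src, Dep)))) {
      ++NumAliased;
      Dep.DependsOn.push_back(&Src);
    }
    // Beyond 2 * MaxMemDepDistance every remaining dependency is already
    // implied transitively, so the scan can stop.
    if (DistToSrc >= 2 * MaxMemDepDistance)
      break;
    ++DistToSrc;
  }
}
} // namespace
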
@@ -7638,8 +7673,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
// Move the scheduled instruction(s) to their dedicated places, if not
// there yet.
- ScheduleData *BundleMember = picked;
- while (BundleMember) {
+ for (ScheduleData *BundleMember = picked; BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
Instruction *pickedInst = BundleMember->Inst;
if (pickedInst->getNextNode() != LastScheduledInst) {
BS->BB->getInstList().remove(pickedInst);
@@ -7647,7 +7682,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
pickedInst);
}
LastScheduledInst = pickedInst;
- BundleMember = BundleMember->NextInBundle;
}
BS->schedule(picked, ReadyInsts);
@@ -8045,8 +8079,11 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
// If the target claims to have no vector registers don't attempt
// vectorization.
- if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
+ if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
+ LLVM_DEBUG(
+ dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
return false;
+ }
// Don't vectorize when the attribute NoImplicitFloat is used.
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
@@ -8693,7 +8730,6 @@ class HorizontalReduction {
static RecurKind getRdxKind(Instruction *I) {
assert(I && "Expected instruction for reduction matching");
- TargetTransformInfo::ReductionFlags RdxFlags;
if (match(I, m_Add(m_Value(), m_Value())))
return RecurKind::Add;
if (match(I, m_Mul(m_Value(), m_Value())))
@@ -8767,7 +8803,6 @@ class HorizontalReduction {
return RecurKind::None;
}
- TargetTransformInfo::ReductionFlags RdxFlags;
switch (Pred) {
default:
return RecurKind::None;
@@ -9206,7 +9241,7 @@ private:
auto *SclCondTy = CmpInst::makeCmpResultType(ScalarTy);
auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
VectorCost = TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
- /*unsigned=*/false, CostKind);
+ /*IsUnsigned=*/false, CostKind);
CmpInst::Predicate RdxPred = getMinMaxReductionPredicate(RdxKind);
ScalarCost = TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy,
SclCondTy, RdxPred, CostKind) +
@@ -9571,8 +9606,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
return false;
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
- // Aggregate value is unlikely to be processed in vector register, we need to
- // extract scalars into scalar registers, so NeedExtraction is set true.
+ // Aggregate value is unlikely to be processed in a vector register.
return tryToVectorizeList(BuildVectorOpds, R);
}
@@ -9598,7 +9632,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
function_ref<unsigned(T *)> Limit,
function_ref<bool(T *, T *)> Comparator,
function_ref<bool(T *, T *)> AreCompatible,
- function_ref<bool(ArrayRef<T *>, bool)> TryToVectorize,
+ function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
bool LimitForRegisterSize) {
bool Changed = false;
// Sort by type, parent, operands.
@@ -9627,7 +9661,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
// same/alternate ops only, this may result in some extra final
// vectorization.
if (NumElts > 1 &&
- TryToVectorize(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
+ TryToVectorizeHelper(makeArrayRef(IncIt, NumElts), LimitForRegisterSize)) {
// Success, start over because instructions might have been changed.
Changed = true;
} else if (NumElts < Limit(*IncIt) &&
@@ -9638,7 +9672,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
// Final attempt to vectorize instructions with the same types.
if (Candidates.size() > 1 &&
(SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
- if (TryToVectorize(Candidates, /*LimitForRegisterSize=*/false)) {
+ if (TryToVectorizeHelper(Candidates, /*LimitForRegisterSize=*/false)) {
// Success, start over because instructions might have been changed.
Changed = true;
} else if (LimitForRegisterSize) {
@@ -9649,7 +9683,7 @@ tryToVectorizeSequence(SmallVectorImpl<T *> &Incoming,
while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
++SameTypeIt;
unsigned NumElts = (SameTypeIt - It);
- if (NumElts > 1 && TryToVectorize(makeArrayRef(It, NumElts),
+ if (NumElts > 1 && TryToVectorizeHelper(makeArrayRef(It, NumElts),
/*LimitForRegisterSize=*/false))
Changed = true;
It = SameTypeIt;
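
tryToVectorizeSequence sorts the incoming values and then repeatedly hands runs of compatible, adjacent values to the vectorization callback. A generic sketch of just that grouping loop (hypothetical callbacks, and dropping the register-size fallback for brevity):

#include <functional>
#include <vector>

// Group consecutive compatible elements and try to vectorize each run.
template <typename T>
bool vectorizeCompatibleRuns(
    std::vector<T *> &Incoming,
    std::function<bool(T *, T *)> AreCompatible,
    std::function<bool(const std::vector<T *> &)> TryToVectorize) {
  bool Changed = false;
  auto It = Incoming.begin();
  while (It != Incoming.end()) {
    // Extend the run while the next element is compatible with the first one.
    auto RunEnd = It + 1;
    while (RunEnd != Incoming.end() && AreCompatible(*RunEnd, *It))
      ++RunEnd;
    // Only runs of at least two elements are worth vectorizing.
    if (RunEnd - It > 1)
      Changed |= TryToVectorize(std::vector<T *>(It, RunEnd));
    It = RunEnd;
  }
  return Changed;
}
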
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 65857f034210..e5dded3c0f1e 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -59,7 +59,7 @@ class VPRecipeBuilder {
/// Cross-iteration reduction & first-order recurrence phis for which we need
/// to add the incoming value from the backedge after all recipes have been
/// created.
- SmallVector<VPWidenPHIRecipe *, 4> PhisToFix;
+ SmallVector<VPHeaderPHIRecipe *, 4> PhisToFix;
/// Check if \p I can be widened at the start of \p Range and possibly
/// decrease the range such that the returned value holds for the entire \p
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 1d9e71663cd2..a96c122db2a9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -677,10 +677,10 @@ void VPInstruction::generateInstruction(VPTransformState &State,
// Get first lane of vector induction variable.
Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
// Get the original loop tripcount.
- Value *ScalarTC = State.TripCount;
+ Value *ScalarTC = State.get(getOperand(1), Part);
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
- auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue());
+ auto *PredTy = VectorType::get(Int1Ty, State.VF);
Instruction *Call = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, "active.lane.mask");
@@ -711,6 +711,51 @@ void VPInstruction::generateInstruction(VPTransformState &State,
}
break;
}
+
+ case VPInstruction::CanonicalIVIncrement:
+ case VPInstruction::CanonicalIVIncrementNUW: {
+ Value *Next = nullptr;
+ if (Part == 0) {
+ bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
+ auto *Phi = State.get(getOperand(0), 0);
+ // The loop step is equal to the vectorization factor (num of SIMD
+ // elements) times the unroll factor (num of SIMD instructions).
+ Value *Step =
+ createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
+ Next = Builder.CreateAdd(Phi, Step, "index.next", IsNUW, false);
+ } else {
+ Next = State.get(this, 0);
+ }
+
+ State.set(this, Next, Part);
+ break;
+ }
+ case VPInstruction::BranchOnCount: {
+ if (Part != 0)
+ break;
+ // First create the compare.
+ Value *IV = State.get(getOperand(0), Part);
+ Value *TC = State.get(getOperand(1), Part);
+ Value *Cond = Builder.CreateICmpEQ(IV, TC);
+
+ // Now create the branch.
+ auto *Plan = getParent()->getPlan();
+ VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
+ VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
+ if (Header->empty()) {
+ assert(EnableVPlanNativePath &&
+ "empty entry block only expected in VPlanNativePath");
+ Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
+ }
+ // TODO: Once the exit block is modeled in VPlan, use it instead of going
+ // through State.CFG.LastBB.
+ BasicBlock *Exit =
+ cast<BranchInst>(State.CFG.LastBB->getTerminator())->getSuccessor(0);
+
+ Builder.CreateCondBr(Cond, Exit, State.CFG.VPBB2IRBB[Header]);
+ Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
+ break;
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
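
Semantically, the two new opcodes implement the scalar control of the vector loop: the canonical IV advances by VF * UF each vector iteration, and BranchOnCount exits once it reaches the vector trip count. A plain scalar model of that control flow (hypothetical helper, assuming at least one vector iteration and a trip count that is a multiple of VF * UF):

#include <cstdint>

// Scalar model of the control generated by CanonicalIVIncrement plus
// BranchOnCount (illustrative only, not IR generation).
void vectorLoopControlModel(uint64_t VectorTripCount, uint64_t Start,
                            uint64_t VF, uint64_t UF,
                            void (*Body)(uint64_t Index)) {
  uint64_t Index = Start;              // canonical IV phi ("index")
  do {
    Body(Index);                       // one vector iteration (all UF parts)
    Index += VF * UF;                  // canonical-iv-increment
  } while (Index != VectorTripCount);  // branch-on-count: exit when equal
}
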
@@ -758,6 +803,15 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
case VPInstruction::FirstOrderRecurrenceSplice:
O << "first-order splice";
break;
+ case VPInstruction::CanonicalIVIncrement:
+ O << "VF * UF + ";
+ break;
+ case VPInstruction::CanonicalIVIncrementNUW:
+ O << "VF * UF +(nuw) ";
+ break;
+ case VPInstruction::BranchOnCount:
+ O << "branch-on-count ";
+ break;
default:
O << Instruction::getOpcodeName(getOpcode());
}
@@ -786,23 +840,55 @@ void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
FMF = FMFNew;
}
-/// Generate the code inside the body of the vectorized loop. Assumes a single
-/// LoopVectorBody basic-block was created for this. Introduce additional
-/// basic-blocks as needed, and fill them all.
-void VPlan::execute(VPTransformState *State) {
- // -1. Check if the backedge taken count is needed, and if so build it.
+void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
+ Value *CanonicalIVStartValue,
+ VPTransformState &State) {
+ // Check if the trip count is needed, and if so build it.
+ if (TripCount && TripCount->getNumUsers()) {
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(TripCount, TripCountV, Part);
+ }
+
+ // Check if the backedge taken count is needed, and if so build it.
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
- Value *TC = State->TripCount;
- IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
- auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+ IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+ auto *TCMO = Builder.CreateSub(TripCountV,
+ ConstantInt::get(TripCountV->getType(), 1),
"trip.count.minus.1");
- auto VF = State->VF;
+ auto VF = State.VF;
Value *VTCMO =
VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
- for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part)
- State->set(BackedgeTakenCount, VTCMO, Part);
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(BackedgeTakenCount, VTCMO, Part);
}
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(&VectorTripCount, VectorTripCountV, Part);
+
+ // When vectorizing the epilogue loop, the canonical induction start value
+ // needs to be changed from zero to the value after the main vector loop.
+ if (CanonicalIVStartValue) {
+ VPValue *VPV = new VPValue(CanonicalIVStartValue);
+ addExternalDef(VPV);
+ auto *IV = getCanonicalIV();
+ assert(all_of(IV->users(),
+ [](const VPUser *U) {
+ auto *VPI = cast<VPInstruction>(U);
+ return VPI->getOpcode() ==
+ VPInstruction::CanonicalIVIncrement ||
+ VPI->getOpcode() ==
+ VPInstruction::CanonicalIVIncrementNUW;
+ }) &&
+ "the canonical IV should only be used by its increments when "
+ "resetting the start value");
+ IV->setOperand(0, VPV);
+ }
+}
+
+/// Generate the code inside the body of the vectorized loop. Assumes a single
+/// LoopVectorBody basic-block was created for this. Introduce additional
+/// basic-blocks as needed, and fill them all.
+void VPlan::execute(VPTransformState *State) {
// 0. Set the reverse mapping from VPValues to Values for code generation.
for (auto &Entry : Value2VPValue)
State->VPValue2Value[Entry.second] = Entry.first;
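
prepareToExecute only materializes live-in values: the original trip count, the backedge-taken count (trip count minus one, broadcast across the lanes when needed), the vector trip count, and optionally the canonical IV resume value for the epilogue loop. A small scalar sketch of that bookkeeping (hypothetical LiveIns struct, assuming a trip count of at least one):

#include <cstdint>
#include <vector>

namespace {
struct LiveIns {
  uint64_t TripCount = 0;
  std::vector<uint64_t> BackedgeTakenCount; // one lane per vector element
  uint64_t VectorTripCount = 0;
  uint64_t CanonicalIVStart = 0; // 0, or resume value for the epilogue loop
};

LiveIns prepareLiveIns(uint64_t TC, uint64_t VectorTC, unsigned VF,
                       uint64_t EpilogueResume = 0) {
  LiveIns L;
  L.TripCount = TC;
  // Backedge-taken count equals TripCount - 1 (TC >= 1 assumed); broadcast it.
  L.BackedgeTakenCount.assign(VF, TC - 1);
  L.VectorTripCount = VectorTC;
  // Non-zero only when vectorizing the epilogue after the main vector loop.
  L.CanonicalIVStart = EpilogueResume;
  return L;
}
} // namespace
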
@@ -834,28 +920,6 @@ void VPlan::execute(VPTransformState *State) {
for (VPBlockBase *Block : depth_first(Entry))
Block->execute(State);
- // Fix the latch value of reduction and first-order recurrences phis in the
- // vector loop.
- VPBasicBlock *Header = Entry->getEntryBasicBlock();
- for (VPRecipeBase &R : Header->phis()) {
- auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
- if (!PhiR || !(isa<VPFirstOrderRecurrencePHIRecipe>(&R) ||
- isa<VPReductionPHIRecipe>(&R)))
- continue;
- // For first-order recurrences and in-order reduction phis, only a single
- // part is generated, which provides the last part from the previous
- // iteration. Otherwise all UF parts are generated.
- bool SinglePartNeeded = isa<VPFirstOrderRecurrencePHIRecipe>(&R) ||
- cast<VPReductionPHIRecipe>(&R)->isOrdered();
- unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
- for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
- Value *VecPhi = State->get(PhiR, Part);
- Value *Val = State->get(PhiR->getBackedgeValue(),
- SinglePartNeeded ? State->UF - 1 : Part);
- cast<PHINode>(VecPhi)->addIncoming(Val, VectorLatchBB);
- }
- }
-
// Setup branch terminator successors for VPBBs in VPBBsToFix based on
// VPBB's successors.
for (auto VPBB : State->CFG.VPBBsToFix) {
@@ -876,13 +940,19 @@ void VPlan::execute(VPTransformState *State) {
// 3. Merge the temporary latch created with the last basic-block filled.
BasicBlock *LastBB = State->CFG.PrevBB;
+ assert(isa<BranchInst>(LastBB->getTerminator()) &&
+ "Expected VPlan CFG to terminate with branch");
+
+ // Move both the branch and check from LastBB to VectorLatchBB.
+ auto *LastBranch = cast<BranchInst>(LastBB->getTerminator());
+ LastBranch->moveBefore(VectorLatchBB->getTerminator());
+ VectorLatchBB->getTerminator()->eraseFromParent();
+ // Move the condition so it is guaranteed to be next to the branch. This is
+ // only done to avoid excessive test updates.
+ // TODO: Remove special handling once the increments for all inductions are
+ // modeled explicitly in VPlan.
+ cast<Instruction>(LastBranch->getCondition())->moveBefore(LastBranch);
// Connect LastBB to VectorLatchBB to facilitate their merge.
- assert((EnableVPlanNativePath ||
- isa<UnreachableInst>(LastBB->getTerminator())) &&
- "Expected InnerLoop VPlan CFG to terminate with unreachable");
- assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) &&
- "Expected VPlan CFG to terminate with branch in NativePath");
- LastBB->getTerminator()->eraseFromParent();
BranchInst::Create(VectorLatchBB, LastBB);
// Merge LastBB with Latch.
@@ -891,6 +961,37 @@ void VPlan::execute(VPTransformState *State) {
assert(Merged && "Could not merge last basic block with latch.");
VectorLatchBB = LastBB;
+ // Fix the latch value of canonical, reduction and first-order recurrence
+ // phis in the vector loop.
+ VPBasicBlock *Header = Entry->getEntryBasicBlock();
+ if (Header->empty()) {
+ assert(EnableVPlanNativePath);
+ Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
+ }
+ for (VPRecipeBase &R : Header->phis()) {
+ // Skip phi-like recipes that generate their backedge values themselves.
+ // TODO: Model their backedge values explicitly.
+ if (isa<VPWidenIntOrFpInductionRecipe>(&R) || isa<VPWidenPHIRecipe>(&R))
+ continue;
+
+ auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
+ // For canonical IV, first-order recurrences and in-order reduction phis,
+ // only a single part is generated, which provides the last part from the
+ // previous iteration. For non-ordered reductions all UF parts are
+ // generated.
+ bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
+ isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
+ cast<VPReductionPHIRecipe>(PhiR)->isOrdered();
+ unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;
+
+ for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
+ Value *Phi = State->get(PhiR, Part);
+ Value *Val = State->get(PhiR->getBackedgeValue(),
+ SinglePartNeeded ? State->UF - 1 : Part);
+ cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
+ }
+ }
+
// We do not attempt to preserve DT for outer loop vectorization currently.
if (!EnableVPlanNativePath)
updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB,
@@ -904,6 +1005,12 @@ void VPlan::print(raw_ostream &O) const {
O << "VPlan '" << Name << "' {";
+ if (VectorTripCount.getNumUsers() > 0) {
+ O << "\nLive-in ";
+ VectorTripCount.printAsOperand(O, SlotTracker);
+ O << " = vector-trip-count\n";
+ }
+
if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
O << "\nLive-in ";
BackedgeTakenCount->printAsOperand(O, SlotTracker);
@@ -1155,7 +1262,15 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
} else
O << " " << VPlanIngredient(IV);
}
+#endif
+bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
+ auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
+ auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep());
+ return StartC && StartC->isZero() && StepC && StepC->isOne();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN-GEP ";
@@ -1255,7 +1370,7 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
O << Indent << "WIDEN ";
if (!isStore()) {
- getVPSingleValue()->printAsOperand(O, SlotTracker);
+ printAsOperand(O, SlotTracker);
O << " = ";
}
O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
@@ -1264,26 +1379,39 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
+void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
+ Value *Start = getStartValue()->getLiveInIRValue();
+ PHINode *EntryPart = PHINode::Create(
+ Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());
+ EntryPart->addIncoming(Start, State.CFG.VectorPreHeader);
+ EntryPart->setDebugLoc(DL);
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
+ State.set(this, EntryPart, Part);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "EMIT ";
+ printAsOperand(O, SlotTracker);
+ O << " = CANONICAL-INDUCTION";
+}
+#endif
+
void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
- Value *CanonicalIV = State.CanonicalIV;
+ Value *CanonicalIV = State.get(getOperand(0), 0);
Type *STy = CanonicalIV->getType();
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
ElementCount VF = State.VF;
- assert(!VF.isScalable() && "the code following assumes non scalables ECs");
Value *VStart = VF.isScalar()
? CanonicalIV
- : Builder.CreateVectorSplat(VF.getKnownMinValue(),
- CanonicalIV, "broadcast");
+ : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
- SmallVector<Constant *, 8> Indices;
- for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
- Indices.push_back(
- ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane));
- // If VF == 1, there is only one iteration in the loop above, thus the
- // element pushed back into Indices is ConstantInt::get(STy, Part)
- Constant *VStep =
- VF.isScalar() ? Indices.back() : ConstantVector::get(Indices);
- // Add the consecutive indices to the vector value.
+ Value *VStep = createStepForVF(Builder, STy, VF, Part);
+ if (VF.isVector()) {
+ VStep = Builder.CreateVectorSplat(VF, VStep);
+ VStep = Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
+ }
Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
State.set(this, CanonicalVectorIV, Part);
}
@@ -1294,7 +1422,8 @@ void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "EMIT ";
printAsOperand(O, SlotTracker);
- O << " = WIDEN-CANONICAL-INDUCTION";
+ O << " = WIDEN-CANONICAL-INDUCTION ";
+ printOperands(O, SlotTracker);
}
#endif
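
For a fixed VF, the value produced above for unroll part Part holds the lanes CanonicalIV + Part*VF + {0, 1, ..., VF-1}; with scalable vectors the same value is built from createStepForVF plus a step vector. A tiny helper computing those lanes with plain integers (illustrative only, not IR generation):

#include <cstdint>
#include <vector>

// Lanes of the widened canonical IV for a given unroll part and fixed VF:
// CanonicalIV + Part * VF + {0, 1, ..., VF - 1}.
std::vector<uint64_t> widenedCanonicalIVLanes(uint64_t CanonicalIV,
                                              unsigned Part, unsigned VF) {
  std::vector<uint64_t> Lanes(VF);
  for (unsigned Lane = 0; Lane < VF; ++Lane)
    Lanes[Lane] = CanonicalIV + uint64_t(Part) * VF + Lane;
  return Lanes;
}
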
@@ -1461,7 +1590,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
InterleavedAccessInfo &IAI) {
if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
for (VPRecipeBase &VPI : *VPBB) {
- if (isa<VPWidenPHIRecipe>(&VPI))
+ if (isa<VPHeaderPHIRecipe>(&VPI))
continue;
assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
auto *VPInst = cast<VPInstruction>(&VPI);
@@ -1506,6 +1635,7 @@ void VPSlotTracker::assignSlots(const VPlan &Plan) {
for (const VPValue *V : Plan.VPExternalDefs)
assignSlot(V);
+ assignSlot(&Plan.VectorTripCount);
if (Plan.BackedgeTakenCount)
assignSlot(Plan.BackedgeTakenCount);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index f4a1883e35d5..824440f98a8b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -69,6 +69,9 @@ class VPlanSlp;
/// vectors it is an expression determined at runtime.
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF);
+/// Return a value for Step multiplied by VF.
+Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF, int64_t Step);
+
/// A range of powers-of-2 vectorization factors with fixed start and
/// adjustable end. The range includes start and excludes end, e.g.,:
/// [1, 9) = {1, 2, 4, 8}
@@ -198,8 +201,8 @@ struct VPTransformState {
VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
DominatorTree *DT, IRBuilder<> &Builder,
InnerLoopVectorizer *ILV, VPlan *Plan)
- : VF(VF), UF(UF), Instance(), LI(LI), DT(DT), Builder(Builder), ILV(ILV),
- Plan(Plan) {}
+ : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan) {
+ }
/// The chosen Vectorization and Unroll Factors of the loop being vectorized.
ElementCount VF;
@@ -341,9 +344,6 @@ struct VPTransformState {
/// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
Value *CanonicalIV = nullptr;
- /// Hold the trip count of the scalar loop.
- Value *TripCount = nullptr;
-
/// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
InnerLoopVectorizer *ILV;
@@ -793,6 +793,9 @@ public:
SLPLoad,
SLPStore,
ActiveLaneMask,
+ CanonicalIVIncrement,
+ CanonicalIVIncrementNUW,
+ BranchOnCount,
};
private:
@@ -833,6 +836,16 @@ public:
return R->getVPDefID() == VPRecipeBase::VPInstructionSC;
}
+ /// Extra classof implementations to allow directly casting from VPUser ->
+ /// VPInstruction.
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && R->getVPDefID() == VPRecipeBase::VPInstructionSC;
+ }
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPInstructionSC;
+ }
+
unsigned getOpcode() const { return Opcode; }
/// Generate the instruction.
@@ -871,6 +884,7 @@ public:
case Instruction::Unreachable:
case Instruction::Fence:
case Instruction::AtomicRMW:
+ case VPInstruction::BranchOnCount:
return false;
default:
return true;
@@ -1045,6 +1059,7 @@ public:
/// Returns the start value of the induction.
VPValue *getStartValue() { return getOperand(0); }
+ const VPValue *getStartValue() const { return getOperand(0); }
/// Returns the first defined value as TruncInst, if it is one or nullptr
/// otherwise.
@@ -1057,66 +1072,65 @@ public:
/// Returns the induction descriptor for the recipe.
const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
-};
-/// A recipe for handling first order recurrences and pointer inductions. For
-/// first-order recurrences, the start value is the first operand of the recipe
-/// and the incoming value from the backedge is the second operand. It also
-/// serves as base class for VPReductionPHIRecipe. In the VPlan native path, all
-/// incoming VPValues & VPBasicBlock pairs are managed in the recipe directly.
-class VPWidenPHIRecipe : public VPRecipeBase, public VPValue {
- /// List of incoming blocks. Only used in the VPlan native path.
- SmallVector<VPBasicBlock *, 2> IncomingBlocks;
+ /// Returns true if the induction is canonical, i.e. starting at 0 and
+ /// incremented by UF * VF (= the original IV is incremented by 1).
+ bool isCanonical() const;
+
+ /// Returns the scalar type of the induction.
+ const Type *getScalarType() const {
+ const TruncInst *TruncI = getTruncInst();
+ return TruncI ? TruncI->getType() : IV->getType();
+ }
+};
+/// An abstract base class for all recipes modeling header phis, including
+/// phis for first order recurrences, pointer inductions and reductions. The
+/// start value is the first operand of the recipe and the incoming value from
+/// the backedge is the second operand.
+class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue {
protected:
- VPWidenPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi,
- VPValue *Start = nullptr)
+ VPHeaderPHIRecipe(unsigned char VPVID, unsigned char VPDefID, PHINode *Phi,
+ VPValue *Start = nullptr)
: VPRecipeBase(VPDefID, {}), VPValue(VPVID, Phi, this) {
if (Start)
addOperand(Start);
}
public:
- /// Create a VPWidenPHIRecipe for \p Phi
- VPWidenPHIRecipe(PHINode *Phi)
- : VPWidenPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) {}
-
- /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start.
- VPWidenPHIRecipe(PHINode *Phi, VPValue &Start) : VPWidenPHIRecipe(Phi) {
- addOperand(&Start);
- }
-
- ~VPWidenPHIRecipe() override = default;
+ ~VPHeaderPHIRecipe() override = default;
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPRecipeBase *B) {
- return B->getVPDefID() == VPRecipeBase::VPWidenPHISC ||
+ return B->getVPDefID() == VPRecipeBase::VPCanonicalIVPHISC ||
B->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC ||
- B->getVPDefID() == VPRecipeBase::VPReductionPHISC;
+ B->getVPDefID() == VPRecipeBase::VPReductionPHISC ||
+ B->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC ||
+ B->getVPDefID() == VPRecipeBase::VPWidenPHISC;
}
static inline bool classof(const VPValue *V) {
- return V->getVPValueID() == VPValue::VPVWidenPHISC ||
+ return V->getVPValueID() == VPValue::VPVCanonicalIVPHISC ||
V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC ||
- V->getVPValueID() == VPValue::VPVReductionPHISC;
+ V->getVPValueID() == VPValue::VPVReductionPHISC ||
+ V->getVPValueID() == VPValue::VPVWidenIntOrFpInductionSC ||
+ V->getVPValueID() == VPValue::VPVWidenPHISC;
}
- /// Generate the phi/select nodes.
- void execute(VPTransformState &State) override;
+ /// Generate the phi nodes.
+ void execute(VPTransformState &State) override = 0;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
+ VPSlotTracker &SlotTracker) const override = 0;
#endif
- /// Returns the start value of the phi, if it is a reduction or first-order
- /// recurrence.
+ /// Returns the start value of the phi, if one is set.
VPValue *getStartValue() {
return getNumOperands() == 0 ? nullptr : getOperand(0);
}
- /// Returns the incoming value from the loop backedge, if it is a reduction or
- /// first-order recurrence.
+ /// Returns the incoming value from the loop backedge.
VPValue *getBackedgeValue() {
return getOperand(1);
}
@@ -1126,6 +1140,44 @@ public:
VPRecipeBase *getBackedgeRecipe() {
return cast<VPRecipeBase>(getBackedgeValue()->getDef());
}
+};
+
+/// A recipe for handling header phis that are widened in the vector loop.
+/// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are
+/// managed in the recipe directly.
+class VPWidenPHIRecipe : public VPHeaderPHIRecipe {
+ /// List of incoming blocks. Only used in the VPlan native path.
+ SmallVector<VPBasicBlock *, 2> IncomingBlocks;
+
+public:
+ /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start.
+ VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr)
+ : VPHeaderPHIRecipe(VPVWidenPHISC, VPWidenPHISC, Phi) {
+ if (Start)
+ addOperand(Start);
+ }
+
+ ~VPWidenPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *B) {
+ return B->getVPDefID() == VPRecipeBase::VPWidenPHISC;
+ }
+ static inline bool classof(const VPHeaderPHIRecipe *R) {
+ return R->getVPDefID() == VPRecipeBase::VPWidenPHISC;
+ }
+ static inline bool classof(const VPValue *V) {
+ return V->getVPValueID() == VPValue::VPVWidenPHISC;
+ }
+
+ /// Generate the phi/select nodes.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
/// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi.
void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) {
@@ -1133,27 +1185,27 @@ public:
IncomingBlocks.push_back(IncomingBlock);
}
- /// Returns the \p I th incoming VPValue.
- VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
-
/// Returns the \p I th incoming VPBasicBlock.
VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; }
+
+ /// Returns the \p I th incoming VPValue.
+ VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
};
/// A recipe for handling first-order recurrence phis. The start value is the
/// first operand of the recipe and the incoming value from the backedge is the
/// second operand.
-struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe {
+struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start)
- : VPWidenPHIRecipe(VPVFirstOrderRecurrencePHISC,
- VPFirstOrderRecurrencePHISC, Phi, &Start) {}
+ : VPHeaderPHIRecipe(VPVFirstOrderRecurrencePHISC,
+ VPFirstOrderRecurrencePHISC, Phi, &Start) {}
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC;
}
- static inline bool classof(const VPWidenPHIRecipe *D) {
- return D->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC;
+ static inline bool classof(const VPHeaderPHIRecipe *R) {
+ return R->getVPDefID() == VPRecipeBase::VPFirstOrderRecurrencePHISC;
}
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPVFirstOrderRecurrencePHISC;
@@ -1171,7 +1223,7 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPWidenPHIRecipe {
/// A recipe for handling reduction phis. The start value is the first operand
/// of the recipe and the incoming value from the backedge is the second
/// operand.
-class VPReductionPHIRecipe : public VPWidenPHIRecipe {
+class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
/// Descriptor for the reduction.
const RecurrenceDescriptor &RdxDesc;
@@ -1187,7 +1239,7 @@ public:
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
VPValue &Start, bool IsInLoop = false,
bool IsOrdered = false)
- : VPWidenPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start),
+ : VPHeaderPHIRecipe(VPVReductionPHISC, VPReductionPHISC, Phi, &Start),
RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
}
@@ -1198,12 +1250,12 @@ public:
static inline bool classof(const VPRecipeBase *R) {
return R->getVPDefID() == VPRecipeBase::VPReductionPHISC;
}
+ static inline bool classof(const VPHeaderPHIRecipe *R) {
+ return R->getVPDefID() == VPRecipeBase::VPReductionPHISC;
+ }
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPVReductionPHISC;
}
- static inline bool classof(const VPWidenPHIRecipe *R) {
- return R->getVPDefID() == VPRecipeBase::VPReductionPHISC;
- }
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
@@ -1601,11 +1653,46 @@ public:
#endif
};
+/// Canonical scalar induction phi of the vector loop. It starts at the
+/// specified start value (either 0, or the resume value when vectorizing the
+/// epilogue loop). VPWidenCanonicalIVRecipe represents the vector version of
+/// the canonical induction variable.
+class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
+ DebugLoc DL;
+
+public:
+ VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL)
+ : VPHeaderPHIRecipe(VPValue::VPVCanonicalIVPHISC, VPCanonicalIVPHISC,
+ nullptr, StartV),
+ DL(DL) {}
+
+ ~VPCanonicalIVPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPDef *D) {
+ return D->getVPDefID() == VPCanonicalIVPHISC;
+ }
+
+ /// Generate the canonical scalar induction phi of the vector loop.
+ void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+
+ /// Returns the scalar type of the induction.
+ const Type *getScalarType() const {
+ return getOperand(0)->getLiveInIRValue()->getType();
+ }
+};
+
/// A Recipe for widening the canonical induction variable of the vector loop.
class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue {
public:
- VPWidenCanonicalIVRecipe()
- : VPRecipeBase(VPWidenCanonicalIVSC, {}),
+ VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
+ : VPRecipeBase(VPWidenCanonicalIVSC, {CanonicalIV}),
VPValue(VPValue::VPVWidenCanonicalIVSC, nullptr, this) {}
~VPWidenCanonicalIVRecipe() override = default;
@@ -1615,6 +1702,16 @@ public:
return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC;
}
+ /// Extra classof implementations to allow directly casting from VPUser ->
+ /// VPWidenCanonicalIVRecipe.
+ static inline bool classof(const VPUser *U) {
+ auto *R = dyn_cast<VPRecipeBase>(U);
+ return R && R->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC;
+ }
+ static inline bool classof(const VPRecipeBase *R) {
+ return R->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC;
+ }
+
/// Generate a canonical vector induction variable of the vector loop, with
/// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
/// step = <VF*UF, VF*UF, ..., VF*UF>.
@@ -1625,6 +1722,12 @@ public:
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif
+
+ /// Returns the scalar type of the induction.
+ const Type *getScalarType() const {
+ return cast<VPCanonicalIVPHIRecipe>(getOperand(0)->getDef())
+ ->getScalarType();
+ }
};
/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
@@ -2112,10 +2215,17 @@ class VPlan {
// (operators '==' and '<').
SetVector<VPValue *> VPExternalDefs;
- /// Represents the backedge taken count of the original loop, for folding
+ /// Represents the trip count of the original loop, for folding
/// the tail.
+ VPValue *TripCount = nullptr;
+
+ /// Represents the backedge taken count of the original loop, for folding
+ /// the tail. It equals TripCount - 1.
VPValue *BackedgeTakenCount = nullptr;
+ /// Represents the vector trip count.
+ VPValue VectorTripCount;
+
/// Holds a mapping between Values and their corresponding VPValue inside
/// VPlan.
Value2VPValueTy Value2VPValue;
@@ -2147,12 +2257,18 @@ public:
}
for (VPValue *VPV : VPValuesToFree)
delete VPV;
+ if (TripCount)
+ delete TripCount;
if (BackedgeTakenCount)
delete BackedgeTakenCount;
for (VPValue *Def : VPExternalDefs)
delete Def;
}
+ /// Prepare the plan for execution, setting up the required live-in values.
+ void prepareToExecute(Value *TripCount, Value *VectorTripCount,
+ Value *CanonicalIVStartValue, VPTransformState &State);
+
/// Generate the IR code for this VPlan.
void execute(struct VPTransformState *State);
@@ -2165,6 +2281,13 @@ public:
return Entry;
}
+ /// The trip count of the original loop.
+ VPValue *getOrCreateTripCount() {
+ if (!TripCount)
+ TripCount = new VPValue();
+ return TripCount;
+ }
+
/// The backedge taken count of the original loop.
VPValue *getOrCreateBackedgeTakenCount() {
if (!BackedgeTakenCount)
@@ -2172,6 +2295,9 @@ public:
return BackedgeTakenCount;
}
+ /// The vector trip count.
+ VPValue &getVectorTripCount() { return VectorTripCount; }
+
/// Mark the plan to indicate that using Value2VPValue is not safe any
/// longer, because it may be stale.
void disableValue2VPValue() { Value2VPValueEnabled = false; }
@@ -2264,6 +2390,21 @@ public:
return !VPV->getDef() || (RepR && RepR->isUniform());
}
+ /// Returns the VPRegionBlock of the vector loop.
+ VPRegionBlock *getVectorLoopRegion() {
+ return cast<VPRegionBlock>(getEntry());
+ }
+
+ /// Returns the canonical induction recipe of the vector loop.
+ VPCanonicalIVPHIRecipe *getCanonicalIV() {
+ VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock();
+ if (EntryVPBB->empty()) {
+ // VPlan native path.
+ EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
+ }
+ return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
+ }
+
private:
/// Add to the given dominator tree the header block and every new basic block
/// that was created between it and the latch block, inclusive.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 86ecd6817873..e879a33db6ee 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -231,7 +231,7 @@ void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) {
}
// Entry point. The driver function for the predicator.
-void VPlanPredicator::predicate(void) {
+void VPlanPredicator::predicate() {
// Predicate the blocks within Region.
predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPredicator.h b/llvm/lib/Transforms/Vectorize/VPlanPredicator.h
index 692afd2978d5..a5db9a54da3c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPredicator.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPredicator.h
@@ -68,7 +68,7 @@ public:
VPlanPredicator(VPlan &Plan);
/// Predicate Plan's HCFG.
- void predicate(void);
+ void predicate();
};
} // end namespace llvm
#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d2daf558c2c5..fb5f3d428189 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -324,3 +324,30 @@ void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) {
E.first->eraseFromParent();
}
}
+
+void VPlanTransforms::removeRedundantCanonicalIVs(VPlan &Plan) {
+ VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV();
+ VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
+ for (VPUser *U : CanonicalIV->users()) {
+ WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
+ if (WidenNewIV)
+ break;
+ }
+
+ if (!WidenNewIV)
+ return;
+
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
+ for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
+ auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
+
+ // If the induction recipe is canonical and the types match, use it
+ // directly.
+ if (WidenOriginalIV && WidenOriginalIV->isCanonical() &&
+ WidenOriginalIV->getScalarType() == WidenNewIV->getScalarType()) {
+ WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
+ WidenNewIV->eraseFromParent();
+ return;
+ }
+ }
+}
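
The transform above searches the canonical IV's users for a VPWidenCanonicalIVRecipe and replaces it with an already-present widened induction when that induction is canonical (start 0, step 1) and of the same scalar type. A simplified sketch of that matching decision over hypothetical recipe descriptors (not the VPlan classes):

#include <vector>

namespace {
struct RecipeDesc {
  bool IsWidenIntOrFpInduction = false;
  long Start = 0;       // induction start value
  long Step = 1;        // induction step
  int ScalarTypeId = 0; // stand-in for the scalar type
};

// Return the header induction that can replace the widened canonical IV, or
// nullptr if none matches (mirroring the checks above in spirit).
const RecipeDesc *findReplacement(const std::vector<RecipeDesc> &HeaderPhis,
                                  const RecipeDesc &WidenNewIV) {
  for (const RecipeDesc &Phi : HeaderPhis) {
    bool Canonical =
        Phi.IsWidenIntOrFpInduction && Phi.Start == 0 && Phi.Step == 1;
    if (Canonical && Phi.ScalarTypeId == WidenNewIV.ScalarTypeId)
      return &Phi;
  }
  return nullptr;
}
} // namespace
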
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index a82a562d5e35..e74409a86466 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -45,6 +45,10 @@ struct VPlanTransforms {
/// in the vectorized loop. There is no need to vectorize the cast - the same
/// value can be used for both the phi and casts in the vector loop.
static void removeRedundantInductionCasts(VPlan &Plan);
+
+ /// Try to replace a VPWidenCanonicalIVRecipe with the loop's existing
+ /// widened canonical induction (a canonical VPWidenIntOrFpInductionRecipe
+ /// of the same type), if one exists.
+ static void removeRedundantCanonicalIVs(VPlan &Plan);
};
} // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index fd92201614df..5296d2b9485c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -96,14 +96,15 @@ public:
VPVReplicateSC,
VPVWidenSC,
VPVWidenCallSC,
+ VPVWidenCanonicalIVSC,
VPVWidenGEPSC,
VPVWidenSelectSC,
// Phi-like VPValues. Need to be kept together.
VPVBlendSC,
+ VPVCanonicalIVPHISC,
VPVFirstOrderRecurrencePHISC,
VPVWidenPHISC,
- VPVWidenCanonicalIVSC,
VPVWidenIntOrFpInductionSC,
VPVPredInstPHI,
VPVReductionPHISC,
@@ -177,6 +178,7 @@ public:
void replaceAllUsesWith(VPValue *New);
VPDef *getDef() { return Def; }
+ const VPDef *getDef() const { return Def; }
/// Returns the underlying IR value, if this VPValue is defined outside the
/// scope of VPlan. Returns nullptr if the VPValue is defined by a VPDef
@@ -186,6 +188,11 @@ public:
"VPValue is not a live-in; it is defined by a VPDef inside a VPlan");
return getUnderlyingValue();
}
+ const Value *getLiveInIRValue() const {
+ assert(!getDef() &&
+ "VPValue is not a live-in; it is defined by a VPDef inside a VPlan");
+ return getUnderlyingValue();
+ }
};
typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
@@ -325,6 +332,7 @@ public:
VPReductionSC,
VPReplicateSC,
VPWidenCallSC,
+ VPWidenCanonicalIVSC,
VPWidenGEPSC,
VPWidenMemoryInstructionSC,
VPWidenSC,
@@ -332,9 +340,9 @@ public:
// Phi-like recipes. Need to be kept together.
VPBlendSC,
+ VPCanonicalIVPHISC,
VPFirstOrderRecurrencePHISC,
VPWidenPHISC,
- VPWidenCanonicalIVSC,
VPWidenIntOrFpInductionSC,
VPPredInstPHISC,
VPReductionPHISC,
@@ -403,7 +411,6 @@ public:
class VPlan;
class VPBasicBlock;
-class VPRegionBlock;
/// This class can be used to assign consecutive numbers to all VPValues in a
/// VPlan and allows querying the numbering for printing, similar to the
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 7732d9367985..d36f250995e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -163,12 +163,32 @@ bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
errs() << "VPlan entry block is not a VPBasicBlock\n";
return false;
}
+
+ if (!isa<VPCanonicalIVPHIRecipe>(&*Entry->begin())) {
+ errs() << "VPlan vector loop header does not start with a "
+ "VPCanonicalIVPHIRecipe\n";
+ return false;
+ }
+
const VPBasicBlock *Exit = dyn_cast<VPBasicBlock>(TopRegion->getExit());
if (!Exit) {
errs() << "VPlan exit block is not a VPBasicBlock\n";
return false;
}
+ if (Exit->empty()) {
+ errs() << "VPlan vector loop exit must end with BranchOnCount "
+ "VPInstruction but is empty\n";
+ return false;
+ }
+
+ auto *LastInst = dyn_cast<VPInstruction>(std::prev(Exit->end()));
+ if (!LastInst || LastInst->getOpcode() != VPInstruction::BranchOnCount) {
+ errs() << "VPlan vector loop exit must end with BranchOnCount "
+ "VPInstruction\n";
+ return false;
+ }
+
for (const VPRegionBlock *Region :
VPBlockUtils::blocksOnly<const VPRegionBlock>(
depth_first(VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(
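
The added verifier rules are purely structural: the vector-loop header must begin with the canonical IV phi and the exit block must end with a BranchOnCount VPInstruction. A compact sketch of equivalent checks over a hypothetical block representation:

#include <string>
#include <vector>

namespace {
struct ToyBlock {
  std::vector<std::string> Recipes; // recipe kinds, in block order
};

// Structural checks in the spirit of the verifier additions above.
bool verifyToyVectorLoop(const ToyBlock &Header, const ToyBlock &Exit) {
  if (Header.Recipes.empty() || Header.Recipes.front() != "canonical-iv-phi")
    return false; // header must start with the canonical IV phi
  if (Exit.Recipes.empty() || Exit.Recipes.back() != "branch-on-count")
    return false; // exit must end with BranchOnCount
  return true;
}
} // namespace
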
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c0aedab2fed0..620d388199e0 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -881,7 +881,8 @@ static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy,
ConstantRange IdxRange(IntWidth, true);
if (isGuaranteedNotToBePoison(Idx, &AC)) {
- if (ValidIndices.contains(computeConstantRange(Idx, true, &AC, CtxI, &DT)))
+ if (ValidIndices.contains(computeConstantRange(Idx, /*ForSigned=*/false,
+ true, &AC, CtxI, &DT)))
return ScalarizationResult::safe();
return ScalarizationResult::unsafe();
}
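
The scalarization check above succeeds only when the index's computed constant range is fully contained in the valid index range [0, NumElements). A tiny sketch of that containment test with plain half-open integer ranges (ignoring wrapped ranges, which the real ConstantRange handles):

#include <cstdint>

// Half-open range [Lo, Hi) of possible index values.
struct IndexRange { int64_t Lo, Hi; };

// The access is provably in bounds if every possible index lies in
// [0, NumElements).
bool indexKnownInBounds(IndexRange Idx, int64_t NumElements) {
  return Idx.Lo >= 0 && Idx.Hi <= NumElements;
}
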
diff --git a/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp b/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
index 1be1d34417eb..40c03f7b0de7 100644
--- a/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
+++ b/llvm/lib/WindowsManifest/WindowsManifestMerger.cpp
@@ -669,7 +669,7 @@ WindowsManifestMerger::WindowsManifestMergerImpl::getMergedManifest() {
std::unique_ptr<xmlDoc, XmlDeleter> OutputDoc(
xmlNewDoc((const unsigned char *)"1.0"));
xmlDocSetRootElement(OutputDoc.get(), CombinedRoot);
- assert(0 == xmlDocGetRootElement(CombinedDoc));
+ assert(nullptr == xmlDocGetRootElement(CombinedDoc));
xmlKeepBlanksDefault(0);
xmlChar *Buff = nullptr;