Diffstat (limited to 'llvm/lib')
-rw-r--r--  llvm/lib/Analysis/AliasSetTracker.cpp | 2
-rw-r--r--  llvm/lib/Analysis/BasicAliasAnalysis.cpp | 2
-rw-r--r--  llvm/lib/Analysis/BranchProbabilityInfo.cpp | 2
-rw-r--r--  llvm/lib/Analysis/CFG.cpp | 2
-rw-r--r--  llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp | 2
-rw-r--r--  llvm/lib/Analysis/CFLGraph.h | 3
-rw-r--r--  llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp | 2
-rw-r--r--  llvm/lib/Analysis/ConstraintSystem.cpp | 2
-rw-r--r--  llvm/lib/Analysis/CostModel.cpp | 26
-rw-r--r--  llvm/lib/Analysis/DDG.cpp | 4
-rw-r--r--  llvm/lib/Analysis/Delinearization.cpp | 2
-rw-r--r--  llvm/lib/Analysis/DivergenceAnalysis.cpp | 8
-rw-r--r--  llvm/lib/Analysis/GlobalsModRef.cpp | 4
-rw-r--r--  llvm/lib/Analysis/IVDescriptors.cpp | 16
-rw-r--r--  llvm/lib/Analysis/IVUsers.cpp | 2
-rw-r--r--  llvm/lib/Analysis/InstructionPrecedenceTracking.cpp | 4
-rw-r--r--  llvm/lib/Analysis/LazyValueInfo.cpp | 6
-rw-r--r--  llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp | 4
-rw-r--r--  llvm/lib/Analysis/Lint.cpp | 2
-rw-r--r--  llvm/lib/Analysis/LoopAccessAnalysis.cpp | 184
-rw-r--r--  llvm/lib/Analysis/LoopCacheAnalysis.cpp | 29
-rw-r--r--  llvm/lib/Analysis/LoopInfo.cpp | 19
-rw-r--r--  llvm/lib/Analysis/MemoryBuiltins.cpp | 125
-rw-r--r--  llvm/lib/Analysis/MemoryDependenceAnalysis.cpp | 10
-rw-r--r--  llvm/lib/Analysis/MemoryProfileInfo.cpp | 226
-rw-r--r--  llvm/lib/Analysis/MemorySSAUpdater.cpp | 16
-rw-r--r--  llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp | 2
-rw-r--r--  llvm/lib/Analysis/ModuleSummaryAnalysis.cpp | 32
-rw-r--r--  llvm/lib/Analysis/MustExecute.cpp | 18
-rw-r--r--  llvm/lib/Analysis/ScalarEvolution.cpp | 139
-rw-r--r--  llvm/lib/Analysis/StackLifetime.cpp | 2
-rw-r--r--  llvm/lib/Analysis/StackSafetyAnalysis.cpp | 20
-rw-r--r--  llvm/lib/Analysis/TargetTransformInfo.cpp | 10
-rw-r--r--  llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp | 5
-rw-r--r--  llvm/lib/Analysis/ValueTracking.cpp | 12
-rw-r--r--  llvm/lib/Analysis/VectorUtils.cpp | 4
-rw-r--r--  llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp | 3
-rw-r--r--  llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 55
-rw-r--r--  llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 27
-rw-r--r--  llvm/lib/Bitcode/Reader/ValueList.h | 1
-rw-r--r--  llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 5
-rw-r--r--  llvm/lib/Bitcode/Writer/ValueEnumerator.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp | 12
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 55
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/AtomicExpandPass.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/BasicBlockSections.cpp | 8
-rw-r--r--  llvm/lib/CodeGen/CalcSpillWeights.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/CodeGenPrepare.cpp | 48
-rw-r--r--  llvm/lib/CodeGen/DFAPacketizer.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/EarlyIfConversion.cpp | 10
-rw-r--r--  llvm/lib/CodeGen/ExpandVectorPredication.cpp | 87
-rw-r--r--  llvm/lib/CodeGen/FaultMaps.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/CallLowering.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 20
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 89
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 30
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/GlobalISel/Utils.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/HardwareLoops.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/ImplicitNullChecks.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/InlineSpiller.cpp | 22
-rw-r--r--  llvm/lib/CodeGen/InterleavedAccessPass.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp | 8
-rw-r--r--  llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp | 49
-rw-r--r--  llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/LiveDebugVariables.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/LiveIntervals.cpp | 13
-rw-r--r--  llvm/lib/CodeGen/LiveRangeEdit.cpp | 23
-rw-r--r--  llvm/lib/CodeGen/LiveVariables.cpp | 3
-rw-r--r--  llvm/lib/CodeGen/LowerEmuTLS.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/MIRCanonicalizerPass.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/MIRParser/MIParser.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp | 8
-rw-r--r--  llvm/lib/CodeGen/MachineBasicBlock.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/MachineBlockPlacement.cpp | 8
-rw-r--r--  llvm/lib/CodeGen/MachineCSE.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/MachineCombiner.cpp | 49
-rw-r--r--  llvm/lib/CodeGen/MachineFrameInfo.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/MachineFunction.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/MachineFunctionSplitter.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/MachineInstr.cpp | 12
-rw-r--r--  llvm/lib/CodeGen/MachineLICM.cpp | 17
-rw-r--r--  llvm/lib/CodeGen/MachinePipeliner.cpp | 24
-rw-r--r--  llvm/lib/CodeGen/MachineScheduler.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/MachineSink.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/MachineStableHash.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/MachineTraceMetrics.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/MachineVerifier.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/RDFGraph.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/RDFLiveness.cpp | 12
-rw-r--r--  llvm/lib/CodeGen/ReachingDefAnalysis.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/RegAllocBase.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/RegAllocBasic.cpp | 1
-rw-r--r--  llvm/lib/CodeGen/RegAllocFast.cpp | 7
-rw-r--r--  llvm/lib/CodeGen/RegAllocGreedy.cpp | 16
-rw-r--r--  llvm/lib/CodeGen/RegAllocGreedy.h | 3
-rw-r--r--  llvm/lib/CodeGen/RegAllocPBQP.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/RegAllocScore.cpp | 5
-rw-r--r--  llvm/lib/CodeGen/RegAllocScore.h | 4
-rw-r--r--  llvm/lib/CodeGen/RegisterCoalescer.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/RegisterPressure.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/SafeStack.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/ScheduleDAGInstrs.cpp | 8
-rw-r--r--  llvm/lib/CodeGen/SelectOptimize.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 327
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 12
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 18
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 140
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 5
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 24
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 131
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 128
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 95
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp | 42
-rw-r--r--  llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 114
-rw-r--r--  llvm/lib/CodeGen/SlotIndexes.cpp | 11
-rw-r--r--  llvm/lib/CodeGen/SplitKit.cpp | 14
-rw-r--r--  llvm/lib/CodeGen/SplitKit.h | 7
-rw-r--r--  llvm/lib/CodeGen/StackMaps.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/SwiftErrorValueTracking.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/TailDuplicator.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/TargetInstrInfo.cpp | 4
-rw-r--r--  llvm/lib/CodeGen/TargetLoweringBase.cpp | 2
-rw-r--r--  llvm/lib/CodeGen/TwoAddressInstructionPass.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/TypePromotion.cpp | 6
-rw-r--r--  llvm/lib/CodeGen/VLIWMachineScheduler.cpp | 2
-rw-r--r--  llvm/lib/DWARFLinker/DWARFLinker.cpp | 64
-rw-r--r--  llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp | 6
-rw-r--r--  llvm/lib/DWARFLinker/DWARFStreamer.cpp | 20
-rw-r--r--  llvm/lib/DWP/DWP.cpp | 19
-rw-r--r--  llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp | 6
-rw-r--r--  llvm/lib/DebugInfo/DWARF/DWARFContext.cpp | 2
-rw-r--r--  llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 4
-rw-r--r--  llvm/lib/DebugInfo/Symbolize/Markup.cpp | 3
-rw-r--r--  llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp | 389
-rw-r--r--  llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp | 2
-rw-r--r--  llvm/lib/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.cpp | 2
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp | 4
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp | 135
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp | 254
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp | 15
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp | 341
-rw-r--r--  llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp | 3
-rw-r--r--  llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 31
-rw-r--r--  llvm/lib/IR/Function.cpp | 2
-rw-r--r--  llvm/lib/IR/GCStrategy.cpp | 5
-rw-r--r--  llvm/lib/IR/Globals.cpp | 37
-rw-r--r--  llvm/lib/IR/InlineAsm.cpp | 20
-rw-r--r--  llvm/lib/IR/Instructions.cpp | 13
-rw-r--r--  llvm/lib/IR/IntrinsicInst.cpp | 22
-rw-r--r--  llvm/lib/IR/Verifier.cpp | 110
-rw-r--r--  llvm/lib/Linker/IRMover.cpp | 32
-rw-r--r--  llvm/lib/MC/ELFObjectWriter.cpp | 82
-rw-r--r--  llvm/lib/MC/MCContext.cpp | 18
-rw-r--r--  llvm/lib/MC/MCMachOStreamer.cpp | 18
-rw-r--r--  llvm/lib/MC/MCParser/MasmParser.cpp | 33
-rw-r--r--  llvm/lib/MC/MCPseudoProbe.cpp | 3
-rw-r--r--  llvm/lib/MC/MachObjectWriter.cpp | 25
-rw-r--r--  llvm/lib/MC/WinCOFFObjectWriter.cpp | 1
-rw-r--r--  llvm/lib/MC/XCOFFObjectWriter.cpp | 43
-rw-r--r--  llvm/lib/ObjCopy/ELF/ELFObject.cpp | 3
-rw-r--r--  llvm/lib/Object/Archive.cpp | 61
-rw-r--r--  llvm/lib/Object/Decompressor.cpp | 39
-rw-r--r--  llvm/lib/Object/WasmObjectFile.cpp | 2
-rw-r--r--  llvm/lib/Passes/PassBuilderPipelines.cpp | 2
-rw-r--r--  llvm/lib/Passes/StandardInstrumentations.cpp | 84
-rw-r--r--  llvm/lib/ProfileData/Coverage/CoverageMapping.cpp | 12
-rw-r--r--  llvm/lib/ProfileData/InstrProfReader.cpp | 38
-rw-r--r--  llvm/lib/Support/AddressRanges.cpp | 57
-rw-r--r--  llvm/lib/Support/CommandLine.cpp | 2
-rw-r--r--  llvm/lib/Support/Compression.cpp | 65
-rw-r--r--  llvm/lib/Support/DivisionByConstantInfo.cpp | 8
-rw-r--r--  llvm/lib/Support/RISCVISAInfo.cpp | 2
-rw-r--r--  llvm/lib/TableGen/JSONBackend.cpp | 4
-rw-r--r--  llvm/lib/TableGen/Record.cpp | 8
-rw-r--r--  llvm/lib/TableGen/TGParser.cpp | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64.td | 2
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 55
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 119
-rw-r--r--  llvm/lib/Target/AArch64/AArch64ISelLowering.h | 9
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrFormats.td | 6
-rw-r--r--  llvm/lib/Target/AArch64/AArch64InstrInfo.td | 25
-rw-r--r--  llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp | 4
-rw-r--r--  llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td | 36
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetMachine.cpp | 24
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 86
-rw-r--r--  llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h | 10
-rw-r--r--  llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp | 65
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.td | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 67
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributes.def | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 19
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 76
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 14
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 166
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 32
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 23
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 45
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 16
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp | 47
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 20
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 16
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 50
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 102
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 91
-rw-r--r--  llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNProcessors.td | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSchedStrategy.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/GCNSubtarget.h | 7
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 5
-rw-r--r--  llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 323
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 40
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrFormats.td | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 23
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.td | 110
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 11
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 309
-rw-r--r--  llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 82
-rw-r--r--  llvm/lib/Target/AMDGPU/SIProgramInfo.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/SMInstructions.td | 119
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h | 2
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP1Instructions.td | 78
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td | 100
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 38
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPCInstructions.td | 26
-rw-r--r--  llvm/lib/Target/AMDGPU/VOPInstructions.td | 42
-rw-r--r--  llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp | 4
-rw-r--r--  llvm/lib/Target/ARM/ARMBaseInstrInfo.h | 3
-rw-r--r--  llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp | 15
-rw-r--r--  llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 6
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.cpp | 42
-rw-r--r--  llvm/lib/Target/ARM/ARMISelLowering.h | 2
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrMVE.td | 6
-rw-r--r--  llvm/lib/Target/ARM/ARMInstrThumb.td | 8
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp | 14
-rw-r--r--  llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 8
-rw-r--r--  llvm/lib/Target/AVR/AVRSubtarget.h | 14
-rw-r--r--  llvm/lib/Target/CSKY/CSKYInstrInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp | 8
-rw-r--r--  llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp | 2
-rw-r--r--  llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp | 1
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp | 5
-rw-r--r--  llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp | 4
-rw-r--r--  llvm/lib/Target/Mips/MipsISelLowering.cpp | 6
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 128
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h | 100
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 95
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 1
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXUtilities.h | 10
-rw-r--r--  llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 25
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 4
-rw-r--r--  llvm/lib/Target/PowerPC/PPCInstrInfo.h | 3
-rw-r--r--  llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp | 13
-rw-r--r--  llvm/lib/Target/RISCV/RISCV.h | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCV.td | 26
-rw-r--r--  llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 169
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 27
-rw-r--r--  llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 376
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 112
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.h | 13
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfo.td | 7
-rw-r--r--  llvm/lib/Target/RISCV/RISCVInstrInfoM.td | 22
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp | 3
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 8
-rw-r--r--  llvm/lib/Target/RISCV/RISCVSubtarget.h | 2
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 7
-rw-r--r--  llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 5
-rw-r--r--  llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp | 10
-rw-r--r--  llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h | 13
-rw-r--r--  llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp | 15
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRV.h | 1
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp | 164
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp | 201
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVCallLowering.h | 6
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp | 2
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h | 4
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 61
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 425
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h | 53
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp | 14
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstrInfo.h | 1
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 24
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 252
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp | 7
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 168
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h | 8
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 75
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp | 288
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 3
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp | 5
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 31
-rw-r--r--  llvm/lib/Target/SPIRV/SPIRVUtils.h | 6
-rw-r--r--  llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZCallingConv.cpp | 4
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZCallingConv.h | 38
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZCallingConv.td | 34
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp | 101
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZFrameLowering.h | 2
-rw-r--r--  llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 8
-rw-r--r--  llvm/lib/Target/VE/VEInstrInfo.cpp | 40
-rw-r--r--  llvm/lib/Target/VE/VEInstrPatternsVec.td | 14
-rw-r--r--  llvm/lib/Target/VE/VEInstrVec.td | 27
-rw-r--r--  llvm/lib/Target/VE/VERegisterInfo.cpp | 201
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h | 3
-rw-r--r--  llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp | 23
-rw-r--r--  llvm/lib/Target/X86/X86.td | 2
-rw-r--r--  llvm/lib/Target/X86/X86FixupBWInsts.cpp | 12
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 561
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td | 48
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.cpp | 12
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.h | 3
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td | 70
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.cpp | 8
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 65
-rw-r--r--  llvm/lib/Target/XCore/XCoreFrameLowering.cpp | 2
-rw-r--r--  llvm/lib/ToolDrivers/llvm-lib/Options.td | 1
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroCleanup.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroInternal.h | 2
-rw-r--r--  llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 19
-rw-r--r--  llvm/lib/Transforms/IPO/Attributor.cpp | 278
-rw-r--r--  llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 2074
-rw-r--r--  llvm/lib/Transforms/IPO/FunctionImport.cpp | 45
-rw-r--r--  llvm/lib/Transforms/IPO/GlobalOpt.cpp | 2
-rw-r--r--  llvm/lib/Transforms/IPO/IPO.cpp | 4
-rw-r--r--  llvm/lib/Transforms/IPO/Internalize.cpp | 36
-rw-r--r--  llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 42
-rw-r--r--  llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 2
-rw-r--r--  llvm/lib/Transforms/IPO/PassManagerBuilder.cpp | 106
-rw-r--r--  llvm/lib/Transforms/IPO/SampleProfile.cpp | 102
-rw-r--r--  llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 70
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp | 12
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 45
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp | 55
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp | 57
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineInternal.h | 2
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp | 18
-rw-r--r--  llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 52
-rw-r--r--  llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 3
-rw-r--r--  llvm/lib/Transforms/Instrumentation/CGProfile.cpp | 36
-rw-r--r--  llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp | 53
-rw-r--r--  llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp | 31
-rw-r--r--  llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp | 44
-rw-r--r--  llvm/lib/Transforms/Instrumentation/Instrumentation.cpp | 5
-rw-r--r--  llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp | 16
-rw-r--r--  llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp | 42
-rw-r--r--  llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp | 64
-rw-r--r--  llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp | 73
-rw-r--r--  llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 5
-rw-r--r--  llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 4
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopInterchange.cpp | 1
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopPredication.cpp | 29
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 12
-rw-r--r--  llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 93
-rw-r--r--  llvm/lib/Transforms/Scalar/Reassociate.cpp | 58
-rw-r--r--  llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 14
-rw-r--r--  llvm/lib/Transforms/Scalar/Scalar.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 59
-rw-r--r--  llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 13
-rw-r--r--  llvm/lib/Transforms/Utils/BuildLibCalls.cpp | 228
-rw-r--r--  llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp | 28
-rw-r--r--  llvm/lib/Transforms/Utils/CloneFunction.cpp | 19
-rw-r--r--  llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp | 59
-rw-r--r--  llvm/lib/Transforms/Utils/Evaluator.cpp | 12
-rw-r--r--  llvm/lib/Transforms/Utils/FunctionImportUtils.cpp | 7
-rw-r--r--  llvm/lib/Transforms/Utils/InlineFunction.cpp | 10
-rw-r--r--  llvm/lib/Transforms/Utils/Local.cpp | 27
-rw-r--r--  llvm/lib/Transforms/Utils/LoopRotationUtils.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Utils/LoopSimplify.cpp | 6
-rw-r--r--  llvm/lib/Transforms/Utils/LoopUtils.cpp | 10
-rw-r--r--  llvm/lib/Transforms/Utils/LowerAtomic.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Utils/NameAnonGlobals.cpp | 31
-rw-r--r--  llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp | 19
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 105
-rw-r--r--  llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 11
-rw-r--r--  llvm/lib/Transforms/Utils/UnifyLoopExits.cpp | 46
-rw-r--r--  llvm/lib/Transforms/Utils/Utils.cpp | 2
-rw-r--r--  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 245
-rw-r--r--  llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 11
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h | 6
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 46
-rw-r--r--  llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 66
-rw-r--r--  llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2
419 files changed, 11599 insertions, 5293 deletions
diff --git a/llvm/lib/Analysis/AliasSetTracker.cpp b/llvm/lib/Analysis/AliasSetTracker.cpp
index 234a73bff6a8..bb25244a88dd 100644
--- a/llvm/lib/Analysis/AliasSetTracker.cpp
+++ b/llvm/lib/Analysis/AliasSetTracker.cpp
@@ -579,7 +579,7 @@ AliasSet &AliasSetTracker::mergeAllAliasSets() {
AliasAnyAS->Access = AliasSet::ModRefAccess;
AliasAnyAS->AliasAny = true;
- for (auto Cur : ASVector) {
+ for (auto *Cur : ASVector) {
// If Cur was already forwarding, just forward to the new AS instead.
AliasSet *FwdTo = Cur->Forward;
if (FwdTo) {
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index c78f822b8bcf..c3b032abcba2 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1764,7 +1764,7 @@ bool BasicAAResult::isValueEqualInPotentialCycles(const Value *V,
// Make sure that the visited phis cannot reach the Value. This ensures that
// the Values cannot come from different iterations of a potential cycle the
// phi nodes could be involved in.
- for (auto *P : VisitedPhiBBs)
+ for (const auto *P : VisitedPhiBBs)
if (isPotentiallyReachable(&P->front(), Inst, nullptr, DT))
return false;
diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
index 428ae8975c30..f45728768fcd 100644
--- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
@@ -1250,7 +1250,7 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LoopI,
// Walk the basic blocks in post-order so that we can build up state about
// the successors of a block iteratively.
- for (auto BB : post_order(&F.getEntryBlock())) {
+ for (const auto *BB : post_order(&F.getEntryBlock())) {
LLVM_DEBUG(dbgs() << "Computing probabilities for " << BB->getName()
<< "\n");
// If there is no at least two successors, no sense to set probability.
diff --git a/llvm/lib/Analysis/CFG.cpp b/llvm/lib/Analysis/CFG.cpp
index 1902d72f2f89..e5dd45842d6a 100644
--- a/llvm/lib/Analysis/CFG.cpp
+++ b/llvm/lib/Analysis/CFG.cpp
@@ -149,7 +149,7 @@ bool llvm::isPotentiallyReachableFromMany(
// untrue.
SmallPtrSet<const Loop *, 8> LoopsWithHoles;
if (LI && ExclusionSet) {
- for (auto BB : *ExclusionSet) {
+ for (auto *BB : *ExclusionSet) {
if (const Loop *L = getOutermostLoop(LI, BB))
LoopsWithHoles.insert(L);
}
diff --git a/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp b/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp
index 602a01867f3b..d70e1b21d768 100644
--- a/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp
@@ -615,7 +615,7 @@ static void initializeWorkList(std::vector<WorkListItem> &WorkList,
auto Src = InstantiatedValue{Val, I};
// If there's an assignment edge from X to Y, it means Y is reachable from
// X at S3 and X is reachable from Y at S1
- for (auto &Edge : ValueInfo.getNodeInfoAtLevel(I).Edges) {
+ for (const auto &Edge : ValueInfo.getNodeInfoAtLevel(I).Edges) {
propagate(Edge.Other, Src, MatchState::FlowFromReadOnly, ReachSet,
WorkList);
propagate(Src, Edge.Other, MatchState::FlowToWriteOnly, ReachSet,
diff --git a/llvm/lib/Analysis/CFLGraph.h b/llvm/lib/Analysis/CFLGraph.h
index 60fc8d18678c..47bb02ac8e8b 100644
--- a/llvm/lib/Analysis/CFLGraph.h
+++ b/llvm/lib/Analysis/CFLGraph.h
@@ -434,7 +434,8 @@ template <typename CFLAA> class CFLGraphBuilder {
// introduce any aliases.
// TODO: address other common library functions such as realloc(),
// strdup(), etc.
- if (isMallocOrCallocLikeFn(&Call, &TLI) || isFreeCall(&Call, &TLI))
+ if (isMallocOrCallocLikeFn(&Call, &TLI) ||
+ getFreedOperand(&Call, &TLI) != nullptr)
return;
// TODO: Add support for noalias args/all the other fun function
diff --git a/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp b/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
index f92869c2ec63..33ed6f88f82b 100644
--- a/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/CFLSteensAliasAnalysis.cpp
@@ -216,7 +216,7 @@ CFLSteensAAResult::FunctionInfo CFLSteensAAResult::buildSetsFrom(Function *Fn) {
for (unsigned I = 0, E = ValueInfo.getNumLevels(); I < E; ++I) {
auto Src = InstantiatedValue{Val, I};
- for (auto &Edge : ValueInfo.getNodeInfoAtLevel(I).Edges)
+ for (const auto &Edge : ValueInfo.getNodeInfoAtLevel(I).Edges)
SetBuilder.addWith(Src, Edge.Other);
}
}
diff --git a/llvm/lib/Analysis/ConstraintSystem.cpp b/llvm/lib/Analysis/ConstraintSystem.cpp
index dc774728ab3d..2efa474f3552 100644
--- a/llvm/lib/Analysis/ConstraintSystem.cpp
+++ b/llvm/lib/Analysis/ConstraintSystem.cpp
@@ -110,7 +110,7 @@ void ConstraintSystem::dump(ArrayRef<std::string> Names) const {
if (Constraints.empty())
return;
- for (auto &Row : Constraints) {
+ for (const auto &Row : Constraints) {
SmallVector<std::string, 16> Parts;
for (unsigned I = 1, S = Row.size(); I < S; ++I) {
if (Row[I] == 0)
diff --git a/llvm/lib/Analysis/CostModel.cpp b/llvm/lib/Analysis/CostModel.cpp
index 52e424ae324b..3c162f604cd5 100644
--- a/llvm/lib/Analysis/CostModel.cpp
+++ b/llvm/lib/Analysis/CostModel.cpp
@@ -25,6 +25,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/IntrinsicInst.h"
using namespace llvm;
static cl::opt<TargetTransformInfo::TargetCostKind> CostKind(
@@ -39,6 +40,9 @@ static cl::opt<TargetTransformInfo::TargetCostKind> CostKind(
clEnumValN(TargetTransformInfo::TCK_SizeAndLatency,
"size-latency", "Code size and latency")));
+static cl::opt<bool> TypeBasedIntrinsicCost("type-based-intrinsic-cost",
+ cl::desc("Calculate intrinsics cost based only on argument types"),
+ cl::init(false));
#define CM_NAME "cost-model"
#define DEBUG_TYPE CM_NAME
@@ -103,7 +107,16 @@ void CostModelAnalysis::print(raw_ostream &OS, const Module*) const {
for (BasicBlock &B : *F) {
for (Instruction &Inst : B) {
- InstructionCost Cost = TTI->getInstructionCost(&Inst, CostKind);
+ InstructionCost Cost;
+ if (TypeBasedIntrinsicCost && isa<IntrinsicInst>(&Inst)) {
+ auto *II = dyn_cast<IntrinsicInst>(&Inst);
+ IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II,
+ InstructionCost::getInvalid(), true);
+ Cost = TTI->getIntrinsicInstrCost(ICA, CostKind);
+ }
+ else {
+ Cost = TTI->getInstructionCost(&Inst, CostKind);
+ }
if (auto CostVal = Cost.getValue())
OS << "Cost Model: Found an estimated cost of " << *CostVal;
else
@@ -122,7 +135,16 @@ PreservedAnalyses CostModelPrinterPass::run(Function &F,
for (Instruction &Inst : B) {
// TODO: Use a pass parameter instead of cl::opt CostKind to determine
// which cost kind to print.
- InstructionCost Cost = TTI.getInstructionCost(&Inst, CostKind);
+ InstructionCost Cost;
+ if (TypeBasedIntrinsicCost && isa<IntrinsicInst>(&Inst)) {
+ auto *II = dyn_cast<IntrinsicInst>(&Inst);
+ IntrinsicCostAttributes ICA(II->getIntrinsicID(), *II,
+ InstructionCost::getInvalid(), true);
+ Cost = TTI.getIntrinsicInstrCost(ICA, CostKind);
+ }
+ else {
+ Cost = TTI.getInstructionCost(&Inst, CostKind);
+ }
if (auto CostVal = Cost.getValue())
OS << "Cost Model: Found an estimated cost of " << *CostVal;
else
diff --git a/llvm/lib/Analysis/DDG.cpp b/llvm/lib/Analysis/DDG.cpp
index 998c888dd2d9..da64ef153960 100644
--- a/llvm/lib/Analysis/DDG.cpp
+++ b/llvm/lib/Analysis/DDG.cpp
@@ -95,7 +95,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const DDGNode &N) {
llvm_unreachable("unimplemented type of node");
OS << (N.getEdges().empty() ? " Edges:none!\n" : " Edges:\n");
- for (auto &E : N.getEdges())
+ for (const auto &E : N.getEdges())
OS.indent(2) << *E;
return OS;
}
@@ -188,7 +188,7 @@ DataDependenceGraph::DataDependenceGraph(Function &F, DependenceInfo &D)
// Put the basic blocks in program order for correct dependence
// directions.
BasicBlockListType BBList;
- for (auto &SCC : make_range(scc_begin(&F), scc_end(&F)))
+ for (const auto &SCC : make_range(scc_begin(&F), scc_end(&F)))
append_range(BBList, SCC);
std::reverse(BBList.begin(), BBList.end());
DDGBuilder(*this, D, BBList).populate();
diff --git a/llvm/lib/Analysis/Delinearization.cpp b/llvm/lib/Analysis/Delinearization.cpp
index c36e1d922915..7ab91b9eaea4 100644
--- a/llvm/lib/Analysis/Delinearization.cpp
+++ b/llvm/lib/Analysis/Delinearization.cpp
@@ -133,7 +133,7 @@ struct SCEVCollectAddRecMultiplies {
if (auto *Mul = dyn_cast<SCEVMulExpr>(S)) {
bool HasAddRec = false;
SmallVector<const SCEV *, 0> Operands;
- for (auto Op : Mul->operands()) {
+ for (const auto *Op : Mul->operands()) {
const SCEVUnknown *Unknown = dyn_cast<SCEVUnknown>(Op);
if (Unknown && !isa<CallInst>(Unknown->getValue())) {
Operands.push_back(Op);
diff --git a/llvm/lib/Analysis/DivergenceAnalysis.cpp b/llvm/lib/Analysis/DivergenceAnalysis.cpp
index 1a4b09e0cac2..02c40d2640c1 100644
--- a/llvm/lib/Analysis/DivergenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DivergenceAnalysis.cpp
@@ -220,19 +220,19 @@ void DivergenceAnalysisImpl::analyzeLoopExitDivergence(
// phi nodes at the fringes of the dominance region
if (!DT.dominates(&LoopHeader, UserBlock)) {
// all PHI nodes of UserBlock become divergent
- for (auto &Phi : UserBlock->phis()) {
+ for (const auto &Phi : UserBlock->phis()) {
analyzeTemporalDivergence(Phi, OuterDivLoop);
}
continue;
}
// Taint outside users of values carried by OuterDivLoop.
- for (auto &I : *UserBlock) {
+ for (const auto &I : *UserBlock) {
analyzeTemporalDivergence(I, OuterDivLoop);
}
// visit all blocks in the dominance region
- for (auto *SuccBlock : successors(UserBlock)) {
+ for (const auto *SuccBlock : successors(UserBlock)) {
if (!Visited.insert(SuccBlock).second) {
continue;
}
@@ -399,7 +399,7 @@ DivergenceAnalysisPrinterPass::run(Function &F, FunctionAnalysisManager &FAM) {
}
for (const BasicBlock &BB : F) {
OS << "\n " << BB.getName() << ":\n";
- for (auto &I : BB.instructionsWithoutDebug()) {
+ for (const auto &I : BB.instructionsWithoutDebug()) {
OS << (DI.isDivergent(I) ? "DIVERGENT: " : " ");
OS << I << "\n";
}
diff --git a/llvm/lib/Analysis/GlobalsModRef.cpp b/llvm/lib/Analysis/GlobalsModRef.cpp
index db6eae0d962a..e6ef1c78a628 100644
--- a/llvm/lib/Analysis/GlobalsModRef.cpp
+++ b/llvm/lib/Analysis/GlobalsModRef.cpp
@@ -361,7 +361,7 @@ bool GlobalsAAResult::AnalyzeUsesOfPointer(Value *V,
if (Call->isDataOperand(&U)) {
// Detect calls to free.
if (Call->isArgOperand(&U) &&
- isFreeCall(I, &GetTLI(*Call->getFunction()))) {
+ getFreedOperand(Call, &GetTLI(*Call->getFunction())) == U) {
if (Writers)
Writers->insert(Call->getParent()->getParent());
} else {
@@ -906,7 +906,7 @@ ModRefInfo GlobalsAAResult::getModRefInfoForArgument(const CallBase *Call,
// Iterate through all the arguments to the called function. If any argument
// is based on GV, return the conservative result.
- for (auto &A : Call->args()) {
+ for (const auto &A : Call->args()) {
SmallVector<const Value*, 4> Objects;
getUnderlyingObjects(A, Objects);
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index e4d706ab045c..a51e974003f6 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -63,20 +63,6 @@ bool RecurrenceDescriptor::isFloatingPointRecurrenceKind(RecurKind Kind) {
return (Kind != RecurKind::None) && !isIntegerRecurrenceKind(Kind);
}
-bool RecurrenceDescriptor::isArithmeticRecurrenceKind(RecurKind Kind) {
- switch (Kind) {
- default:
- break;
- case RecurKind::Add:
- case RecurKind::Mul:
- case RecurKind::FAdd:
- case RecurKind::FMul:
- case RecurKind::FMulAdd:
- return true;
- }
- return false;
-}
-
/// Determines if Phi may have been type-promoted. If Phi has a single user
/// that ANDs the Phi with a type mask, return the user. RT is updated to
/// account for the narrower bit width represented by the mask, and the AND
@@ -1170,7 +1156,7 @@ RecurrenceDescriptor::getReductionOpChain(PHINode *Phi, Loop *L) const {
ExpectedUses = 2;
auto getNextInstruction = [&](Instruction *Cur) -> Instruction * {
- for (auto User : Cur->users()) {
+ for (auto *User : Cur->users()) {
Instruction *UI = cast<Instruction>(User);
if (isa<PHINode>(UI))
continue;
diff --git a/llvm/lib/Analysis/IVUsers.cpp b/llvm/lib/Analysis/IVUsers.cpp
index 5bde947bd851..830211658353 100644
--- a/llvm/lib/Analysis/IVUsers.cpp
+++ b/llvm/lib/Analysis/IVUsers.cpp
@@ -274,7 +274,7 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const {
OS << " ";
IVUse.getOperandValToReplace()->printAsOperand(OS, false);
OS << " = " << *getReplacementExpr(IVUse);
- for (auto PostIncLoop : IVUse.PostIncLoops) {
+ for (const auto *PostIncLoop : IVUse.PostIncLoops) {
OS << " (post-inc with loop ";
PostIncLoop->getHeader()->printAsOperand(OS, false);
OS << ")";
diff --git a/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp b/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp
index 9fee57c54b85..78e7f456ebc6 100644
--- a/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp
+++ b/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp
@@ -68,7 +68,7 @@ bool InstructionPrecedenceTracking::isPreceededBySpecialInstruction(
void InstructionPrecedenceTracking::fill(const BasicBlock *BB) {
FirstSpecialInsts.erase(BB);
- for (auto &I : *BB) {
+ for (const auto &I : *BB) {
NumInstScanned++;
if (isSpecialInstruction(&I)) {
FirstSpecialInsts[BB] = &I;
@@ -101,7 +101,7 @@ void InstructionPrecedenceTracking::validate(const BasicBlock *BB) const {
void InstructionPrecedenceTracking::validateAll() const {
// Check that for every known block the cached value is correct.
- for (auto &It : FirstSpecialInsts)
+ for (const auto &It : FirstSpecialInsts)
validate(It.first);
}
#endif
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index d49b20798c82..2fae260e0d8f 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -1896,7 +1896,7 @@ void LazyValueInfoAnnotatedWriter::emitBasicBlockStartAnnot(
const BasicBlock *BB, formatted_raw_ostream &OS) {
// Find if there are latticevalues defined for arguments of the function.
auto *F = BB->getParent();
- for (auto &Arg : F->args()) {
+ for (const auto &Arg : F->args()) {
ValueLatticeElement Result = LVIImpl->getValueInBlock(
const_cast<Argument *>(&Arg), const_cast<BasicBlock *>(BB));
if (Result.isUnknown())
@@ -1932,12 +1932,12 @@ void LazyValueInfoAnnotatedWriter::emitInstructionAnnot(
printResult(ParentBB);
// Print the LVI analysis results for the immediate successor blocks, that
// are dominated by `ParentBB`.
- for (auto *BBSucc : successors(ParentBB))
+ for (const auto *BBSucc : successors(ParentBB))
if (DT.dominates(ParentBB, BBSucc))
printResult(BBSucc);
// Print LVI in blocks where `I` is used.
- for (auto *U : I->users())
+ for (const auto *U : I->users())
if (auto *UseI = dyn_cast<Instruction>(U))
if (!isa<PHINode>(UseI) || DT.dominates(ParentBB, UseI->getParent()))
printResult(UseI->getParent());
diff --git a/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp b/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
index 491d44335f22..381d62a37662 100644
--- a/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
+++ b/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
@@ -393,14 +393,14 @@ void LegacyDivergenceAnalysis::print(raw_ostream &OS, const Module *) const {
return;
// Dumps all divergent values in F, arguments and then instructions.
- for (auto &Arg : F->args()) {
+ for (const auto &Arg : F->args()) {
OS << (isDivergent(&Arg) ? "DIVERGENT: " : " ");
OS << Arg << "\n";
}
// Iterate instructions using instructions() to ensure a deterministic order.
for (const BasicBlock &BB : *F) {
OS << "\n " << BB.getName() << ":\n";
- for (auto &I : BB.instructionsWithoutDebug()) {
+ for (const auto &I : BB.instructionsWithoutDebug()) {
OS << (isDivergent(&I) ? "DIVERGENT: " : " ");
OS << I << "\n";
}
diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp
index 9cfb91a22b7d..8b0f2a8ed99b 100644
--- a/llvm/lib/Analysis/Lint.cpp
+++ b/llvm/lib/Analysis/Lint.cpp
@@ -229,7 +229,7 @@ void Lint::visitCallBase(CallBase &I) {
if (Formal->hasNoAliasAttr() && Actual->getType()->isPointerTy()) {
AttributeList PAL = I.getAttributes();
unsigned ArgNo = 0;
- for (auto BI = I.arg_begin(); BI != AE; ++BI, ++ArgNo) {
+ for (auto *BI = I.arg_begin(); BI != AE; ++BI, ++ArgNo) {
// Skip ByVal arguments since they will be memcpy'd to the callee's
// stack so we're not really passing the pointer anyway.
if (PAL.hasParamAttr(ArgNo, Attribute::ByVal))
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 79161db9b5e4..bed684b7652a 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -130,6 +130,11 @@ static cl::opt<bool> EnableForwardingConflictDetection(
cl::desc("Enable conflict detection in loop-access analysis"),
cl::init(true));
+static cl::opt<unsigned> MaxForkedSCEVDepth(
+ "max-forked-scev-depth", cl::Hidden,
+ cl::desc("Maximum recursion depth when finding forked SCEVs (default = 5)"),
+ cl::init(5));
+
bool VectorizerParams::isInterleaveForced() {
return ::VectorizationInterleave.getNumOccurrences() > 0;
}
@@ -288,8 +293,10 @@ void RuntimePointerChecking::tryToCreateDiffCheck(
DC.getInstructionsForAccess(Sink->PointerValue, Sink->IsWritePtr);
Type *SrcTy = getLoadStoreType(SrcInsts[0]);
Type *DstTy = getLoadStoreType(SinkInsts[0]);
- if (isa<ScalableVectorType>(SrcTy) || isa<ScalableVectorType>(DstTy))
+ if (isa<ScalableVectorType>(SrcTy) || isa<ScalableVectorType>(DstTy)) {
+ CanUseDiffCheck = false;
return;
+ }
unsigned AllocSize =
std::max(DL.getTypeAllocSize(SrcTy), DL.getTypeAllocSize(DstTy));
IntegerType *IntTy =
@@ -778,6 +785,140 @@ static void visitPointers(Value *StartPtr, const Loop &InnermostLoop,
}
}
+// Walk back through the IR for a pointer, looking for a select like the
+// following:
+//
+// %offset = select i1 %cmp, i64 %a, i64 %b
+// %addr = getelementptr double, double* %base, i64 %offset
+// %ld = load double, double* %addr, align 8
+//
+// We won't be able to form a single SCEVAddRecExpr from this since the
+// address for each loop iteration depends on %cmp. We could potentially
+// produce multiple valid SCEVAddRecExprs, though, and check all of them for
+// memory safety/aliasing if needed.
+//
+// If we encounter some IR we don't yet handle, or something obviously fine
+// like a constant, then we just add the SCEV for that term to the list passed
+// in by the caller. If we have a node that may potentially yield a valid
+// SCEVAddRecExpr then we decompose it into parts and build the SCEV terms
+// ourselves before adding to the list.
+static void
+findForkedSCEVs(ScalarEvolution *SE, const Loop *L, Value *Ptr,
+ SmallVectorImpl<std::pair<const SCEV *, bool>> &ScevList,
+ unsigned Depth) {
+ // If our Value is a SCEVAddRecExpr, loop invariant, not an instruction, or
+ // we've exceeded our limit on recursion, just return whatever we have
+ // regardless of whether it can be used for a forked pointer or not, along
+ // with an indication of whether it might be a poison or undef value.
+ const SCEV *Scev = SE->getSCEV(Ptr);
+ if (isa<SCEVAddRecExpr>(Scev) || L->isLoopInvariant(Ptr) ||
+ !isa<Instruction>(Ptr) || Depth == 0) {
+ ScevList.push_back(
+ std::make_pair(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr)));
+ return;
+ }
+
+ Depth--;
+
+ auto UndefPoisonCheck = [](std::pair<const SCEV *, bool> S) -> bool {
+ return S.second;
+ };
+
+ Instruction *I = cast<Instruction>(Ptr);
+ unsigned Opcode = I->getOpcode();
+ switch (Opcode) {
+ case Instruction::GetElementPtr: {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+ Type *SourceTy = GEP->getSourceElementType();
+ // We only handle base + single offset GEPs here for now.
+ // Not dealing with preexisting gathers yet, so no vectors.
+ if (I->getNumOperands() != 2 || SourceTy->isVectorTy()) {
+ ScevList.push_back(
+ std::make_pair(Scev, !isGuaranteedNotToBeUndefOrPoison(GEP)));
+ break;
+ }
+ SmallVector<std::pair<const SCEV *, bool>, 2> BaseScevs;
+ SmallVector<std::pair<const SCEV *, bool>, 2> OffsetScevs;
+ findForkedSCEVs(SE, L, I->getOperand(0), BaseScevs, Depth);
+ findForkedSCEVs(SE, L, I->getOperand(1), OffsetScevs, Depth);
+
+ // See if we need to freeze our fork...
+ bool NeedsFreeze = any_of(BaseScevs, UndefPoisonCheck) ||
+ any_of(OffsetScevs, UndefPoisonCheck);
+
+ // Check that we only have a single fork, on either the base or the offset.
+ // Copy the SCEV across for the one without a fork in order to generate
+ // the full SCEV for both sides of the GEP.
+ if (OffsetScevs.size() == 2 && BaseScevs.size() == 1)
+ BaseScevs.push_back(BaseScevs[0]);
+ else if (BaseScevs.size() == 2 && OffsetScevs.size() == 1)
+ OffsetScevs.push_back(OffsetScevs[0]);
+ else {
+ ScevList.push_back(std::make_pair(Scev, NeedsFreeze));
+ break;
+ }
+
+ // Find the pointer type we need to extend to.
+ Type *IntPtrTy = SE->getEffectiveSCEVType(
+ SE->getSCEV(GEP->getPointerOperand())->getType());
+
+ // Find the size of the type being pointed to. We only have a single
+ // index term (guarded above) so we don't need to index into arrays or
+ // structures, just get the size of the scalar value.
+ const SCEV *Size = SE->getSizeOfExpr(IntPtrTy, SourceTy);
+
+ // Scale up the offsets by the size of the type, then add to the bases.
+ const SCEV *Scaled1 = SE->getMulExpr(
+ Size, SE->getTruncateOrSignExtend(OffsetScevs[0].first, IntPtrTy));
+ const SCEV *Scaled2 = SE->getMulExpr(
+ Size, SE->getTruncateOrSignExtend(OffsetScevs[1].first, IntPtrTy));
+ ScevList.push_back(std::make_pair(
+ SE->getAddExpr(BaseScevs[0].first, Scaled1), NeedsFreeze));
+ ScevList.push_back(std::make_pair(
+ SE->getAddExpr(BaseScevs[1].first, Scaled2), NeedsFreeze));
+ break;
+ }
+ case Instruction::Select: {
+ SmallVector<std::pair<const SCEV *, bool>, 2> ChildScevs;
+ // A select means we've found a forked pointer, but we currently only
+ // support a single select per pointer so if there's another behind this
+ // then we just bail out and return the generic SCEV.
+ findForkedSCEVs(SE, L, I->getOperand(1), ChildScevs, Depth);
+ findForkedSCEVs(SE, L, I->getOperand(2), ChildScevs, Depth);
+ if (ChildScevs.size() == 2) {
+ ScevList.push_back(ChildScevs[0]);
+ ScevList.push_back(ChildScevs[1]);
+ } else
+ ScevList.push_back(
+ std::make_pair(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr)));
+ break;
+ }
+ default:
+ // Just return the current SCEV if we haven't handled the instruction yet.
+ LLVM_DEBUG(dbgs() << "ForkedPtr unhandled instruction: " << *I << "\n");
+ ScevList.push_back(
+ std::make_pair(Scev, !isGuaranteedNotToBeUndefOrPoison(Ptr)));
+ break;
+ }
+}
+
+static SmallVector<std::pair<const SCEV *, bool>>
+findForkedPointer(PredicatedScalarEvolution &PSE,
+ const ValueToValueMap &StridesMap, Value *Ptr,
+ const Loop *L) {
+ ScalarEvolution *SE = PSE.getSE();
+ assert(SE->isSCEVable(Ptr->getType()) && "Value is not SCEVable!");
+ SmallVector<std::pair<const SCEV *, bool>> Scevs;
+ findForkedSCEVs(SE, L, Ptr, Scevs, MaxForkedSCEVDepth);
+
+ // For now, we will only accept a forked pointer with two possible SCEVs.
+ if (Scevs.size() == 2)
+ return Scevs;
+
+ return {
+ std::make_pair(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false)};
+}
+
bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
MemAccessInfo Access, Type *AccessTy,
const ValueToValueMap &StridesMap,
@@ -787,19 +928,8 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
bool Assume) {
Value *Ptr = Access.getPointer();
- ScalarEvolution &SE = *PSE.getSE();
- SmallVector<std::pair<const SCEV *, bool>> TranslatedPtrs;
- auto *SI = dyn_cast<SelectInst>(Ptr);
- // Look through selects in the current loop.
- if (SI && !TheLoop->isLoopInvariant(SI)) {
- TranslatedPtrs = {
- std::make_pair(SE.getSCEV(SI->getOperand(1)),
- !isGuaranteedNotToBeUndefOrPoison(SI->getOperand(1))),
- std::make_pair(SE.getSCEV(SI->getOperand(2)),
- !isGuaranteedNotToBeUndefOrPoison(SI->getOperand(2)))};
- } else
- TranslatedPtrs = {
- std::make_pair(replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false)};
+ SmallVector<std::pair<const SCEV *, bool>> TranslatedPtrs =
+ findForkedPointer(PSE, StridesMap, Ptr, TheLoop);
for (auto &P : TranslatedPtrs) {
const SCEV *PtrExpr = P.first;
@@ -879,7 +1009,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
unsigned RunningDepId = 1;
DenseMap<Value *, unsigned> DepSetId;
- SmallVector<MemAccessInfo, 4> Retries;
+ SmallVector<std::pair<MemAccessInfo, Type *>, 4> Retries;
// First, count how many write and read accesses are in the alias set. Also
// collect MemAccessInfos for later.
@@ -911,13 +1041,13 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
}
for (auto &Access : AccessInfos) {
- for (auto &AccessTy : Accesses[Access]) {
+ for (const auto &AccessTy : Accesses[Access]) {
if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
DepSetId, TheLoop, RunningDepId, ASId,
ShouldCheckWrap, false)) {
LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
<< *Access.getPointer() << '\n');
- Retries.push_back(Access);
+ Retries.push_back({Access, AccessTy});
CanDoAliasSetRT = false;
}
}
@@ -941,15 +1071,15 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
// We know that we need these checks, so we can now be more aggressive
// and add further checks if required (overflow checks).
CanDoAliasSetRT = true;
- for (auto Access : Retries) {
- for (auto &AccessTy : Accesses[Access]) {
- if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
- DepSetId, TheLoop, RunningDepId, ASId,
- ShouldCheckWrap, /*Assume=*/true)) {
- CanDoAliasSetRT = false;
- UncomputablePtr = Access.getPointer();
- break;
- }
+ for (auto Retry : Retries) {
+ MemAccessInfo Access = Retry.first;
+ Type *AccessTy = Retry.second;
+ if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
+ DepSetId, TheLoop, RunningDepId, ASId,
+ ShouldCheckWrap, /*Assume=*/true)) {
+ CanDoAliasSetRT = false;
+ UncomputablePtr = Access.getPointer();
+ break;
}
}
}
@@ -2461,7 +2591,7 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
if (auto *Dependences = DepChecker->getDependences()) {
OS.indent(Depth) << "Dependences:\n";
- for (auto &Dep : *Dependences) {
+ for (const auto &Dep : *Dependences) {
Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
OS << "\n";
}
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
index 2cbf1f7f2d28..85f2dad86711 100644
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -289,18 +289,14 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
LLVM_DEBUG(dbgs() << "TripCount=" << *TripCount << "\n");
const SCEV *RefCost = nullptr;
- if (isConsecutive(L, CLS)) {
+ const SCEV *Stride = nullptr;
+ if (isConsecutive(L, Stride, CLS)) {
// If the indexed reference is 'consecutive' the cost is
// (TripCount*Stride)/CLS.
- const SCEV *Coeff = getLastCoefficient();
- const SCEV *ElemSize = Sizes.back();
- assert(Coeff->getType() == ElemSize->getType() &&
- "Expecting the same type");
- const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
+ assert(Stride != nullptr &&
+ "Stride should not be null for consecutive access!");
Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType());
const SCEV *CacheLineSize = SE.getConstant(WiderType, CLS);
- if (SE.isKnownNegative(Stride))
- Stride = SE.getNegativeSCEV(Stride);
Stride = SE.getNoopOrAnyExtend(Stride, WiderType);
TripCount = SE.getNoopOrAnyExtend(TripCount, WiderType);
const SCEV *Numerator = SE.getMulExpr(Stride, TripCount);
@@ -464,7 +460,8 @@ bool IndexedReference::isLoopInvariant(const Loop &L) const {
return allCoeffForLoopAreZero;
}
-bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const {
+bool IndexedReference::isConsecutive(const Loop &L, const SCEV *&Stride,
+ unsigned CLS) const {
// The indexed reference is 'consecutive' if the only coefficient that uses
// the loop induction variable is the last one...
const SCEV *LastSubscript = Subscripts.back();
@@ -478,7 +475,19 @@ bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const {
// ...and the access stride is less than the cache line size.
const SCEV *Coeff = getLastCoefficient();
const SCEV *ElemSize = Sizes.back();
- const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
+ Type *WiderType = SE.getWiderType(Coeff->getType(), ElemSize->getType());
+ // FIXME: This assumes that all values are signed integers which may
+ // be incorrect in unusual codes and incorrectly use sext instead of zext.
+ // for (uint32_t i = 0; i < 512; ++i) {
+ // uint8_t trunc = i;
+ // A[trunc] = 42;
+ // }
+ // This consecutively iterates twice over A. If `trunc` is sign-extended,
+ // we would conclude that this may iterate backwards over the array.
+ // However, LoopCacheAnalysis is heuristic anyway and transformations must
+ // not result in wrong optimizations if the heuristic was incorrect.
+ Stride = SE.getMulExpr(SE.getNoopOrSignExtend(Coeff, WiderType),
+ SE.getNoopOrSignExtend(ElemSize, WiderType));
const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS);
Stride = SE.isKnownNegative(Stride) ? SE.getNegativeSCEV(Stride) : Stride;
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index 29c2437ff5ea..693b9ebd450a 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -425,12 +425,12 @@ bool Loop::isCanonical(ScalarEvolution &SE) const {
// Check that 'BB' doesn't have any uses outside of the 'L'
static bool isBlockInLCSSAForm(const Loop &L, const BasicBlock &BB,
- const DominatorTree &DT) {
+ const DominatorTree &DT, bool IgnoreTokens) {
for (const Instruction &I : BB) {
// Tokens can't be used in PHI nodes and live-out tokens prevent loop
// optimizations, so for the purposes of considered LCSSA form, we
// can ignore them.
- if (I.getType()->isTokenTy())
+ if (IgnoreTokens && I.getType()->isTokenTy())
continue;
for (const Use &U : I.uses()) {
@@ -455,20 +455,20 @@ static bool isBlockInLCSSAForm(const Loop &L, const BasicBlock &BB,
return true;
}
-bool Loop::isLCSSAForm(const DominatorTree &DT) const {
+bool Loop::isLCSSAForm(const DominatorTree &DT, bool IgnoreTokens) const {
// For each block we check that it doesn't have any uses outside of this loop.
return all_of(this->blocks(), [&](const BasicBlock *BB) {
- return isBlockInLCSSAForm(*this, *BB, DT);
+ return isBlockInLCSSAForm(*this, *BB, DT, IgnoreTokens);
});
}
-bool Loop::isRecursivelyLCSSAForm(const DominatorTree &DT,
- const LoopInfo &LI) const {
+bool Loop::isRecursivelyLCSSAForm(const DominatorTree &DT, const LoopInfo &LI,
+ bool IgnoreTokens) const {
// For each block we check that it doesn't have any uses outside of its
// innermost loop. This process will transitively guarantee that the current
// loop and all of the nested loops are in LCSSA form.
return all_of(this->blocks(), [&](const BasicBlock *BB) {
- return isBlockInLCSSAForm(*LI.getLoopFor(BB), *BB, DT);
+ return isBlockInLCSSAForm(*LI.getLoopFor(BB), *BB, DT, IgnoreTokens);
});
}
@@ -482,11 +482,8 @@ bool Loop::isLoopSimplifyForm() const {
bool Loop::isSafeToClone() const {
// Return false if any loop blocks contain indirectbrs, or there are any calls
// to noduplicate functions.
- // FIXME: it should be ok to clone CallBrInst's if we correctly update the
- // operand list to reflect the newly cloned labels.
for (BasicBlock *BB : this->blocks()) {
- if (isa<IndirectBrInst>(BB->getTerminator()) ||
- isa<CallBrInst>(BB->getTerminator()))
+ if (isa<IndirectBrInst>(BB->getTerminator()))
return false;
for (Instruction &I : *BB)
diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp
index f5b121c98ec4..31e4380e4379 100644
--- a/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -179,6 +179,11 @@ static const Function *getCalledFunction(const Value *V,
static Optional<AllocFnsTy>
getAllocationDataForFunction(const Function *Callee, AllocType AllocTy,
const TargetLibraryInfo *TLI) {
+ // Don't perform a slow TLI lookup, if this function doesn't return a pointer
+ // and thus can't be an allocation function.
+ if (!Callee->getReturnType()->isPointerTy())
+ return None;
+
// Make sure that the function is available.
LibFunc TLIFn;
if (!TLI || !TLI->getLibFunc(*Callee, TLIFn) || !TLI->has(TLIFn))
@@ -266,15 +271,42 @@ static Optional<AllocFnsTy> getAllocationSize(const Value *V,
return Result;
}
+static AllocFnKind getAllocFnKind(const Value *V) {
+ if (const auto *CB = dyn_cast<CallBase>(V)) {
+ Attribute Attr = CB->getFnAttr(Attribute::AllocKind);
+ if (Attr.isValid())
+ return AllocFnKind(Attr.getValueAsInt());
+ }
+ return AllocFnKind::Unknown;
+}
+
+static AllocFnKind getAllocFnKind(const Function *F) {
+ Attribute Attr = F->getFnAttribute(Attribute::AllocKind);
+ if (Attr.isValid())
+ return AllocFnKind(Attr.getValueAsInt());
+ return AllocFnKind::Unknown;
+}
+
+static bool checkFnAllocKind(const Value *V, AllocFnKind Wanted) {
+ return (getAllocFnKind(V) & Wanted) != AllocFnKind::Unknown;
+}
+
+static bool checkFnAllocKind(const Function *F, AllocFnKind Wanted) {
+ return (getAllocFnKind(F) & Wanted) != AllocFnKind::Unknown;
+}
+
/// Tests if a value is a call or invoke to a library function that
/// allocates or reallocates memory (either malloc, calloc, realloc, or strdup
/// like).
bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI) {
- return getAllocationData(V, AnyAlloc, TLI).has_value();
+ return getAllocationData(V, AnyAlloc, TLI).has_value() ||
+ checkFnAllocKind(V, AllocFnKind::Alloc | AllocFnKind::Realloc);
}
bool llvm::isAllocationFn(
- const Value *V, function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
- return getAllocationData(V, AnyAlloc, GetTLI).has_value();
+ const Value *V,
+ function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
+ return getAllocationData(V, AnyAlloc, GetTLI).has_value() ||
+ checkFnAllocKind(V, AllocFnKind::Alloc | AllocFnKind::Realloc);
}
/// Tests if a value is a call or invoke to a library function that
@@ -304,30 +336,36 @@ bool llvm::isMallocOrCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI)
/// Tests if a value is a call or invoke to a library function that
/// allocates memory (either malloc, calloc, or strdup like).
bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI) {
- return getAllocationData(V, AllocLike, TLI).has_value();
-}
-
-/// Tests if a value is a call or invoke to a library function that
-/// reallocates memory (e.g., realloc).
-bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI) {
- return getAllocationData(V, ReallocLike, TLI).has_value();
+ return getAllocationData(V, AllocLike, TLI).has_value() ||
+ checkFnAllocKind(V, AllocFnKind::Alloc);
}
/// Tests if a function is a library function that reallocates memory
/// (e.g., realloc).
bool llvm::isReallocLikeFn(const Function *F, const TargetLibraryInfo *TLI) {
- return getAllocationDataForFunction(F, ReallocLike, TLI).has_value();
+ return getAllocationDataForFunction(F, ReallocLike, TLI).has_value() ||
+ checkFnAllocKind(F, AllocFnKind::Realloc);
}
-bool llvm::isAllocRemovable(const CallBase *CB, const TargetLibraryInfo *TLI) {
- assert(isAllocationFn(CB, TLI));
+Value *llvm::getReallocatedOperand(const CallBase *CB,
+ const TargetLibraryInfo *TLI) {
+ if (getAllocationData(CB, ReallocLike, TLI).has_value()) {
+ // All currently supported realloc functions reallocate the first argument.
+ return CB->getArgOperand(0);
+ }
+ if (checkFnAllocKind(CB, AllocFnKind::Realloc))
+ return CB->getArgOperandWithAttribute(Attribute::AllocatedPointer);
+ return nullptr;
+}
+bool llvm::isRemovableAlloc(const CallBase *CB, const TargetLibraryInfo *TLI) {
// Note: Removability is highly dependent on the source language. For
// example, recent C++ requires direct calls to the global allocation
// [basic.stc.dynamic.allocation] to be observable unless part of a new
// expression [expr.new paragraph 13].
- // Historically we've treated the C family allocation routines as removable
+ // Historically we've treated the C family allocation routines and operator
+ // new as removable.
return isAllocLikeFn(CB, TLI);
}
@@ -357,9 +395,8 @@ static bool CheckedZextOrTrunc(APInt &I, unsigned IntTyBits) {
}
Optional<APInt>
-llvm::getAllocSize(const CallBase *CB,
- const TargetLibraryInfo *TLI,
- std::function<const Value*(const Value*)> Mapper) {
+llvm::getAllocSize(const CallBase *CB, const TargetLibraryInfo *TLI,
+ function_ref<const Value *(const Value *)> Mapper) {
// Note: This handles both explicitly listed allocation functions and
// allocsize. The code structure could stand to be cleaned up a bit.
Optional<AllocFnsTy> FnData = getAllocationSize(CB, TLI);
@@ -434,6 +471,12 @@ Constant *llvm::getInitialValueOfAllocation(const Value *V,
if (isCallocLikeFn(Alloc, TLI))
return Constant::getNullValue(Ty);
+ AllocFnKind AK = getAllocFnKind(Alloc);
+ if ((AK & AllocFnKind::Uninitialized) != AllocFnKind::Unknown)
+ return UndefValue::get(Ty);
+ if ((AK & AllocFnKind::Zeroed) != AllocFnKind::Unknown)
+ return Constant::getNullValue(Ty);
+
return nullptr;
}
@@ -497,14 +540,23 @@ Optional<StringRef> llvm::getAllocationFamily(const Value *I,
if (Callee == nullptr || IsNoBuiltin)
return None;
LibFunc TLIFn;
- if (!TLI || !TLI->getLibFunc(*Callee, TLIFn) || !TLI->has(TLIFn))
- return None;
- const auto AllocData = getAllocationDataForFunction(Callee, AnyAlloc, TLI);
- if (AllocData)
- return mangledNameForMallocFamily(AllocData.value().Family);
- const auto FreeData = getFreeFunctionDataForFunction(Callee, TLIFn);
- if (FreeData)
- return mangledNameForMallocFamily(FreeData.value().Family);
+
+ if (TLI && TLI->getLibFunc(*Callee, TLIFn) && TLI->has(TLIFn)) {
+ // Callee is some known library function.
+ const auto AllocData = getAllocationDataForFunction(Callee, AnyAlloc, TLI);
+ if (AllocData)
+ return mangledNameForMallocFamily(AllocData.value().Family);
+ const auto FreeData = getFreeFunctionDataForFunction(Callee, TLIFn);
+ if (FreeData)
+ return mangledNameForMallocFamily(FreeData.value().Family);
+ }
+ // Callee isn't a known library function; still check the attributes.
+ if (checkFnAllocKind(I, AllocFnKind::Free | AllocFnKind::Alloc |
+ AllocFnKind::Realloc)) {
+ Attribute Attr = cast<CallBase>(I)->getFnAttr("alloc-family");
+ if (Attr.isValid())
+ return Attr.getValueAsString();
+ }
return None;
}
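
With the rewritten lookup, a call whose callee is unknown to TLI can still report a family via the "alloc-family" string attribute. A minimal usage sketch, assuming only the declarations from MemoryBuiltins.h; the wrapper name is invented.

#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Hypothetical helper: report the family of an allocation/free call, whether
// it comes from the TLI table or from an "alloc-family" attribute.
static void printAllocFamily(const CallBase *CB, const TargetLibraryInfo *TLI) {
  if (Optional<StringRef> Family = getAllocationFamily(CB, TLI))
    errs() << "allocation family: " << *Family << "\n";
  else
    errs() << "no known allocation family\n";
}
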
@@ -512,7 +564,7 @@ Optional<StringRef> llvm::getAllocationFamily(const Value *I,
bool llvm::isLibFreeFunction(const Function *F, const LibFunc TLIFn) {
Optional<FreeFnsTy> FnData = getFreeFunctionDataForFunction(F, TLIFn);
if (!FnData)
- return false;
+ return checkFnAllocKind(F, AllocFnKind::Free);
// Check free prototype.
// FIXME: workaround for PR5130, this will be obsolete when a nobuiltin
@@ -528,20 +580,24 @@ bool llvm::isLibFreeFunction(const Function *F, const LibFunc TLIFn) {
return true;
}
-/// isFreeCall - Returns non-null if the value is a call to the builtin free()
-const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
+Value *llvm::getFreedOperand(const CallBase *CB, const TargetLibraryInfo *TLI) {
bool IsNoBuiltinCall;
- const Function *Callee = getCalledFunction(I, IsNoBuiltinCall);
+ const Function *Callee = getCalledFunction(CB, IsNoBuiltinCall);
if (Callee == nullptr || IsNoBuiltinCall)
return nullptr;
LibFunc TLIFn;
- if (!TLI || !TLI->getLibFunc(*Callee, TLIFn) || !TLI->has(TLIFn))
- return nullptr;
+ if (TLI && TLI->getLibFunc(*Callee, TLIFn) && TLI->has(TLIFn) &&
+ isLibFreeFunction(Callee, TLIFn)) {
+ // All currently supported free functions free the first argument.
+ return CB->getArgOperand(0);
+ }
- return isLibFreeFunction(Callee, TLIFn) ? dyn_cast<CallInst>(I) : nullptr;
-}
+ if (checkFnAllocKind(CB, AllocFnKind::Free))
+ return CB->getArgOperandWithAttribute(Attribute::AllocatedPointer);
+ return nullptr;
+}
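
Call sites that previously used isFreeCall() now ask for the freed operand directly; the MemoryDependenceAnalysis hunk below does exactly this. A minimal sketch of the new pattern, assuming only the declarations from MemoryBuiltins.h; the wrapper name is invented.

#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical wrapper replacing the old isFreeCall() pattern.
static Value *findFreedPointer(Instruction *I, const TargetLibraryInfo &TLI) {
  if (auto *CB = dyn_cast<CallBase>(I))
    // Argument 0 for known library free functions, or the operand carrying
    // the allocated-pointer attribute for callees tagged allockind("free").
    return getFreedOperand(CB, &TLI);
  return nullptr;
}
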
//===----------------------------------------------------------------------===//
// Utility functions to compute size of objects.
@@ -765,8 +821,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
}
SizeOffsetType ObjectSizeOffsetVisitor::visitCallBase(CallBase &CB) {
- auto Mapper = [](const Value *V) { return V; };
- if (Optional<APInt> Size = getAllocSize(&CB, TLI, Mapper))
+ if (Optional<APInt> Size = getAllocSize(&CB, TLI))
return std::make_pair(*Size, Zero);
return unknown();
}
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index 690d575ef979..fce9d5b24faf 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -139,10 +139,12 @@ static ModRefInfo GetLocation(const Instruction *Inst, MemoryLocation &Loc,
return ModRefInfo::ModRef;
}
- if (const CallInst *CI = isFreeCall(Inst, &TLI)) {
- // calls to free() deallocate the entire structure
- Loc = MemoryLocation::getAfter(CI->getArgOperand(0));
- return ModRefInfo::Mod;
+ if (const CallBase *CB = dyn_cast<CallBase>(Inst)) {
+ if (Value *FreedOp = getFreedOperand(CB, &TLI)) {
+ // calls to free() deallocate the entire structure
+ Loc = MemoryLocation::getAfter(FreedOp);
+ return ModRefInfo::Mod;
+ }
}
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp
new file mode 100644
index 000000000000..3d11cb81226e
--- /dev/null
+++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp
@@ -0,0 +1,226 @@
+//===-- MemoryProfileInfo.cpp - memory profile info ------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains utilities to analyze memory profile information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/MemoryProfileInfo.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+using namespace llvm::memprof;
+
+#define DEBUG_TYPE "memory-profile-info"
+
+// Upper bound on accesses per byte for marking an allocation cold.
+cl::opt<float> MemProfAccessesPerByteColdThreshold(
+ "memprof-accesses-per-byte-cold-threshold", cl::init(10.0), cl::Hidden,
+ cl::desc("The threshold the accesses per byte must be under to consider "
+ "an allocation cold"));
+
+// Lower bound on lifetime to mark an allocation cold (in addition to accesses
+// per byte above). This is to avoid pessimizing short lived objects.
+cl::opt<unsigned> MemProfMinLifetimeColdThreshold(
+ "memprof-min-lifetime-cold-threshold", cl::init(200), cl::Hidden,
+ cl::desc("The minimum lifetime (s) for an allocation to be considered "
+ "cold"));
+
+AllocationType llvm::memprof::getAllocType(uint64_t MaxAccessCount,
+ uint64_t MinSize,
+ uint64_t MinLifetime) {
+ if (((float)MaxAccessCount) / MinSize < MemProfAccessesPerByteColdThreshold &&
+ // MinLifetime is expected to be in ms, so convert the threshold to ms.
+ MinLifetime >= MemProfMinLifetimeColdThreshold * 1000)
+ return AllocationType::Cold;
+ return AllocationType::NotCold;
+}
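
Concretely, with the default thresholds (10.0 accesses per byte, 200 seconds of lifetime), an allocation is only marked cold when both conditions hold. A standalone sketch with invented numbers; note that MinLifetime is in milliseconds while the threshold option is in seconds.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t MaxAccessCount = 1000, MinSize = 4096, MinLifetime = 250000; // 250 s
  bool Cold = (float)MaxAccessCount / MinSize < 10.0f && // ~0.24 accesses/byte
              MinLifetime >= 200u * 1000;                // at least 200 seconds
  std::printf("allocation is %s\n", Cold ? "cold" : "notcold");
  return 0;
}
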
+
+MDNode *llvm::memprof::buildCallstackMetadata(ArrayRef<uint64_t> CallStack,
+ LLVMContext &Ctx) {
+ std::vector<Metadata *> StackVals;
+ for (auto Id : CallStack) {
+ auto *StackValMD =
+ ValueAsMetadata::get(ConstantInt::get(Type::getInt64Ty(Ctx), Id));
+ StackVals.push_back(StackValMD);
+ }
+ return MDNode::get(Ctx, StackVals);
+}
+
+MDNode *llvm::memprof::getMIBStackNode(const MDNode *MIB) {
+ assert(MIB->getNumOperands() == 2);
+ // The stack metadata is the first operand of each memprof MIB metadata.
+ return cast<MDNode>(MIB->getOperand(0));
+}
+
+AllocationType llvm::memprof::getMIBAllocType(const MDNode *MIB) {
+ assert(MIB->getNumOperands() == 2);
+ // The allocation type is currently the second operand of each memprof
+ // MIB metadata. This will need to change as we add additional allocation
+ // types that can be applied based on the allocation profile data.
+ auto *MDS = dyn_cast<MDString>(MIB->getOperand(1));
+ assert(MDS);
+ if (MDS->getString().equals("cold"))
+ return AllocationType::Cold;
+ return AllocationType::NotCold;
+}
+
+static std::string getAllocTypeAttributeString(AllocationType Type) {
+ switch (Type) {
+ case AllocationType::NotCold:
+ return "notcold";
+ break;
+ case AllocationType::Cold:
+ return "cold";
+ break;
+ default:
+ assert(false && "Unexpected alloc type");
+ }
+ llvm_unreachable("invalid alloc type");
+}
+
+static void addAllocTypeAttribute(LLVMContext &Ctx, CallBase *CI,
+ AllocationType AllocType) {
+ auto AllocTypeString = getAllocTypeAttributeString(AllocType);
+ auto A = llvm::Attribute::get(Ctx, "memprof", AllocTypeString);
+ CI->addFnAttr(A);
+}
+
+static bool hasSingleAllocType(uint8_t AllocTypes) {
+ const unsigned NumAllocTypes = countPopulation(AllocTypes);
+ assert(NumAllocTypes != 0);
+ return NumAllocTypes == 1;
+}
+
+void CallStackTrie::addCallStack(AllocationType AllocType,
+ ArrayRef<uint64_t> StackIds) {
+ bool First = true;
+ CallStackTrieNode *Curr = nullptr;
+ for (auto StackId : StackIds) {
+ // If this is the first stack frame, add or update alloc node.
+ if (First) {
+ First = false;
+ if (Alloc) {
+ assert(AllocStackId == StackId);
+ Alloc->AllocTypes |= static_cast<uint8_t>(AllocType);
+ } else {
+ AllocStackId = StackId;
+ Alloc = new CallStackTrieNode(AllocType);
+ }
+ Curr = Alloc;
+ continue;
+ }
+ // Update existing caller node if it exists.
+ auto Next = Curr->Callers.find(StackId);
+ if (Next != Curr->Callers.end()) {
+ Curr = Next->second;
+ Curr->AllocTypes |= static_cast<uint8_t>(AllocType);
+ continue;
+ }
+ // Otherwise add a new caller node.
+ auto *New = new CallStackTrieNode(AllocType);
+ Curr->Callers[StackId] = New;
+ Curr = New;
+ }
+ assert(Curr);
+}
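
For example, two contexts that share the allocation frame and its immediate caller end up as a single branch point in the trie. A sketch assuming the declarations from llvm/Analysis/MemoryProfileInfo.h; the stack IDs are invented and ordered leaf (allocation site) first, matching addCallStack.

#include "llvm/Analysis/MemoryProfileInfo.h"
using namespace llvm::memprof;

static void buildExampleTrie() {
  CallStackTrie Trie;
  // Context A: alloc frame 1, called from 2, called from 3 -> cold.
  Trie.addCallStack(AllocationType::Cold, {1, 2, 3});
  // Context B: same alloc frame and immediate caller, different outer caller.
  Trie.addCallStack(AllocationType::NotCold, {1, 2, 4});
  // Resulting trie: node 1 (Cold|NotCold) -> node 2 (Cold|NotCold)
  //                 -> node 3 (Cold) and node 4 (NotCold).
}
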
+
+void CallStackTrie::addCallStack(MDNode *MIB) {
+ MDNode *StackMD = getMIBStackNode(MIB);
+ assert(StackMD);
+ std::vector<uint64_t> CallStack;
+ CallStack.reserve(StackMD->getNumOperands());
+ for (auto &MIBStackIter : StackMD->operands()) {
+ auto *StackId = mdconst::dyn_extract<ConstantInt>(MIBStackIter);
+ assert(StackId);
+ CallStack.push_back(StackId->getZExtValue());
+ }
+ addCallStack(getMIBAllocType(MIB), CallStack);
+}
+
+static MDNode *createMIBNode(LLVMContext &Ctx,
+ std::vector<uint64_t> &MIBCallStack,
+ AllocationType AllocType) {
+ std::vector<Metadata *> MIBPayload(
+ {buildCallstackMetadata(MIBCallStack, Ctx)});
+ MIBPayload.push_back(
+ MDString::get(Ctx, getAllocTypeAttributeString(AllocType)));
+ return MDNode::get(Ctx, MIBPayload);
+}
+
+// Recursive helper to trim contexts and create metadata nodes.
+// Caller should have pushed Node's loc to MIBCallStack. Doing this in the
+// caller makes it simpler to handle the many early returns in this method.
+bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
+ std::vector<uint64_t> &MIBCallStack,
+ std::vector<Metadata *> &MIBNodes,
+ bool CalleeHasAmbiguousCallerContext) {
+ // Trim context below the first node in a prefix with a single alloc type.
+ // Add an MIB record for the current call stack prefix.
+ if (hasSingleAllocType(Node->AllocTypes)) {
+ MIBNodes.push_back(
+ createMIBNode(Ctx, MIBCallStack, (AllocationType)Node->AllocTypes));
+ return true;
+ }
+
+ // We don't have a single allocation for all the contexts sharing this prefix,
+ // so recursively descend into callers in trie.
+ if (!Node->Callers.empty()) {
+ bool NodeHasAmbiguousCallerContext = Node->Callers.size() > 1;
+ bool AddedMIBNodesForAllCallerContexts = true;
+ for (auto &Caller : Node->Callers) {
+ MIBCallStack.push_back(Caller.first);
+ AddedMIBNodesForAllCallerContexts &=
+ buildMIBNodes(Caller.second, Ctx, MIBCallStack, MIBNodes,
+ NodeHasAmbiguousCallerContext);
+ // Remove Caller.
+ MIBCallStack.pop_back();
+ }
+ if (AddedMIBNodesForAllCallerContexts)
+ return true;
+ // We expect that the callers should be forced to add MIBs to disambiguate
+ // the context in this case (see below).
+ assert(!NodeHasAmbiguousCallerContext);
+ }
+
+ // If we reached here, then this node does not have a single allocation type,
+ // and we didn't add metadata for a longer call stack prefix including any of
+ // Node's callers. That means we never hit a single allocation type along all
+ // call stacks with this prefix. This can happen due to recursion collapsing
+ // or the stack being deeper than tracked by the profiler runtime, leading to
+ // contexts with different allocation types being merged. In that case, we
+ // trim the context just below the deepest context split, which is this
+ // node if the callee has an ambiguous caller context (multiple callers),
+ // since the recursive calls above returned false. Conservatively give it
+ // non-cold allocation type.
+ if (!CalleeHasAmbiguousCallerContext)
+ return false;
+ MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold));
+ return true;
+}
+
+// Build and attach the minimal necessary MIB metadata. If the alloc has a
+// single allocation type, add a function attribute instead. Returns true if
+// memprof metadata was attached, false if not (attribute added).
+bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
+ auto &Ctx = CI->getContext();
+ if (hasSingleAllocType(Alloc->AllocTypes)) {
+ addAllocTypeAttribute(Ctx, CI, (AllocationType)Alloc->AllocTypes);
+ return false;
+ }
+ std::vector<uint64_t> MIBCallStack;
+ MIBCallStack.push_back(AllocStackId);
+ std::vector<Metadata *> MIBNodes;
+ assert(!Alloc->Callers.empty() && "addCallStack has not been called yet");
+ buildMIBNodes(Alloc, Ctx, MIBCallStack, MIBNodes,
+ /*CalleeHasAmbiguousCallerContext=*/true);
+ assert(MIBCallStack.size() == 1 &&
+ "Should only be left with Alloc's location in stack");
+ CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes));
+ return true;
+}
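
End to end, the trie either collapses to a single "memprof" function attribute or emits one MIB node per surviving context. A sketch under the same assumptions as above, building a throwaway call with IRBuilder just to have something to annotate; it is illustrative, not how the pass pipeline drives this code.

#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
using namespace llvm;
using namespace llvm::memprof;

int main() {
  LLVMContext Ctx;
  Module M("m", Ctx);
  IRBuilder<> B(Ctx);
  // A stand-in allocation call: declare i8* @malloc(i64) and call it.
  FunctionType *MallocTy =
      FunctionType::get(B.getInt8PtrTy(), {B.getInt64Ty()}, false);
  FunctionCallee Malloc = M.getOrInsertFunction("malloc", MallocTy);
  Function *F = Function::Create(FunctionType::get(B.getVoidTy(), false),
                                 GlobalValue::ExternalLinkage, "f", M);
  BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
  B.SetInsertPoint(BB);
  CallInst *CI = B.CreateCall(Malloc, {B.getInt64(8)});
  B.CreateRetVoid();

  CallStackTrie Trie;
  Trie.addCallStack(AllocationType::Cold, {1, 2, 3});
  Trie.addCallStack(AllocationType::NotCold, {1, 2, 4});
  // Two distinct contexts survive, so !memprof metadata with two MIB nodes is
  // attached and this returns true; with a single allocation type it would
  // instead add a "memprof" function attribute and return false.
  bool AttachedMD = Trie.buildAndAttachMIBMetadata(CI);
  (void)AttachedMD;
}
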
diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp
index eb75118210b9..9ad60f774e9f 100644
--- a/llvm/lib/Analysis/MemorySSAUpdater.cpp
+++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp
@@ -451,7 +451,7 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) {
}
// Existing Phi blocks may need renaming too, if an access was previously
// optimized and the inserted Def "covers" the Optimized value.
- for (auto &MP : ExistingPhis) {
+ for (const auto &MP : ExistingPhis) {
MemoryPhi *Phi = dyn_cast_or_null<MemoryPhi>(MP);
if (Phi)
MSSA->renamePass(Phi->getBlock(), nullptr, Visited);
@@ -462,7 +462,7 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) {
void MemorySSAUpdater::fixupDefs(const SmallVectorImpl<WeakVH> &Vars) {
SmallPtrSet<const BasicBlock *, 8> Seen;
SmallVector<const BasicBlock *, 16> Worklist;
- for (auto &Var : Vars) {
+ for (const auto &Var : Vars) {
MemoryAccess *NewDef = dyn_cast_or_null<MemoryAccess>(Var);
if (!NewDef)
continue;
@@ -744,10 +744,10 @@ void MemorySSAUpdater::updateForClonedLoop(const LoopBlocksRPO &LoopBlocks,
cloneUsesAndDefs(BB, NewBlock, VMap, MPhiMap);
};
- for (auto BB : llvm::concat<BasicBlock *const>(LoopBlocks, ExitBlocks))
+ for (auto *BB : llvm::concat<BasicBlock *const>(LoopBlocks, ExitBlocks))
ProcessBlock(BB);
- for (auto BB : llvm::concat<BasicBlock *const>(LoopBlocks, ExitBlocks))
+ for (auto *BB : llvm::concat<BasicBlock *const>(LoopBlocks, ExitBlocks))
if (MemoryPhi *MPhi = MSSA->getMemoryAccess(BB))
if (MemoryAccess *NewPhi = MPhiMap.lookup(MPhi))
FixPhiIncomingValues(MPhi, cast<MemoryPhi>(NewPhi));
@@ -811,7 +811,7 @@ void MemorySSAUpdater::applyUpdates(ArrayRef<CFGUpdate> Updates,
SmallVector<CFGUpdate, 4> DeleteUpdates;
SmallVector<CFGUpdate, 4> RevDeleteUpdates;
SmallVector<CFGUpdate, 4> InsertUpdates;
- for (auto &Update : Updates) {
+ for (const auto &Update : Updates) {
if (Update.getKind() == DT.Insert)
InsertUpdates.push_back({DT.Insert, Update.getFrom(), Update.getTo()});
else {
@@ -958,7 +958,7 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef<CFGUpdate> Updates,
};
SmallDenseMap<BasicBlock *, PredInfo> PredMap;
- for (auto &Edge : Updates) {
+ for (const auto &Edge : Updates) {
BasicBlock *BB = Edge.getTo();
auto &AddedBlockSet = PredMap[BB].Added;
AddedBlockSet.insert(Edge.getFrom());
@@ -1003,7 +1003,7 @@ void MemorySSAUpdater::applyInsertUpdates(ArrayRef<CFGUpdate> Updates,
// First create MemoryPhis in all blocks that don't have one. Create in the
// order found in Updates, not in PredMap, to get deterministic numbering.
- for (auto &Edge : Updates) {
+ for (const auto &Edge : Updates) {
BasicBlock *BB = Edge.getTo();
if (PredMap.count(BB) && !MSSA->getMemoryAccess(BB))
InsertedPhis.push_back(MSSA->createMemoryPhi(BB));
@@ -1400,7 +1400,7 @@ void MemorySSAUpdater::removeBlocks(
}
void MemorySSAUpdater::tryRemoveTrivialPhis(ArrayRef<WeakVH> UpdatedPHIs) {
- for (auto &VH : UpdatedPHIs)
+ for (const auto &VH : UpdatedPHIs)
if (auto *MPhi = cast_or_null<MemoryPhi>(VH))
tryRemoveTrivialPhi(MPhi);
}
diff --git a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
index 373aaa48b1d1..756f92e1aac4 100644
--- a/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
+++ b/llvm/lib/Analysis/ModuleDebugInfoPrinter.cpp
@@ -95,7 +95,7 @@ static void printModuleDebugInfo(raw_ostream &O, const Module *M,
O << '\n';
}
- for (auto GVU : Finder.global_variables()) {
+ for (auto *GVU : Finder.global_variables()) {
const auto *GV = GVU->getVariable();
O << "Global variable: " << GV->getName();
printFile(O, GV->getFilename(), GV->getDirectory(), GV->getLine());
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index 2b98634ef7bf..c52b27a38fe9 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -400,7 +400,7 @@ static void computeFunctionSummary(
// to enable importing for subsequent indirect call promotion and
// inlining.
if (auto *MD = I.getMetadata(LLVMContext::MD_callees)) {
- for (auto &Op : MD->operands()) {
+ for (const auto &Op : MD->operands()) {
Function *Callee = mdconst::extract_or_null<Function>(Op);
if (Callee)
CallGraphEdges[Index.getOrInsertValueInfo(Callee)];
@@ -412,7 +412,7 @@ static void computeFunctionSummary(
auto CandidateProfileData =
ICallAnalysis.getPromotionCandidatesForInstruction(
&I, NumVals, TotalCount, NumCandidates);
- for (auto &Candidate : CandidateProfileData)
+ for (const auto &Candidate : CandidateProfileData)
CallGraphEdges[Index.getOrInsertValueInfo(Candidate.Value)]
.updateHotness(getHotness(Candidate.Count, PSI));
}
@@ -451,7 +451,7 @@ static void computeFunctionSummary(
// If both load and store instructions reference the same variable,
// we won't be able to optimize it. Add all such reference edges
// to RefEdges set.
- for (auto &VI : StoreRefEdges)
+ for (const auto &VI : StoreRefEdges)
if (LoadRefEdges.remove(VI))
RefEdges.insert(VI);
@@ -459,11 +459,11 @@ static void computeFunctionSummary(
// All new reference edges inserted in the two loops below are either
// read-only or write-only. They will be grouped at the end of the RefEdges
// vector, so we can use a single integer value to identify them.
- for (auto &VI : LoadRefEdges)
+ for (const auto &VI : LoadRefEdges)
RefEdges.insert(VI);
unsigned FirstWORef = RefEdges.size();
- for (auto &VI : StoreRefEdges)
+ for (const auto &VI : StoreRefEdges)
RefEdges.insert(VI);
Refs = RefEdges.takeVector();
@@ -646,15 +646,18 @@ static void computeVariableSummary(ModuleSummaryIndex &Index,
Index.addGlobalValueSummary(V, std::move(GVarSummary));
}
-static void
-computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A,
- DenseSet<GlobalValue::GUID> &CantBePromoted) {
+static void computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A,
+ DenseSet<GlobalValue::GUID> &CantBePromoted) {
+ // Skip the summary for indirect function aliases, as the summary for the
+ // aliasee will not be emitted.
+ const GlobalObject *Aliasee = A.getAliaseeObject();
+ if (isa<GlobalIFunc>(Aliasee))
+ return;
bool NonRenamableLocal = isNonRenamableLocal(A);
GlobalValueSummary::GVFlags Flags(
A.getLinkage(), A.getVisibility(), NonRenamableLocal,
/* Live = */ false, A.isDSOLocal(), A.canBeOmittedFromSymbolTable());
auto AS = std::make_unique<AliasSummary>(Flags);
- auto *Aliasee = A.getAliaseeObject();
auto AliaseeVI = Index.getValueInfo(Aliasee->getGUID());
assert(AliaseeVI && "Alias expects aliasee summary to be available");
assert(AliaseeVI.getSummaryList().size() == 1 &&
@@ -668,7 +671,7 @@ computeAliasSummary(ModuleSummaryIndex &Index, const GlobalAlias &A,
// Set LiveRoot flag on entries matching the given value name.
static void setLiveRoot(ModuleSummaryIndex &Index, StringRef Name) {
if (ValueInfo VI = Index.getValueInfo(GlobalValue::getGUID(Name)))
- for (auto &Summary : VI.getSummaryList())
+ for (const auto &Summary : VI.getSummaryList())
Summary->setLive(true);
}
@@ -776,7 +779,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
// Compute summaries for all functions defined in module, and save in the
// index.
- for (auto &F : M) {
+ for (const auto &F : M) {
if (F.isDeclaration())
continue;
@@ -811,6 +814,13 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
for (const GlobalAlias &A : M.aliases())
computeAliasSummary(Index, A, CantBePromoted);
+ // Iterate through ifuncs and mark everything along their resolver paths as live.
+ for (const GlobalIFunc &I : M.ifuncs()) {
+ I.applyAlongResolverPath([&Index](const GlobalValue &GV) {
+ Index.getGlobalValueSummary(GV)->setLive(true);
+ });
+ }
+
for (auto *V : LocalsUsed) {
auto *Summary = Index.getGlobalValueSummary(*V);
assert(Summary && "Missing summary for global value");
diff --git a/llvm/lib/Analysis/MustExecute.cpp b/llvm/lib/Analysis/MustExecute.cpp
index ad8322d7bd79..ac6590c1d8a2 100644
--- a/llvm/lib/Analysis/MustExecute.cpp
+++ b/llvm/lib/Analysis/MustExecute.cpp
@@ -81,7 +81,7 @@ void ICFLoopSafetyInfo::computeLoopSafetyInfo(const Loop *CurLoop) {
MW.clear();
MayThrow = false;
// Figure out the fact that at least one block may throw.
- for (auto &BB : CurLoop->blocks())
+ for (const auto &BB : CurLoop->blocks())
if (ICF.hasICF(&*BB)) {
MayThrow = true;
break;
@@ -164,7 +164,7 @@ static void collectTransitivePredecessors(
if (BB == CurLoop->getHeader())
return;
SmallVector<const BasicBlock *, 4> WorkList;
- for (auto *Pred : predecessors(BB)) {
+ for (const auto *Pred : predecessors(BB)) {
Predecessors.insert(Pred);
WorkList.push_back(Pred);
}
@@ -180,7 +180,7 @@ static void collectTransitivePredecessors(
// @nested and @nested_no_throw in test/Analysis/MustExecute/loop-header.ll.
// We can ignore the backedges of all loops containing BB to get a slightly more
// optimistic result.
- for (auto *PredPred : predecessors(Pred))
+ for (const auto *PredPred : predecessors(Pred))
if (Predecessors.insert(PredPred).second)
WorkList.push_back(PredPred);
}
@@ -207,7 +207,7 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop,
// 3) Exit blocks which are not taken on 1st iteration.
// Memoize blocks we've already checked.
SmallPtrSet<const BasicBlock *, 4> CheckedSuccessors;
- for (auto *Pred : Predecessors) {
+ for (const auto *Pred : Predecessors) {
// Predecessor block may throw, so it has a side exit.
if (blockMayThrow(Pred))
return false;
@@ -217,7 +217,7 @@ bool LoopSafetyInfo::allLoopPathsLeadToBlock(const Loop *CurLoop,
if (DT->dominates(BB, Pred))
continue;
- for (auto *Succ : successors(Pred))
+ for (const auto *Succ : successors(Pred))
if (CheckedSuccessors.insert(Succ).second &&
Succ != BB && !Predecessors.count(Succ))
// By discharging conditions that are not executed on the 1st iteration,
@@ -285,7 +285,7 @@ bool ICFLoopSafetyInfo::doesNotWriteMemoryBefore(const BasicBlock *BB,
collectTransitivePredecessors(CurLoop, BB, Predecessors);
// Check whether any instruction in any of the predecessors could write
// to memory.
- for (auto *Pred : Predecessors)
+ for (const auto *Pred : Predecessors)
if (MW.mayWriteToMemory(Pred))
return false;
return true;
@@ -413,7 +413,7 @@ class MustExecuteAnnotatedWriter : public AssemblyAnnotationWriter {
public:
MustExecuteAnnotatedWriter(const Function &F,
DominatorTree &DT, LoopInfo &LI) {
- for (auto &I: instructions(F)) {
+ for (const auto &I: instructions(F)) {
Loop *L = LI.getLoopFor(I.getParent());
while (L) {
if (isMustExecuteIn(I, L, &DT)) {
@@ -425,8 +425,8 @@ public:
}
MustExecuteAnnotatedWriter(const Module &M,
DominatorTree &DT, LoopInfo &LI) {
- for (auto &F : M)
- for (auto &I: instructions(F)) {
+ for (const auto &F : M)
+ for (const auto &I: instructions(F)) {
Loop *L = LI.getLoopFor(I.getParent());
while (L) {
if (isMustExecuteIn(I, L, &DT)) {
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index f61806bd1dad..d46248aa3889 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -1158,7 +1158,7 @@ const SCEV *ScalarEvolution::getLosslessPtrToIntExpr(const SCEV *Op,
const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
SmallVector<const SCEV *, 2> Operands;
bool Changed = false;
- for (auto *Op : Expr->operands()) {
+ for (const auto *Op : Expr->operands()) {
Operands.push_back(visit(Op));
Changed |= Op != Operands.back();
}
@@ -1168,7 +1168,7 @@ const SCEV *ScalarEvolution::getLosslessPtrToIntExpr(const SCEV *Op,
const SCEV *visitMulExpr(const SCEVMulExpr *Expr) {
SmallVector<const SCEV *, 2> Operands;
bool Changed = false;
- for (auto *Op : Expr->operands()) {
+ for (const auto *Op : Expr->operands()) {
Operands.push_back(visit(Op));
Changed |= Op != Operands.back();
}
@@ -4662,7 +4662,7 @@ ScalarEvolution::getUMinFromMismatchedTypes(SmallVectorImpl<const SCEV *> &Ops,
// Find the max type first.
Type *MaxType = nullptr;
- for (auto *S : Ops)
+ for (const auto *S : Ops)
if (MaxType)
MaxType = getWiderType(MaxType, S->getType());
else
@@ -4671,7 +4671,7 @@ ScalarEvolution::getUMinFromMismatchedTypes(SmallVectorImpl<const SCEV *> &Ops,
// Extend all ops to max type.
SmallVector<const SCEV *, 2> PromotedOps;
- for (auto *S : Ops)
+ for (const auto *S : Ops)
PromotedOps.push_back(getNoopOrZeroExtend(S, MaxType));
// Generate umin.
@@ -6636,7 +6636,7 @@ ScalarEvolution::getRangeRef(const SCEV *S,
// Make sure that we do not run over cycled Phis.
if (PendingPhiRanges.insert(Phi).second) {
ConstantRange RangeFromOps(BitWidth, /*isFullSet=*/false);
- for (auto &Op : Phi->operands()) {
+ for (const auto &Op : Phi->operands()) {
auto OpRange = getRangeRef(getSCEV(Op), SignHint);
RangeFromOps = RangeFromOps.unionWith(OpRange);
// No point to continue if we already have a full set.
@@ -6651,6 +6651,13 @@ ScalarEvolution::getRangeRef(const SCEV *S,
}
}
+ // vscale can't be equal to zero
+ if (const auto *II = dyn_cast<IntrinsicInst>(U->getValue()))
+ if (II->getIntrinsicID() == Intrinsic::vscale) {
+ ConstantRange Disallowed = APInt::getZero(BitWidth);
+ ConservativeResult = ConservativeResult.difference(Disallowed);
+ }
+
return setRange(U, SignHint, std::move(ConservativeResult));
}
@@ -6973,13 +6980,13 @@ static void collectUniqueOps(const SCEV *S,
Ops.push_back(S);
};
if (auto *S2 = dyn_cast<SCEVCastExpr>(S))
- for (auto *Op : S2->operands())
+ for (const auto *Op : S2->operands())
InsertUnique(Op);
else if (auto *S2 = dyn_cast<SCEVNAryExpr>(S))
- for (auto *Op : S2->operands())
+ for (const auto *Op : S2->operands())
InsertUnique(Op);
else if (auto *S2 = dyn_cast<SCEVUDivExpr>(S))
- for (auto *Op : S2->operands())
+ for (const auto *Op : S2->operands())
InsertUnique(Op);
}
@@ -7001,7 +7008,7 @@ ScalarEvolution::getDefiningScopeBound(ArrayRef<const SCEV *> Ops,
Worklist.push_back(S);
};
- for (auto *S : Ops)
+ for (const auto *S : Ops)
pushOp(S);
const Instruction *Bound = nullptr;
@@ -7013,7 +7020,7 @@ ScalarEvolution::getDefiningScopeBound(ArrayRef<const SCEV *> Ops,
} else {
SmallVector<const SCEV *, 4> Ops;
collectUniqueOps(S, Ops);
- for (auto *Op : Ops)
+ for (const auto *Op : Ops)
pushOp(Op);
}
}
@@ -7117,7 +7124,7 @@ bool ScalarEvolution::isAddRecNeverPoison(const Instruction *I, const Loop *L) {
while (!PoisonStack.empty() && !LatchControlDependentOnPoison) {
const Instruction *Poison = PoisonStack.pop_back_val();
- for (auto *PoisonUser : Poison->users()) {
+ for (const auto *PoisonUser : Poison->users()) {
if (propagatesPoison(cast<Operator>(PoisonUser))) {
if (Pushed.insert(cast<Instruction>(PoisonUser)).second)
PoisonStack.push_back(cast<Instruction>(PoisonUser));
@@ -7242,7 +7249,7 @@ ScalarEvolution::getOperandsToCreate(Value *V, SmallVectorImpl<Value *> &Ops) {
Operator *U = cast<Operator>(V);
if (auto BO = MatchBinaryOp(U, DT)) {
bool IsConstArg = isa<ConstantInt>(BO->RHS);
- switch (U->getOpcode()) {
+ switch (BO->Opcode) {
case Instruction::Add: {
// For additions and multiplications, traverse add/mul chains for which we
// can potentially create a single SCEV, to reduce the number of
@@ -7284,7 +7291,10 @@ ScalarEvolution::getOperandsToCreate(Value *V, SmallVectorImpl<Value *> &Ops) {
} while (true);
return nullptr;
}
-
+ case Instruction::Sub:
+ case Instruction::UDiv:
+ case Instruction::URem:
+ break;
case Instruction::AShr:
case Instruction::Shl:
case Instruction::Xor:
@@ -7296,7 +7306,10 @@ ScalarEvolution::getOperandsToCreate(Value *V, SmallVectorImpl<Value *> &Ops) {
if (!IsConstArg && BO->LHS->getType()->isIntegerTy(1))
return nullptr;
break;
+ case Instruction::LShr:
+ return getUnknown(V);
default:
+ llvm_unreachable("Unhandled binop");
break;
}
@@ -7340,12 +7353,34 @@ ScalarEvolution::getOperandsToCreate(Value *V, SmallVectorImpl<Value *> &Ops) {
// Keep constructing SCEVs for phis recursively for now.
return nullptr;
- case Instruction::Select:
+ case Instruction::Select: {
+ // Check if U is a select that can be simplified to a SCEVUnknown.
+ auto CanSimplifyToUnknown = [this, U]() {
+ if (U->getType()->isIntegerTy(1) || isa<ConstantInt>(U->getOperand(0)))
+ return false;
+
+ auto *ICI = dyn_cast<ICmpInst>(U->getOperand(0));
+ if (!ICI)
+ return false;
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+ if (ICI->getPredicate() == CmpInst::ICMP_EQ ||
+ ICI->getPredicate() == CmpInst::ICMP_NE) {
+ if (!(isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isZero()))
+ return true;
+ } else if (getTypeSizeInBits(LHS->getType()) >
+ getTypeSizeInBits(U->getType()))
+ return true;
+ return false;
+ };
+ if (CanSimplifyToUnknown())
+ return getUnknown(U);
+
for (Value *Inc : U->operands())
Ops.push_back(Inc);
return nullptr;
break;
-
+ }
case Instruction::Call:
case Instruction::Invoke:
if (Value *RV = cast<CallBase>(U)->getReturnedArgOperand()) {
@@ -8338,7 +8373,7 @@ ScalarEvolution::BackedgeTakenInfo::getExact(const Loop *L, ScalarEvolution *SE,
// All exiting blocks we have gathered dominate the loop's latch, so the
// exact trip count is simply the minimum of all these calculated exit counts.
SmallVector<const SCEV *, 2> Ops;
- for (auto &ENT : ExitNotTaken) {
+ for (const auto &ENT : ExitNotTaken) {
const SCEV *BECount = ENT.ExactNotTaken;
assert(BECount != SE->getCouldNotCompute() && "Bad exit SCEV!");
assert(SE->DT.dominates(ENT.ExitingBlock, Latch) &&
@@ -8348,7 +8383,7 @@ ScalarEvolution::BackedgeTakenInfo::getExact(const Loop *L, ScalarEvolution *SE,
Ops.push_back(BECount);
if (Preds)
- for (auto *P : ENT.Predicates)
+ for (const auto *P : ENT.Predicates)
Preds->push_back(P);
assert((Preds || ENT.hasAlwaysTruePredicate()) &&
@@ -8365,7 +8400,7 @@ ScalarEvolution::BackedgeTakenInfo::getExact(const Loop *L, ScalarEvolution *SE,
const SCEV *
ScalarEvolution::BackedgeTakenInfo::getExact(const BasicBlock *ExitingBlock,
ScalarEvolution *SE) const {
- for (auto &ENT : ExitNotTaken)
+ for (const auto &ENT : ExitNotTaken)
if (ENT.ExitingBlock == ExitingBlock && ENT.hasAlwaysTruePredicate())
return ENT.ExactNotTaken;
@@ -8374,7 +8409,7 @@ ScalarEvolution::BackedgeTakenInfo::getExact(const BasicBlock *ExitingBlock,
const SCEV *ScalarEvolution::BackedgeTakenInfo::getConstantMax(
const BasicBlock *ExitingBlock, ScalarEvolution *SE) const {
- for (auto &ENT : ExitNotTaken)
+ for (const auto &ENT : ExitNotTaken)
if (ENT.ExitingBlock == ExitingBlock && ENT.hasAlwaysTruePredicate())
return ENT.MaxNotTaken;
@@ -8433,8 +8468,8 @@ ScalarEvolution::ExitLimit::ExitLimit(
assert((isa<SCEVCouldNotCompute>(MaxNotTaken) ||
isa<SCEVConstant>(MaxNotTaken)) &&
"No point in having a non-constant max backedge taken count!");
- for (auto *PredSet : PredSetList)
- for (auto *P : *PredSet)
+ for (const auto *PredSet : PredSetList)
+ for (const auto *P : *PredSet)
addPredicate(P);
assert((isa<SCEVCouldNotCompute>(E) || !E->getType()->isPointerTy()) &&
"Backedge count should be int");
@@ -10522,8 +10557,8 @@ bool ScalarEvolution::isKnownViaInduction(ICmpInst::Predicate Pred,
// Domination relationship must be a linear order on collected loops.
#ifndef NDEBUG
- for (auto *L1 : LoopsUsed)
- for (auto *L2 : LoopsUsed)
+ for (const auto *L1 : LoopsUsed)
+ for (const auto *L2 : LoopsUsed)
assert((DT.dominates(L1->getHeader(), L2->getHeader()) ||
DT.dominates(L2->getHeader(), L1->getHeader())) &&
"Domination relationship is not a linear order");
@@ -10977,8 +11012,10 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L,
ICmpInst::Predicate Pred,
const SCEV *LHS, const SCEV *RHS) {
// Interpret a null as meaning no loop, where there is obviously no guard
- // (interprocedural conditions notwithstanding).
- if (!L) return true;
+ // (interprocedural conditions notwithstanding). Do not bother about
+ // unreachable loops.
+ if (!L || !DT.isReachableFromEntry(L->getHeader()))
+ return true;
if (VerifyIR)
assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()) &&
@@ -11035,12 +11072,6 @@ ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L,
return true;
}
- // If the loop is not reachable from the entry block, we risk running into an
- // infinite loop as we walk up into the dom tree. These loops do not matter
- // anyway, so we just return a conservative answer when we see them.
- if (!DT.isReachableFromEntry(L->getHeader()))
- return false;
-
if (isImpliedViaGuard(Latch, Pred, LHS, RHS))
return true;
@@ -11086,6 +11117,9 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB,
ICmpInst::Predicate Pred,
const SCEV *LHS,
const SCEV *RHS) {
+ // Do not bother proving facts for unreachable code.
+ if (!DT.isReachableFromEntry(BB))
+ return true;
if (VerifyIR)
assert(!verifyFunction(*BB->getParent(), &dbgs()) &&
"This cannot be done on broken IR!");
@@ -11162,14 +11196,13 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB,
if (ProveViaGuard(Pair.first))
return true;
- const BranchInst *LoopEntryPredicate =
+ const BranchInst *BlockEntryPredicate =
dyn_cast<BranchInst>(Pair.first->getTerminator());
- if (!LoopEntryPredicate ||
- LoopEntryPredicate->isUnconditional())
+ if (!BlockEntryPredicate || BlockEntryPredicate->isUnconditional())
continue;
- if (ProveViaCond(LoopEntryPredicate->getCondition(),
- LoopEntryPredicate->getSuccessor(0) != Pair.second))
+ if (ProveViaCond(BlockEntryPredicate->getCondition(),
+ BlockEntryPredicate->getSuccessor(0) != Pair.second))
return true;
}
@@ -13179,7 +13212,7 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
if (!isa<SCEVCouldNotCompute>(PBT)) {
OS << "Predicated backedge-taken count is " << *PBT << "\n";
OS << " Predicates:\n";
- for (auto *P : Preds)
+ for (const auto *P : Preds)
P->print(OS, 4);
} else {
OS << "Unpredictable predicated backedge-taken count. ";
@@ -13256,7 +13289,7 @@ void ScalarEvolution::print(raw_ostream &OS) const {
}
bool First = true;
- for (auto *Iter = L; Iter; Iter = Iter->getParentLoop()) {
+ for (const auto *Iter = L; Iter; Iter = Iter->getParentLoop()) {
if (First) {
OS << "\t\t" "LoopDispositions: { ";
First = false;
@@ -13268,7 +13301,7 @@ void ScalarEvolution::print(raw_ostream &OS) const {
OS << ": " << loopDispositionToStr(SE.getLoopDisposition(SV, Iter));
}
- for (auto *InnerL : depth_first(L)) {
+ for (const auto *InnerL : depth_first(L)) {
if (InnerL == L)
continue;
if (First) {
@@ -13348,7 +13381,7 @@ ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) {
// This recurrence is variant w.r.t. L if any of its operands
// are variant.
- for (auto *Op : AR->operands())
+ for (const auto *Op : AR->operands())
if (!isLoopInvariant(Op, L))
return LoopVariant;
@@ -13363,7 +13396,7 @@ ScalarEvolution::computeLoopDisposition(const SCEV *S, const Loop *L) {
case scSMinExpr:
case scSequentialUMinExpr: {
bool HasVarying = false;
- for (auto *Op : cast<SCEVNAryExpr>(S)->operands()) {
+ for (const auto *Op : cast<SCEVNAryExpr>(S)->operands()) {
LoopDisposition D = getLoopDisposition(Op, L);
if (D == LoopVariant)
return LoopVariant;
@@ -13529,12 +13562,12 @@ void ScalarEvolution::forgetMemoizedResults(ArrayRef<const SCEV *> SCEVs) {
const SCEV *Curr = Worklist.pop_back_val();
auto Users = SCEVUsers.find(Curr);
if (Users != SCEVUsers.end())
- for (auto *User : Users->second)
+ for (const auto *User : Users->second)
if (ToForget.insert(User).second)
Worklist.push_back(User);
}
- for (auto *S : ToForget)
+ for (const auto *S : ToForget)
forgetMemoizedResultsImpl(S);
for (auto I = PredicatedSCEVRewrites.begin();
@@ -13747,7 +13780,7 @@ void ScalarEvolution::verify() const {
if (ValidLoops.insert(L).second)
Worklist.append(L->begin(), L->end());
}
- for (auto &KV : ValueExprMap) {
+ for (const auto &KV : ValueExprMap) {
#ifndef NDEBUG
// Check for SCEV expressions referencing invalid/deleted loops.
if (auto *AR = dyn_cast<SCEVAddRecExpr>(KV.second)) {
@@ -14018,7 +14051,7 @@ public:
const SCEV *visitUnknown(const SCEVUnknown *Expr) {
if (Pred) {
if (auto *U = dyn_cast<SCEVUnionPredicate>(Pred)) {
- for (auto *Pred : U->getPredicates())
+ for (const auto *Pred : U->getPredicates())
if (const auto *IPred = dyn_cast<SCEVComparePredicate>(Pred))
if (IPred->getLHS() == Expr &&
IPred->getPredicate() == ICmpInst::ICMP_EQ)
@@ -14098,7 +14131,7 @@ private:
PredicatedRewrite = SE.createAddRecFromPHIWithCasts(Expr);
if (!PredicatedRewrite)
return Expr;
- for (auto *P : PredicatedRewrite->second){
+ for (const auto *P : PredicatedRewrite->second){
// Wrap predicates from outer loops are not supported.
if (auto *WP = dyn_cast<const SCEVWrapPredicate>(P)) {
if (L != WP->getExpr()->getLoop())
@@ -14135,7 +14168,7 @@ const SCEVAddRecExpr *ScalarEvolution::convertSCEVToAddRecWithPredicates(
// Since the transformation was successful, we can now transfer the SCEV
// predicates.
- for (auto *P : TransformPreds)
+ for (const auto *P : TransformPreds)
Preds.insert(P);
return AddRec;
@@ -14234,7 +14267,7 @@ SCEVWrapPredicate::getImpliedFlags(const SCEVAddRecExpr *AR,
/// Union predicates don't get cached so create a dummy set ID for it.
SCEVUnionPredicate::SCEVUnionPredicate(ArrayRef<const SCEVPredicate *> Preds)
: SCEVPredicate(FoldingSetNodeIDRef(nullptr, 0), P_Union) {
- for (auto *P : Preds)
+ for (const auto *P : Preds)
add(P);
}
@@ -14253,13 +14286,13 @@ bool SCEVUnionPredicate::implies(const SCEVPredicate *N) const {
}
void SCEVUnionPredicate::print(raw_ostream &OS, unsigned Depth) const {
- for (auto Pred : Preds)
+ for (const auto *Pred : Preds)
Pred->print(OS, Depth);
}
void SCEVUnionPredicate::add(const SCEVPredicate *N) {
if (const auto *Set = dyn_cast<SCEVUnionPredicate>(N)) {
- for (auto Pred : Set->Preds)
+ for (const auto *Pred : Set->Preds)
add(Pred);
return;
}
@@ -14276,7 +14309,7 @@ PredicatedScalarEvolution::PredicatedScalarEvolution(ScalarEvolution &SE,
void ScalarEvolution::registerUser(const SCEV *User,
ArrayRef<const SCEV *> Ops) {
- for (auto *Op : Ops)
+ for (const auto *Op : Ops)
// We do not expect that forgetting cached data for SCEVConstants will ever
// open any prospects for sharpening or introduce any correctness issues,
// so we don't bother storing their dependencies.
@@ -14307,7 +14340,7 @@ const SCEV *PredicatedScalarEvolution::getBackedgeTakenCount() {
if (!BackedgeCount) {
SmallVector<const SCEVPredicate *, 4> Preds;
BackedgeCount = SE.getPredicatedBackedgeTakenCount(&L, Preds);
- for (auto *P : Preds)
+ for (const auto *P : Preds)
addPredicate(*P);
}
return BackedgeCount;
@@ -14378,7 +14411,7 @@ const SCEVAddRecExpr *PredicatedScalarEvolution::getAsAddRec(Value *V) {
if (!New)
return nullptr;
- for (auto *P : NewPreds)
+ for (const auto *P : NewPreds)
addPredicate(*P);
RewriteMap[SE.getSCEV(V)] = {Generation, New};
diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp
index 52e8566aca3c..162fd75c73e0 100644
--- a/llvm/lib/Analysis/StackLifetime.cpp
+++ b/llvm/lib/Analysis/StackLifetime.cpp
@@ -182,7 +182,7 @@ void StackLifetime::calculateLocalLiveness() {
// Compute LiveIn by unioning together the LiveOut sets of all preds.
BitVector LocalLiveIn;
- for (auto *PredBB : predecessors(BB)) {
+ for (const auto *PredBB : predecessors(BB)) {
LivenessMap::const_iterator I = BlockLiveness.find(PredBB);
// If a predecessor is unreachable, ignore it.
if (I == BlockLiveness.end())
diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 94b646ab7c06..9698af3ca85c 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -206,7 +206,7 @@ template <typename CalleeTy> struct FunctionInfo {
O << " allocas uses:\n";
if (F) {
- for (auto &I : instructions(F)) {
+ for (const auto &I : instructions(F)) {
if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
auto &AS = Allocas.find(AI)->second;
O << " " << AI->getName() << "["
@@ -763,7 +763,7 @@ const ConstantRange *findParamAccess(const FunctionSummary &FS,
uint32_t ParamNo) {
assert(FS.isLive());
assert(FS.isDSOLocal());
- for (auto &PS : FS.paramAccesses())
+ for (const auto &PS : FS.paramAccesses())
if (ParamNo == PS.ParamNo)
return &PS.Use;
return nullptr;
@@ -823,7 +823,7 @@ GVToSSI createGlobalStackSafetyInfo(
Copy.begin()->first->getParent()->getDataLayout().getPointerSizeInBits();
StackSafetyDataFlowAnalysis<GlobalValue> SSDFA(PointerSize, std::move(Copy));
- for (auto &F : SSDFA.run()) {
+ for (const auto &F : SSDFA.run()) {
auto FI = F.second;
auto &SrcF = Functions[F.first];
for (auto &KV : FI.Allocas) {
@@ -922,7 +922,7 @@ StackSafetyInfo::getParamAccesses(ModuleSummaryIndex &Index) const {
FunctionSummary::ParamAccess &Param = ParamAccesses.back();
Param.Calls.reserve(PS.Calls.size());
- for (auto &C : PS.Calls) {
+ for (const auto &C : PS.Calls) {
// Parameter forwarded into another function by any or unknown offset
// will make ParamAccess::Range as FullSet anyway. So we can drop the
// entire parameter like we did above.
@@ -978,7 +978,7 @@ void StackSafetyGlobalInfo::print(raw_ostream &O) const {
if (SSI.empty())
return;
const Module &M = *SSI.begin()->first->getParent();
- for (auto &F : M.functions()) {
+ for (const auto &F : M.functions()) {
if (!F.isDeclaration()) {
SSI.find(&F)->second.print(O, F.getName(), &F);
O << " safe accesses:"
@@ -1094,7 +1094,7 @@ bool StackSafetyGlobalInfoWrapperPass::runOnModule(Module &M) {
bool llvm::needsParamAccessSummary(const Module &M) {
if (StackSafetyRun)
return true;
- for (auto &F : M.functions())
+ for (const auto &F : M.functions())
if (F.hasFnAttribute(Attribute::SanitizeMemTag))
return true;
return false;
@@ -1126,13 +1126,13 @@ void llvm::generateParamAccessSummary(ModuleSummaryIndex &Index) {
continue;
if (FS->isLive() && FS->isDSOLocal()) {
FunctionInfo<FunctionSummary> FI;
- for (auto &PS : FS->paramAccesses()) {
+ for (const auto &PS : FS->paramAccesses()) {
auto &US =
FI.Params
.emplace(PS.ParamNo, FunctionSummary::ParamAccess::RangeWidth)
.first->second;
US.Range = PS.Use;
- for (auto &Call : PS.Calls) {
+ for (const auto &Call : PS.Calls) {
assert(!Call.Offsets.isFullSet());
FunctionSummary *S =
findCalleeFunctionSummary(Call.Callee, FS->modulePath());
@@ -1158,10 +1158,10 @@ void llvm::generateParamAccessSummary(ModuleSummaryIndex &Index) {
NumCombinedDataFlowNodes += Functions.size();
StackSafetyDataFlowAnalysis<FunctionSummary> SSDFA(
FunctionSummary::ParamAccess::RangeWidth, std::move(Functions));
- for (auto &KV : SSDFA.run()) {
+ for (const auto &KV : SSDFA.run()) {
std::vector<FunctionSummary::ParamAccess> NewParams;
NewParams.reserve(KV.second.Params.size());
- for (auto &Param : KV.second.Params) {
+ for (const auto &Param : KV.second.Params) {
// It's not needed as FullSet is processed the same as a missing value.
if (Param.second.Range.isFullSet())
continue;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 6e34a8303c08..cfa6e3a97626 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -58,14 +58,16 @@ bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) {
}
IntrinsicCostAttributes::IntrinsicCostAttributes(
- Intrinsic::ID Id, const CallBase &CI, InstructionCost ScalarizationCost)
+ Intrinsic::ID Id, const CallBase &CI, InstructionCost ScalarizationCost,
+ bool TypeBasedOnly)
: II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id),
ScalarizationCost(ScalarizationCost) {
if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
FMF = FPMO->getFastMathFlags();
- Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
+ if (!TypeBasedOnly)
+ Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
FunctionType *FTy = CI.getCalledFunction()->getFunctionType();
ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
}
@@ -294,8 +296,8 @@ bool TargetTransformInfo::isHardwareLoopProfitable(
bool TargetTransformInfo::preferPredicateOverEpilogue(
Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
TargetLibraryInfo *TLI, DominatorTree *DT,
- const LoopAccessInfo *LAI) const {
- return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
+ LoopVectorizationLegality *LVL) const {
+ return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL);
}
PredicationStyle TargetTransformInfo::emitGetActiveLaneMask() const {
diff --git a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
index 560f46d39d0d..216027778fab 100644
--- a/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -303,7 +303,7 @@ public:
/// given offset. Update the offset to be relative to the field type.
TBAAStructTypeNode getField(uint64_t &Offset) const {
bool NewFormat = isNewFormat();
- const ArrayRef<MDOperand> Operands(Node->op_begin(), Node->op_end());
+ const ArrayRef<MDOperand> Operands = Node->operands();
const unsigned NumOperands = Operands.size();
if (NewFormat) {
@@ -811,7 +811,8 @@ MDNode *AAMDNodes::extendToTBAA(MDNode *MD, ssize_t Len) {
return nullptr;
// Otherwise, create TBAA with the new Len
- SmallVector<Metadata *, 4> NextNodes(MD->operands());
+ ArrayRef<MDOperand> MDOperands = MD->operands();
+ SmallVector<Metadata *, 4> NextNodes(MDOperands.begin(), MDOperands.end());
ConstantInt *PreviousSize = mdconst::extract<ConstantInt>(NextNodes[3]);
// Don't create a new MDNode if it is the same length.
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index add2d427e05b..1f3798d1338e 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -2297,7 +2297,7 @@ static bool isKnownNonNullFromDominatingCondition(const Value *V,
return false;
unsigned NumUsesExplored = 0;
- for (auto *U : V->users()) {
+ for (const auto *U : V->users()) {
// Avoid massive lists
if (NumUsesExplored >= DomConditionsMaxUses)
break;
@@ -2338,7 +2338,7 @@ static bool isKnownNonNullFromDominatingCondition(const Value *V,
SmallVector<const User *, 4> WorkList;
SmallPtrSet<const User *, 4> Visited;
- for (auto *CmpU : U->users()) {
+ for (const auto *CmpU : U->users()) {
assert(WorkList.empty() && "Should be!");
if (Visited.insert(CmpU).second)
WorkList.push_back(CmpU);
@@ -2352,7 +2352,7 @@ static bool isKnownNonNullFromDominatingCondition(const Value *V,
// TODO: Support similar logic of OR and EQ predicate?
if (NonNullIfTrue)
if (match(Curr, m_LogicalAnd(m_Value(), m_Value()))) {
- for (auto *CurrU : Curr->users())
+ for (const auto *CurrU : Curr->users())
if (Visited.insert(CurrU).second)
WorkList.push_back(CurrU);
continue;
@@ -5073,7 +5073,7 @@ bool llvm::isOverflowIntrinsicNoWrap(const WithOverflowInst *WO,
if (DT.dominates(NoWrapEdge, Result->getParent()))
continue;
- for (auto &RU : Result->uses())
+ for (const auto &RU : Result->uses())
if (!DT.dominates(NoWrapEdge, RU))
return false;
}
@@ -5645,7 +5645,7 @@ static bool programUndefinedIfUndefOrPoison(const Value *V,
// whether a value is directly passed to an instruction that must take
// well-defined operands.
- for (auto &I : make_range(Begin, End)) {
+ for (const auto &I : make_range(Begin, End)) {
if (isa<DbgInfoIntrinsic>(I))
continue;
if (--ScanLimit == 0)
@@ -5676,7 +5676,7 @@ static bool programUndefinedIfUndefOrPoison(const Value *V,
Visited.insert(BB);
while (true) {
- for (auto &I : make_range(Begin, End)) {
+ for (const auto &I : make_range(Begin, End)) {
if (isa<DbgInfoIntrinsic>(I))
continue;
if (--ScanLimit == 0)
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 894680cda1fc..c4795a80ead2 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -762,7 +762,7 @@ static void addToAccessGroupList(ListT &List, MDNode *AccGroups) {
return;
}
- for (auto &AccGroupListOp : AccGroups->operands()) {
+ for (const auto &AccGroupListOp : AccGroups->operands()) {
auto *Item = cast<MDNode>(AccGroupListOp.get());
assert(isValidAsAccessGroup(Item) && "List item must be an access group");
List.insert(Item);
@@ -1497,7 +1497,7 @@ void VFABI::getVectorVariantNames(
SmallVector<StringRef, 8> ListAttr;
S.split(ListAttr, ",");
- for (auto &S : SetVector<StringRef>(ListAttr.begin(), ListAttr.end())) {
+ for (const auto &S : SetVector<StringRef>(ListAttr.begin(), ListAttr.end())) {
#ifndef NDEBUG
LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << S << "'\n");
Optional<VFInfo> Info = VFABI::tryDemangleForVFABI(S, *(CI.getModule()));
diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
index 1613e7e42a0a..c5ab35d94860 100644
--- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
+++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
@@ -260,6 +260,9 @@ bool MetadataVerifier::verifyKernel(msgpack::DocNode &Node) {
return false;
if (!verifyIntegerEntry(KernelMap, ".private_segment_fixed_size", true))
return false;
+ if (!verifyScalarEntry(KernelMap, ".uses_dynamic_stack", false,
+ msgpack::Type::Boolean))
+ return false;
if (!verifyIntegerEntry(KernelMap, ".kernarg_segment_align", true))
return false;
if (!verifyIntegerEntry(KernelMap, ".wavefront_size", true))
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 8d5a2555f9af..1d6c21bd66d1 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -5510,6 +5510,61 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (!OperandBundles.empty())
UpgradeOperandBundles(OperandBundles);
+ if (auto *IA = dyn_cast<InlineAsm>(Callee)) {
+ InlineAsm::ConstraintInfoVector ConstraintInfo = IA->ParseConstraints();
+ auto IsLabelConstraint = [](const InlineAsm::ConstraintInfo &CI) {
+ return CI.Type == InlineAsm::isLabel;
+ };
+ if (none_of(ConstraintInfo, IsLabelConstraint)) {
+ // Upgrade explicit blockaddress arguments to label constraints.
+ // Verify that the last arguments are blockaddress arguments that
+ // match the indirect destinations. Clang always generates callbr
+ // in this form. We could support reordering with more effort.
+ unsigned FirstBlockArg = Args.size() - IndirectDests.size();
+ for (unsigned ArgNo = FirstBlockArg; ArgNo < Args.size(); ++ArgNo) {
+ unsigned LabelNo = ArgNo - FirstBlockArg;
+ auto *BA = dyn_cast<BlockAddress>(Args[ArgNo]);
+ if (!BA || BA->getFunction() != F ||
+ LabelNo > IndirectDests.size() ||
+ BA->getBasicBlock() != IndirectDests[LabelNo])
+ return error("callbr argument does not match indirect dest");
+ }
+
+ // Remove blockaddress arguments.
+ Args.erase(Args.begin() + FirstBlockArg, Args.end());
+ ArgTyIDs.erase(ArgTyIDs.begin() + FirstBlockArg, ArgTyIDs.end());
+
+ // Recreate the function type with fewer arguments.
+ SmallVector<Type *> ArgTys;
+ for (Value *Arg : Args)
+ ArgTys.push_back(Arg->getType());
+ FTy =
+ FunctionType::get(FTy->getReturnType(), ArgTys, FTy->isVarArg());
+
+ // Update constraint string to use label constraints.
+ std::string Constraints = IA->getConstraintString();
+ unsigned ArgNo = 0;
+ size_t Pos = 0;
+ for (const auto &CI : ConstraintInfo) {
+ if (CI.hasArg()) {
+ if (ArgNo >= FirstBlockArg)
+ Constraints.insert(Pos, "!");
+ ++ArgNo;
+ }
+
+ // Go to next constraint in string.
+ Pos = Constraints.find(',', Pos);
+ if (Pos == std::string::npos)
+ break;
+ ++Pos;
+ }
+
+ Callee = InlineAsm::get(FTy, IA->getAsmString(), Constraints,
+ IA->hasSideEffects(), IA->isAlignStack(),
+ IA->getDialect(), IA->canThrow());
+ }
+ }
+
I = CallBrInst::Create(FTy, Callee, DefaultDest, IndirectDests, Args,
OperandBundles);
ResTypeID = getContainedTypeID(FTyID);
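
The constraint-string edit is the subtle part of the callbr upgrade: a '!' is inserted in front of each constraint that consumed one of the trailing blockaddress arguments, turning it into a label constraint. Below is a standalone sketch of just that string edit; the constraint text and argument indices are invented, and unlike the real code it treats every comma-separated piece as an argument constraint rather than walking ParseConstraints().

#include <cstdio>
#include <string>

int main() {
  std::string Constraints = "r,r,X"; // invented example
  unsigned FirstBlockArg = 2;        // the last argument was a blockaddress
  unsigned ArgNo = 0;
  size_t Pos = 0;
  while (true) {
    // Prefix constraints bound to trailing blockaddress args with '!'.
    if (ArgNo >= FirstBlockArg)
      Constraints.insert(Pos, "!");
    ++ArgNo;
    // Advance to the start of the next constraint.
    Pos = Constraints.find(',', Pos);
    if (Pos == std::string::npos)
      break;
    ++Pos;
  }
  std::printf("%s\n", Constraints.c_str()); // prints "r,r,!X"
}
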
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index 0d57ae4ef9df..13d53a35084d 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -1226,10 +1226,12 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
- MetadataList.assignValue(
- LocalAsMetadata::get(ValueList.getValueFwdRef(
- Record[1], Ty, TyID, /*ConstExprInsertBB*/ nullptr)),
- NextMetadataNo);
+ Value *V = ValueList.getValueFwdRef(Record[1], Ty, TyID,
+ /*ConstExprInsertBB*/ nullptr);
+ if (!V)
+ return error("Invalid value reference from old fn metadata");
+
+ MetadataList.assignValue(LocalAsMetadata::get(V), NextMetadataNo);
NextMetadataNo++;
break;
}
@@ -1248,8 +1250,11 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
if (Ty->isMetadataTy())
Elts.push_back(getMD(Record[i + 1]));
else if (!Ty->isVoidTy()) {
- auto *MD = ValueAsMetadata::get(ValueList.getValueFwdRef(
- Record[i + 1], Ty, TyID, /*ConstExprInsertBB*/ nullptr));
+ Value *V = ValueList.getValueFwdRef(Record[i + 1], Ty, TyID,
+ /*ConstExprInsertBB*/ nullptr);
+ if (!V)
+ return error("Invalid value reference from old metadata");
+ auto *MD = ValueAsMetadata::get(V);
assert(isa<ConstantAsMetadata>(MD) &&
"Expected non-function-local metadata");
Elts.push_back(MD);
@@ -1269,10 +1274,12 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
if (Ty->isMetadataTy() || Ty->isVoidTy())
return error("Invalid record");
- MetadataList.assignValue(
- ValueAsMetadata::get(ValueList.getValueFwdRef(
- Record[1], Ty, TyID, /*ConstExprInsertBB*/ nullptr)),
- NextMetadataNo);
+ Value *V = ValueList.getValueFwdRef(Record[1], Ty, TyID,
+ /*ConstExprInsertBB*/ nullptr);
+ if (!V)
+ return error("Invalid value reference from metadata");
+
+ MetadataList.assignValue(ValueAsMetadata::get(V), NextMetadataNo);
NextMetadataNo++;
break;
}
diff --git a/llvm/lib/Bitcode/Reader/ValueList.h b/llvm/lib/Bitcode/Reader/ValueList.h
index 995d46f01f75..a5b3f6e20707 100644
--- a/llvm/lib/Bitcode/Reader/ValueList.h
+++ b/llvm/lib/Bitcode/Reader/ValueList.h
@@ -21,7 +21,6 @@
namespace llvm {
-class Constant;
class Error;
class Type;
class Value;
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 590562ce2796..d7e012fb6a9e 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -4104,8 +4104,9 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
for (const GlobalAlias &A : M.aliases()) {
auto *Aliasee = A.getAliaseeObject();
- if (!Aliasee->hasName())
- // Nameless function don't have an entry in the summary, skip it.
+ // Skip ifunc and nameless functions which don't have an entry in the
+ // summary.
+ if (!Aliasee->hasName() || isa<GlobalIFunc>(Aliasee))
continue;
auto AliasId = VE.getValueID(&A);
auto AliaseeId = VE.getValueID(Aliasee);
diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
index 727ec2e02cc2..998f629aaa4e 100644
--- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -802,7 +802,7 @@ void ValueEnumerator::organizeMetadata() {
// - by function, then
// - by isa<MDString>
// and then sort by the original/current ID. Since the IDs are guaranteed to
- // be unique, the result of std::sort will be deterministic. There's no need
+ // be unique, the result of llvm::sort will be deterministic. There's no need
// for std::stable_sort.
llvm::sort(Order, [this](MDIndex LHS, MDIndex RHS) {
return std::make_tuple(LHS.F, getMetadataTypeOrder(LHS.get(MDs)), LHS.ID) <
diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
index b10d79f4b5a6..9526bf7610b4 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
@@ -245,8 +245,8 @@ public:
void AccelTableWriter::emitHashes() const {
uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
unsigned BucketIdx = 0;
- for (auto &Bucket : Contents.getBuckets()) {
- for (auto &Hash : Bucket) {
+ for (const auto &Bucket : Contents.getBuckets()) {
+ for (const auto &Hash : Bucket) {
uint32_t HashValue = Hash->HashValue;
if (SkipIdenticalHashes && PrevHash == HashValue)
continue;
@@ -327,7 +327,7 @@ void AppleAccelTableWriter::emitData() const {
const auto &Buckets = Contents.getBuckets();
for (const AccelTableBase::HashList &Bucket : Buckets) {
uint64_t PrevHash = std::numeric_limits<uint64_t>::max();
- for (auto &Hash : Bucket) {
+ for (const auto &Hash : Bucket) {
// Terminate the previous entry if there is no hash collision with the
// current one.
if (PrevHash != std::numeric_limits<uint64_t>::max() &&
@@ -667,12 +667,12 @@ void AccelTableBase::print(raw_ostream &OS) const {
}
OS << "Buckets and Hashes: \n";
- for (auto &Bucket : Buckets)
- for (auto &Hash : Bucket)
+ for (const auto &Bucket : Buckets)
+ for (const auto &Hash : Bucket)
Hash->print(OS);
OS << "Data: \n";
- for (auto &E : Entries)
+ for (const auto &E : Entries)
E.second.print(OS);
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 94612a51d2e1..e0050a47a6f6 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -488,7 +488,7 @@ bool AsmPrinter::doInitialization(Module &M) {
GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
assert(MI && "AsmPrinter didn't require GCModuleInfo?");
- for (auto &I : *MI)
+ for (const auto &I : *MI)
if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*I))
MP->beginAssembly(M, *MI, *this);
@@ -1731,7 +1731,7 @@ static unsigned getNumGlobalVariableUses(const Constant *C) {
return 1;
unsigned NumUses = 0;
- for (auto *CU : C->users())
+ for (const auto *CU : C->users())
NumUses += getNumGlobalVariableUses(dyn_cast<Constant>(CU));
return NumUses;
@@ -1754,7 +1754,7 @@ static bool isGOTEquivalentCandidate(const GlobalVariable *GV,
// To be a got equivalent, at least one of its users need to be a constant
// expression used by another global variable.
- for (auto *U : GV->users())
+ for (const auto *U : GV->users())
NumGOTEquivUsers += getNumGlobalVariableUses(dyn_cast<Constant>(U));
return NumGOTEquivUsers > 0;
@@ -1797,7 +1797,7 @@ void AsmPrinter::emitGlobalGOTEquivs() {
}
GlobalGOTEquivs.clear();
- for (auto *GV : FailedCandidates)
+ for (const auto *GV : FailedCandidates)
emitGlobalVariable(GV);
}
@@ -2731,6 +2731,8 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
// to represent relocations on supported targets. Expressions involving only
// constant addresses are constant folded instead.
switch (CE->getOpcode()) {
+ default:
+ break; // Error
case Instruction::AddrSpaceCast: {
const Constant *Op = CE->getOperand(0);
unsigned DstAS = CE->getType()->getPointerAddressSpace();
@@ -2738,24 +2740,7 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
if (TM.isNoopAddrSpaceCast(SrcAS, DstAS))
return lowerConstant(Op);
- // Fallthrough to error.
- LLVM_FALLTHROUGH;
- }
- default: {
- // If the code isn't optimized, there may be outstanding folding
- // opportunities. Attempt to fold the expression using DataLayout as a
- // last resort before giving up.
- Constant *C = ConstantFoldConstant(CE, getDataLayout());
- if (C != CE)
- return lowerConstant(C);
-
- // Otherwise report the problem to the user.
- std::string S;
- raw_string_ostream OS(S);
- OS << "Unsupported expression in static initializer: ";
- CE->printAsOperand(OS, /*PrintType=*/false,
- !MF ? nullptr : MF->getFunction().getParent());
- report_fatal_error(Twine(OS.str()));
+ break; // Error
}
case Instruction::GetElementPtr: {
// Generate a symbolic expression for the byte address
@@ -2860,6 +2845,21 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
return MCBinaryExpr::createAdd(LHS, RHS, Ctx);
}
}
+
+ // If the code isn't optimized, there may be outstanding folding
+ // opportunities. Attempt to fold the expression using DataLayout as a
+ // last resort before giving up.
+ Constant *C = ConstantFoldConstant(CE, getDataLayout());
+ if (C != CE)
+ return lowerConstant(C);
+
+ // Otherwise report the problem to the user.
+ std::string S;
+ raw_string_ostream OS(S);
+ OS << "Unsupported expression in static initializer: ";
+ CE->printAsOperand(OS, /*PrintType=*/false,
+ !MF ? nullptr : MF->getFunction().getParent());
+ report_fatal_error(Twine(OS.str()));
}
static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *C,
@@ -3359,9 +3359,12 @@ void AsmPrinter::emitGlobalConstant(const DataLayout &DL, const Constant *CV,
}
if (!AliasList)
return;
- for (const auto &AliasPair : *AliasList)
- report_fatal_error("Aliases with offset " + Twine(AliasPair.first) +
- " were not emitted.");
+ // TODO: These remaining aliases are not emitted in the correct location. Need
+ // to handle the case where the alias offset doesn't refer to any sub-element.
+ for (auto &AliasPair : *AliasList) {
+ for (const GlobalAlias *GA : AliasPair.second)
+ OutStreamer->emitLabel(getSymbol(GA));
+ }
}
void AsmPrinter::emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
@@ -3717,7 +3720,7 @@ void AsmPrinter::emitStackMaps(StackMaps &SM) {
// No GC strategy, use the default format.
NeedsDefault = true;
else
- for (auto &I : *MI) {
+ for (const auto &I : *MI) {
if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*I))
if (MP->emitStackMaps(SM, *this))
continue;
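The lowerConstant hunks above move the last-chance ConstantFoldConstant call and the fatal-error report out of the switch's default case and place them once after the switch, so both the default case and the unsupported-addrspacecast path reach them with a plain break. A minimal control-flow sketch of that shape, with invented names:

#include <cstdio>
#include <cstdlib>

enum class Op { Supported, NoopCast, Unsupported };

// Stand-in for the last-chance constant-folding attempt.
static int tryFold(Op O) { return O == Op::NoopCast ? 3 : 0; }

static int lowerSketch(Op O, bool NoopCastIsFree) {
  switch (O) {
  default:
    break; // Error: handled by the shared fallback below.
  case Op::Supported:
    return 1; // lowered directly
  case Op::NoopCast:
    if (NoopCastIsFree)
      return 2; // lowered by looking through the cast
    break; // Error: also handled by the shared fallback below.
  }

  // Shared fallback: try a last-chance fold, otherwise report and give up.
  if (int Folded = tryFold(O))
    return Folded;
  std::fprintf(stderr, "unsupported expression in static initializer\n");
  std::abort();
}

int main() {
  return lowerSketch(Op::NoopCast, /*NoopCastIsFree=*/false) == 3 ? 0 : 1;
}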
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 719fec06aa33..bfa53f5b9374 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -309,7 +309,7 @@ void AsmPrinter::emitDwarfDIE(const DIE &Die) const {
// Emit the DIE children if any.
if (Die.hasChildren()) {
- for (auto &Child : Die.children())
+ for (const auto &Child : Die.children())
emitDwarfDIE(Child);
OutStreamer->AddComment("End Of Children Mark");
diff --git a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
index 5da50d7aab9f..1d546e5fd72e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -374,7 +374,7 @@ void DIEHash::computeHash(const DIE &Die) {
addAttributes(Die);
// Then hash each of the children of the DIE.
- for (auto &C : Die.children()) {
+ for (const auto &C : Die.children()) {
// 7.27 Step 7
// If C is a nested type entry or a member function entry, ...
if (isType(C.getTag()) || (C.getTag() == dwarf::DW_TAG_subprogram && isType(C.getParent()->getTag()))) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
index 1358f4d25990..dabbfb45f687 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
@@ -340,11 +340,11 @@ static void clobberRegEntries(InlinedEntity Var, unsigned RegNo,
if (Entry.getInstr()->hasDebugOperandForReg(RegNo)) {
IndicesToErase.push_back(Index);
Entry.endEntry(ClobberIndex);
- for (auto &MO : Entry.getInstr()->debug_operands())
+ for (const auto &MO : Entry.getInstr()->debug_operands())
if (MO.isReg() && MO.getReg() && MO.getReg() != RegNo)
MaybeRemovedRegisters.insert(MO.getReg());
} else {
- for (auto &MO : Entry.getInstr()->debug_operands())
+ for (const auto &MO : Entry.getInstr()->debug_operands())
if (MO.isReg() && MO.getReg())
KeepRegisters.insert(MO.getReg());
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 660a064687d3..8ebbed974abb 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -304,7 +304,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
LabelsBeforeInsn[Entries.front().getInstr()] = Asm->getFunctionBegin();
if (Entries.front().getInstr()->getDebugExpression()->isFragment()) {
// Mark all non-overlapping initial fragments.
- for (auto I = Entries.begin(); I != Entries.end(); ++I) {
+ for (const auto *I = Entries.begin(); I != Entries.end(); ++I) {
if (!I->isDbgValue())
continue;
const DIExpression *Fragment = I->getInstr()->getDebugExpression();
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index b3f99d346faa..b26960cdebb8 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -848,7 +848,7 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
Optional<unsigned> NVPTXAddressSpace;
DIELoc *Loc = new (DIEValueAllocator) DIELoc;
DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
- for (auto &Fragment : DV.getFrameIndexExprs()) {
+ for (const auto &Fragment : DV.getFrameIndexExprs()) {
Register FrameReg;
const DIExpression *Expr = Fragment.Expr;
const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
@@ -970,7 +970,7 @@ sortLocalVars(SmallVectorImpl<DbgVariable *> &Input) {
SmallDenseSet<DbgVariable *, 8> Visiting;
// Initialize the worklist and the DIVariable lookup table.
- for (auto Var : reverse(Input)) {
+ for (auto *Var : reverse(Input)) {
DbgVar.insert({Var->getVariable(), Var});
WorkList.push_back({Var, 0});
}
@@ -1005,7 +1005,7 @@ sortLocalVars(SmallVectorImpl<DbgVariable *> &Input) {
// Push dependencies and this node onto the worklist, so that this node is
// visited again after all of its dependencies are handled.
WorkList.push_back({Var, 1});
- for (auto *Dependency : dependencies(Var)) {
+ for (const auto *Dependency : dependencies(Var)) {
// Don't add dependency if it is in a different lexical scope or a global.
if (const auto *Dep = dyn_cast<const DILocalVariable>(Dependency))
if (DbgVariable *Var = DbgVar.lookup(Dep))
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 866338a949f3..54af14429907 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -819,7 +819,7 @@ static void collectCallSiteParameters(const MachineInstr *CallMI,
}
// Do not emit CSInfo for undef forwarding registers.
- for (auto &MO : CallMI->uses())
+ for (const auto &MO : CallMI->uses())
if (MO.isReg() && MO.isUndef())
ForwardedRegWorklist.erase(MO.getReg());
@@ -2235,7 +2235,7 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
#endif
// Construct abstract scopes.
for (LexicalScope *AScope : LScopes.getAbstractScopesList()) {
- auto *SP = cast<DISubprogram>(AScope->getScopeNode());
+ const auto *SP = cast<DISubprogram>(AScope->getScopeNode());
for (const DINode *DN : SP->getRetainedNodes()) {
if (!Processed.insert(InlinedEntity(DN, nullptr)).second)
continue;
@@ -2527,7 +2527,7 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
using Encoding = DWARFExpression::Operation::Encoding;
uint64_t Offset = 0;
- for (auto &Op : Expr) {
+ for (const auto &Op : Expr) {
assert(Op.getCode() != dwarf::DW_OP_const_type &&
"3 operand ops not yet supported");
Streamer.emitInt8(Op.getCode(), Comment != End ? *(Comment++) : "");
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index ad9dc517539a..f21c1bf4e914 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -187,7 +187,7 @@ bool AtomicExpand::runOnFunction(Function &F) {
AtomicInsts.push_back(&I);
bool MadeChange = false;
- for (auto I : AtomicInsts) {
+ for (auto *I : AtomicInsts) {
auto LI = dyn_cast<LoadInst>(I);
auto SI = dyn_cast<StoreInst>(I);
auto RMWI = dyn_cast<AtomicRMWInst>(I);
@@ -1371,7 +1371,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// Look for any users of the cmpxchg that are just comparing the loaded value
// against the desired one, and replace them with the CFG-derived version.
SmallVector<ExtractValueInst *, 2> PrunedInsts;
- for (auto User : CI->users()) {
+ for (auto *User : CI->users()) {
ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
if (!EV)
continue;
@@ -1388,7 +1388,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
}
// We can remove the instructions now we're no longer iterating through them.
- for (auto EV : PrunedInsts)
+ for (auto *EV : PrunedInsts)
EV->eraseFromParent();
if (!CI->use_empty()) {
diff --git a/llvm/lib/CodeGen/BasicBlockSections.cpp b/llvm/lib/CodeGen/BasicBlockSections.cpp
index f05f5b9f9947..958212a0e448 100644
--- a/llvm/lib/CodeGen/BasicBlockSections.cpp
+++ b/llvm/lib/CodeGen/BasicBlockSections.cpp
@@ -268,8 +268,8 @@ void llvm::sortBasicBlocksAndUpdateBranches(
// If the exception section begins with a landing pad, that landing pad will
// assume a zero offset (relative to @LPStart) in the LSDA. However, a value of
// zero implies "no landing pad." This function inserts a NOP just before the EH
-// pad label to ensure a nonzero offset. Returns true if padding is not needed.
-static bool avoidZeroOffsetLandingPad(MachineFunction &MF) {
+// pad label to ensure a nonzero offset.
+void llvm::avoidZeroOffsetLandingPad(MachineFunction &MF) {
for (auto &MBB : MF) {
if (MBB.isBeginSection() && MBB.isEHPad()) {
MachineBasicBlock::iterator MI = MBB.begin();
@@ -278,10 +278,8 @@ static bool avoidZeroOffsetLandingPad(MachineFunction &MF) {
MCInst Nop = MF.getSubtarget().getInstrInfo()->getNop();
BuildMI(MBB, MI, DebugLoc(),
MF.getSubtarget().getInstrInfo()->get(Nop.getOpcode()));
- return false;
}
}
- return true;
}
// This checks if the source of this function has drifted since this binary was
@@ -297,7 +295,7 @@ static bool hasInstrProfHashMismatch(MachineFunction &MF) {
auto *Existing = MF.getFunction().getMetadata(LLVMContext::MD_annotation);
if (Existing) {
MDTuple *Tuple = cast<MDTuple>(Existing);
- for (auto &N : Tuple->operands())
+ for (const auto &N : Tuple->operands())
if (cast<MDString>(N.get())->getString() == MetadataName)
return true;
}
diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
index 689e49978d43..519b24c21d7a 100644
--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -121,7 +121,7 @@ bool VirtRegAuxInfo::isRematerializable(const LiveInterval &LI,
assert(MI && "Dead valno in interval");
}
- if (!TII.isTriviallyReMaterializable(*MI, LIS.getAliasAnalysis()))
+ if (!TII.isTriviallyReMaterializable(*MI))
return false;
}
return true;
@@ -279,7 +279,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &LI, SlotIndex *Start,
MRI.clearSimpleHint(LI.reg());
std::set<Register> HintedRegs;
- for (auto &Hint : CopyHints) {
+ for (const auto &Hint : CopyHints) {
if (!HintedRegs.insert(Hint.Reg).second ||
(TargetHint.first != 0 && Hint.Reg == TargetHint.second))
// Don't add the same reg twice or the target-type hint again.
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 6778af22f532..b6c762b93ca5 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -730,7 +730,7 @@ bool CodeGenPrepare::eliminateFallThrough(Function &F) {
// (Repeatedly) merging blocks into their predecessors can create redundant
// debug intrinsics.
- for (auto &Pred : Preds)
+ for (const auto &Pred : Preds)
if (auto *BB = cast_or_null<BasicBlock>(Pred))
RemoveRedundantDbgInstrs(BB);
@@ -3684,7 +3684,7 @@ private:
// Phi we added (subject to match) and both of them is in the same basic
// block then we can match our pair if values match. So we state that
// these values match and add it to work list to verify that.
- for (auto B : Item.first->blocks()) {
+ for (auto *B : Item.first->blocks()) {
Value *FirstValue = Item.first->getIncomingValueForBlock(B);
Value *SecondValue = Item.second->getIncomingValueForBlock(B);
if (FirstValue == SecondValue)
@@ -5227,18 +5227,31 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
WeakTrackingVH SunkAddrVH = SunkAddrs[Addr];
Value * SunkAddr = SunkAddrVH.pointsToAliveValue() ? SunkAddrVH : nullptr;
+ Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
if (SunkAddr) {
LLVM_DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode
<< " for " << *MemoryInst << "\n");
- if (SunkAddr->getType() != Addr->getType())
- SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
+ if (SunkAddr->getType() != Addr->getType()) {
+ if (SunkAddr->getType()->getPointerAddressSpace() !=
+ Addr->getType()->getPointerAddressSpace() &&
+ !DL->isNonIntegralPointerType(Addr->getType())) {
+ // There are two reasons the address spaces might not match: a no-op
+ // addrspacecast, or a ptrtoint/inttoptr pair. Either way, we emit a
+ // ptrtoint/inttoptr pair to ensure we match the original semantics.
+ // TODO: allow bitcast between different address space pointers with the
+ // same size.
+ SunkAddr = Builder.CreatePtrToInt(SunkAddr, IntPtrTy, "sunkaddr");
+ SunkAddr =
+ Builder.CreateIntToPtr(SunkAddr, Addr->getType(), "sunkaddr");
+ } else
+ SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
+ }
} else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
SubtargetInfo->addrSinkUsingGEPs())) {
// By default, we use the GEP-based method when AA is used later. This
// prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
<< " for " << *MemoryInst << "\n");
- Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
Value *ResultPtr = nullptr, *ResultIndex = nullptr;
// First, find the pointer.
@@ -5361,8 +5374,21 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
AddrMode.InBounds);
}
- if (SunkAddr->getType() != Addr->getType())
- SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
+ if (SunkAddr->getType() != Addr->getType()) {
+ if (SunkAddr->getType()->getPointerAddressSpace() !=
+ Addr->getType()->getPointerAddressSpace() &&
+ !DL->isNonIntegralPointerType(Addr->getType())) {
+ // There are two reasons the address spaces might not match: a no-op
+ // addrspacecast, or a ptrtoint/inttoptr pair. Either way, we emit a
+ // ptrtoint/inttoptr pair to ensure we match the original semantics.
+ // TODO: allow bitcast between different address space pointers with
+ // the same size.
+ SunkAddr = Builder.CreatePtrToInt(SunkAddr, IntPtrTy, "sunkaddr");
+ SunkAddr =
+ Builder.CreateIntToPtr(SunkAddr, Addr->getType(), "sunkaddr");
+ } else
+ SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
+ }
}
} else {
// We'd require a ptrtoint/inttoptr down the line, which we can't do for
@@ -7793,9 +7819,11 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
}
// After unmerging, verify that GEPIOp is actually only used in SrcBlock (not
// alive on IndirectBr edges).
- assert(find_if(GEPIOp->users(), [&](User *Usr) {
- return cast<Instruction>(Usr)->getParent() != SrcBlock;
- }) == GEPIOp->users().end() && "GEPIOp is used outside SrcBlock");
+ assert(llvm::none_of(GEPIOp->users(),
+ [&](User *Usr) {
+ return cast<Instruction>(Usr)->getParent() != SrcBlock;
+ }) &&
+ "GEPIOp is used outside SrcBlock");
return true;
}
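Both optimizeMemoryInst hunks above apply the same rule when the reused or newly built address does not have the original pointer type: if the address spaces differ and the target pointer type is not non-integral, rebuild a ptrtoint/inttoptr pair so the original semantics are preserved; otherwise a plain pointer cast is enough. A hedged sketch of that rule factored into one helper follows (castSunkAddr is an invented name; the IRBuilder and DataLayout calls are the usual LLVM ones).

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"

static llvm::Value *castSunkAddr(llvm::IRBuilder<> &Builder,
                                 const llvm::DataLayout &DL,
                                 llvm::Value *SunkAddr, llvm::Type *AddrTy) {
  if (SunkAddr->getType() == AddrTy)
    return SunkAddr;
  if (SunkAddr->getType()->getPointerAddressSpace() !=
          AddrTy->getPointerAddressSpace() &&
      !DL.isNonIntegralPointerType(AddrTy)) {
    // Mismatched address spaces can come from a no-op addrspacecast or from a
    // ptrtoint/inttoptr pair; rebuild the ptrtoint/inttoptr pair so the
    // original semantics are kept.
    llvm::Type *IntPtrTy = DL.getIntPtrType(AddrTy);
    llvm::Value *AsInt = Builder.CreatePtrToInt(SunkAddr, IntPtrTy, "sunkaddr");
    return Builder.CreateIntToPtr(AsInt, AddrTy, "sunkaddr");
  }
  // Same address space (or a non-integral pointer): a plain cast suffices.
  return Builder.CreatePointerCast(SunkAddr, AddrTy);
}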
diff --git a/llvm/lib/CodeGen/DFAPacketizer.cpp b/llvm/lib/CodeGen/DFAPacketizer.cpp
index 42192f41dbda..34fb1d286a58 100644
--- a/llvm/lib/CodeGen/DFAPacketizer.cpp
+++ b/llvm/lib/CodeGen/DFAPacketizer.cpp
@@ -239,7 +239,7 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
});
if (ResourceAvail && shouldAddToPacket(MI)) {
// Dependency check for MI with instructions in CurrentPacketMIs.
- for (auto MJ : CurrentPacketMIs) {
+ for (auto *MJ : CurrentPacketMIs) {
SUnit *SUJ = MIToSUnit[MJ];
assert(SUJ && "Missing SUnit Info!");
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index 32858d043383..c108f0088d43 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -576,7 +576,7 @@ static bool hasSameValue(const MachineRegisterInfo &MRI,
// If the instruction could modify memory, or there may be some intervening
// store between the two, we can't consider them to be equal.
- if (TDef->mayLoadOrStore() && !TDef->isDereferenceableInvariantLoad(nullptr))
+ if (TDef->mayLoadOrStore() && !TDef->isDereferenceableInvariantLoad())
return false;
// We also can't guarantee that they are the same if, for example, the
@@ -808,7 +808,7 @@ void updateDomTree(MachineDominatorTree *DomTree, const SSAIfConv &IfConv,
// TBB and FBB should not dominate any blocks.
// Tail children should be transferred to Head.
MachineDomTreeNode *HeadNode = DomTree->getNode(IfConv.Head);
- for (auto B : Removed) {
+ for (auto *B : Removed) {
MachineDomTreeNode *Node = DomTree->getNode(B);
assert(Node != HeadNode && "Cannot erase the head node");
while (Node->getNumChildren()) {
@@ -826,7 +826,7 @@ void updateLoops(MachineLoopInfo *Loops,
return;
// If-conversion doesn't change loop structure, and it doesn't mess with back
// edges, so updating LoopInfo is simply removing the dead blocks.
- for (auto B : Removed)
+ for (auto *B : Removed)
Loops->removeBlock(B);
}
} // namespace
@@ -1065,7 +1065,7 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
// if-conversion in a single pass. The tryConvertIf() function may erase
// blocks, but only blocks dominated by the head block. This makes it safe to
// update the dominator tree while the post-order iterator is still active.
- for (auto DomNode : post_order(DomTree))
+ for (auto *DomNode : post_order(DomTree))
if (tryConvertIf(DomNode->getBlock()))
Changed = true;
@@ -1198,7 +1198,7 @@ bool EarlyIfPredicator::runOnMachineFunction(MachineFunction &MF) {
// if-conversion in a single pass. The tryConvertIf() function may erase
// blocks, but only blocks dominated by the head block. This makes it safe to
// update the dominator tree while the post-order iterator is still active.
- for (auto DomNode : post_order(DomTree))
+ for (auto *DomNode : post_order(DomTree))
if (tryConvertIf(DomNode->getBlock()))
Changed = true;
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index 59932a542bbc..db4d42bf3ca4 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -82,8 +83,11 @@ STATISTIC(NumLoweredVPOps, "Number of folded vector predication operations");
/// \returns Whether the vector mask \p MaskVal has all lane bits set.
static bool isAllTrueMask(Value *MaskVal) {
- auto *ConstVec = dyn_cast<ConstantVector>(MaskVal);
- return ConstVec && ConstVec->isAllOnesValue();
+ if (Value *SplattedVal = getSplatValue(MaskVal))
+ if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
+ return ConstValue->isAllOnesValue();
+
+ return false;
}
/// \returns A non-excepting divisor constant for this type.
@@ -171,6 +175,10 @@ struct CachingVPExpander {
Value *expandPredicationInReduction(IRBuilder<> &Builder,
VPReductionIntrinsic &PI);
+ /// \brief Lower this VP memory operation to a non-VP intrinsic.
+ Value *expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+ VPIntrinsic &VPI);
+
/// \brief Query TTI and expand the vector predication in \p P accordingly.
Value *expandPredication(VPIntrinsic &PI);
@@ -389,6 +397,71 @@ CachingVPExpander::expandPredicationInReduction(IRBuilder<> &Builder,
return Reduction;
}
+Value *
+CachingVPExpander::expandPredicationInMemoryIntrinsic(IRBuilder<> &Builder,
+ VPIntrinsic &VPI) {
+ assert(VPI.canIgnoreVectorLengthParam());
+
+ const auto &DL = F.getParent()->getDataLayout();
+
+ Value *MaskParam = VPI.getMaskParam();
+ Value *PtrParam = VPI.getMemoryPointerParam();
+ Value *DataParam = VPI.getMemoryDataParam();
+ bool IsUnmasked = isAllTrueMask(MaskParam);
+
+ MaybeAlign AlignOpt = VPI.getPointerAlignment();
+
+ Value *NewMemoryInst = nullptr;
+ switch (VPI.getIntrinsicID()) {
+ default:
+ llvm_unreachable("Not a VP memory intrinsic");
+ case Intrinsic::vp_store:
+ if (IsUnmasked) {
+ StoreInst *NewStore =
+ Builder.CreateStore(DataParam, PtrParam, /*IsVolatile*/ false);
+ if (AlignOpt.has_value())
+ NewStore->setAlignment(AlignOpt.value());
+ NewMemoryInst = NewStore;
+ } else
+ NewMemoryInst = Builder.CreateMaskedStore(
+ DataParam, PtrParam, AlignOpt.valueOrOne(), MaskParam);
+
+ break;
+ case Intrinsic::vp_load:
+ if (IsUnmasked) {
+ LoadInst *NewLoad =
+ Builder.CreateLoad(VPI.getType(), PtrParam, /*IsVolatile*/ false);
+ if (AlignOpt.has_value())
+ NewLoad->setAlignment(AlignOpt.value());
+ NewMemoryInst = NewLoad;
+ } else
+ NewMemoryInst = Builder.CreateMaskedLoad(
+ VPI.getType(), PtrParam, AlignOpt.valueOrOne(), MaskParam);
+
+ break;
+ case Intrinsic::vp_scatter: {
+ auto *ElementType =
+ cast<VectorType>(DataParam->getType())->getElementType();
+ NewMemoryInst = Builder.CreateMaskedScatter(
+ DataParam, PtrParam,
+ AlignOpt.value_or(DL.getPrefTypeAlign(ElementType)), MaskParam);
+ break;
+ }
+ case Intrinsic::vp_gather: {
+ auto *ElementType = cast<VectorType>(VPI.getType())->getElementType();
+ NewMemoryInst = Builder.CreateMaskedGather(
+ VPI.getType(), PtrParam,
+ AlignOpt.value_or(DL.getPrefTypeAlign(ElementType)), MaskParam, nullptr,
+ VPI.getName());
+ break;
+ }
+ }
+
+ assert(NewMemoryInst);
+ replaceOperation(*NewMemoryInst, VPI);
+ return NewMemoryInst;
+}
+
void CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) {
LLVM_DEBUG(dbgs() << "Discard EVL parameter in " << VPI << "\n");
@@ -465,6 +538,16 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) {
if (auto *VPRI = dyn_cast<VPReductionIntrinsic>(&VPI))
return expandPredicationInReduction(Builder, *VPRI);
+ switch (VPI.getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::vp_load:
+ case Intrinsic::vp_store:
+ case Intrinsic::vp_gather:
+ case Intrinsic::vp_scatter:
+ return expandPredicationInMemoryIntrinsic(Builder, VPI);
+ }
+
return &VPI;
}
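The expansion above relies on two facts that canIgnoreVectorLengthParam() and isAllTrueMask() establish: once the effective vector length covers every lane and the mask is all true, a vp.load or vp.store touches exactly the same bytes as an ordinary contiguous access, so it can be emitted as a plain (or, with an unknown mask, a masked) load/store. An illustrative scalar reference model of the vp.load semantics, not LLVM code:

#include <cstddef>
#include <vector>

// Illustrative scalar model of llvm.vp.load: lanes at or past the explicit
// vector length, or with a false mask bit, are not accessed at all.
std::vector<float> vp_load_ref(const float *Ptr, const std::vector<bool> &Mask,
                               std::size_t EVL, std::size_t VL) {
  std::vector<float> Result(VL, 0.0f); // 0.0f stands in for "undefined"
  for (std::size_t I = 0; I < VL; ++I)
    if (I < EVL && Mask[I])
      Result[I] = Ptr[I];
  return Result;
}
// With EVL == VL (canIgnoreVectorLengthParam) and an all-true mask every lane
// is read contiguously, which is why the expansion can fall back to a plain
// aligned load; with only the mask unknown it becomes a masked load instead.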
diff --git a/llvm/lib/CodeGen/FaultMaps.cpp b/llvm/lib/CodeGen/FaultMaps.cpp
index 3ec666227651..3f8fe2402d65 100644
--- a/llvm/lib/CodeGen/FaultMaps.cpp
+++ b/llvm/lib/CodeGen/FaultMaps.cpp
@@ -85,7 +85,7 @@ void FaultMaps::emitFunctionInfo(const MCSymbol *FnLabel,
OS.emitInt32(0); // Reserved
- for (auto &Fault : FFI) {
+ for (const auto &Fault : FFI) {
LLVM_DEBUG(dbgs() << WFMP << " fault type: "
<< faultTypeToString(Fault.Kind) << "\n");
OS.emitInt32(Fault.Kind);
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp
index ac140e745600..6a0d1c33d3e3 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp
@@ -319,7 +319,7 @@ const GISelInstProfileBuilder &
GISelInstProfileBuilder::addNodeID(const MachineInstr *MI) const {
addNodeIDMBB(MI->getParent());
addNodeIDOpcode(MI->getOpcode());
- for (auto &Op : MI->operands())
+ for (const auto &Op : MI->operands())
addNodeIDMachineOperand(Op);
addNodeIDFlag(MI->getFlags());
return *this;
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index b06043fb4c31..6c36c6445c65 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -116,7 +116,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
// we'll pass to the assigner function.
unsigned i = 0;
unsigned NumFixedArgs = CB.getFunctionType()->getNumParams();
- for (auto &Arg : CB.args()) {
+ for (const auto &Arg : CB.args()) {
ArgInfo OrigArg{ArgRegs[i], *Arg.get(), i, getAttributesForArgIdx(CB, i),
i < NumFixedArgs};
setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CB);
@@ -960,7 +960,7 @@ bool CallLowering::parametersInCSRMatch(
const SmallVectorImpl<CCValAssign> &OutLocs,
const SmallVectorImpl<ArgInfo> &OutArgs) const {
for (unsigned i = 0; i < OutLocs.size(); ++i) {
- auto &ArgLoc = OutLocs[i];
+ const auto &ArgLoc = OutLocs[i];
// If it's not a register, it's fine.
if (!ArgLoc.isRegLoc())
continue;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index ad0c0c8315dc..da054b9c14fb 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2385,7 +2385,7 @@ bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1,
// loading from. To be safe, let's just assume that all loads and stores
// are different (unless we have something which is guaranteed to not
// change.)
- if (I1->mayLoadOrStore() && !I1->isDereferenceableInvariantLoad(nullptr))
+ if (I1->mayLoadOrStore() && !I1->isDereferenceableInvariantLoad())
return false;
// If both instructions are loads or stores, they are equal only if both
@@ -2396,7 +2396,7 @@ bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1,
if (!LS1 || !LS2)
return false;
- if (!I2->isDereferenceableInvariantLoad(nullptr) ||
+ if (!I2->isDereferenceableInvariantLoad() ||
(LS1->getMemSizeInBits() != LS2->getMemSizeInBits()))
return false;
}
@@ -4800,24 +4800,22 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
auto BuildUDIVPattern = [&](const Constant *C) {
auto *CI = cast<ConstantInt>(C);
const APInt &Divisor = CI->getValue();
- UnsignedDivisonByConstantInfo magics =
- UnsignedDivisonByConstantInfo::get(Divisor);
+ UnsignedDivisionByConstantInfo magics =
+ UnsignedDivisionByConstantInfo::get(Divisor);
unsigned PreShift = 0, PostShift = 0;
// If the divisor is even, we can avoid using the expensive fixup by
// shifting the divided value upfront.
- if (magics.IsAdd != 0 && !Divisor[0]) {
+ if (magics.IsAdd && !Divisor[0]) {
PreShift = Divisor.countTrailingZeros();
// Get magic number for the shifted divisor.
magics =
- UnsignedDivisonByConstantInfo::get(Divisor.lshr(PreShift), PreShift);
- assert(magics.IsAdd == 0 && "Should use cheap fixup now");
+ UnsignedDivisionByConstantInfo::get(Divisor.lshr(PreShift), PreShift);
+ assert(!magics.IsAdd && "Should use cheap fixup now");
}
- APInt Magic = magics.Magic;
-
unsigned SelNPQ;
- if (magics.IsAdd == 0 || Divisor.isOneValue()) {
+ if (!magics.IsAdd || Divisor.isOneValue()) {
assert(magics.ShiftAmount < Divisor.getBitWidth() &&
"We shouldn't generate an undefined shift!");
PostShift = magics.ShiftAmount;
@@ -4829,7 +4827,7 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
PreShifts.push_back(
MIB.buildConstant(ScalarShiftAmtTy, PreShift).getReg(0));
- MagicFactors.push_back(MIB.buildConstant(ScalarTy, Magic).getReg(0));
+ MagicFactors.push_back(MIB.buildConstant(ScalarTy, magics.Magic).getReg(0));
NPQFactors.push_back(
MIB.buildConstant(ScalarTy,
SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
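buildUDivUsingMul above turns an unsigned division by a constant into a multiply by a precomputed magic number plus shifts; when the divisor is even and the magic would otherwise need the expensive add fixup (magics.IsAdd), the dividend is pre-shifted by the divisor's trailing zeros and the magic is recomputed for the shifted divisor. A self-contained 32-bit example of that pre-shift trick for dividing by 6, using the well-known magic for 3 (the constant is illustrative, not taken from the pass):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Instead of a magic multiply for 6 that needs the add fixup, divide out the
// even factor first (countTrailingZeros(6) == 1) and use the cheap magic for
// the shifted, odd divisor 3. 0xAAAAAAAB == ceil(2^33 / 3).
static uint32_t udiv6(uint32_t N) {
  uint32_t PreShifted = N >> 1; // divide out the even factor up front
  return (uint32_t)(((uint64_t)PreShifted * 0xAAAAAAABu) >> 33);
}

int main() {
  for (uint32_t N : {0u, 1u, 5u, 6u, 7u, 12345u, 0xFFFFFFFFu})
    assert(udiv6(N) == N / 6);
  return 0;
}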
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 947facc87b71..dbdcfe0b6f0b 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -166,8 +167,10 @@ void IRTranslator::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<StackProtector>();
AU.addRequired<TargetPassConfig>();
AU.addRequired<GISelCSEAnalysisWrapperPass>();
- if (OptLevel != CodeGenOpt::None)
+ if (OptLevel != CodeGenOpt::None) {
AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ }
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addPreserved<TargetLibraryInfoWrapperPass>();
getSelectionDAGFallbackAnalysisUsage(AU);
@@ -684,7 +687,7 @@ bool IRTranslator::translateSwitch(const User &U, MachineIRBuilder &MIB) {
BranchProbabilityInfo *BPI = FuncInfo.BPI;
CaseClusterVector Clusters;
Clusters.reserve(SI.getNumCases());
- for (auto &I : SI.cases()) {
+ for (const auto &I : SI.cases()) {
MachineBasicBlock *Succ = &getMBB(*I.getCaseSuccessor());
assert(Succ && "Could not find successor mbb in mapping");
const ConstantInt *CaseVal = I.getCaseValue();
@@ -1275,26 +1278,41 @@ static bool isSwiftError(const Value *V) {
bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
const LoadInst &LI = cast<LoadInst>(U);
- if (DL->getTypeStoreSize(LI.getType()) == 0)
+
+ unsigned StoreSize = DL->getTypeStoreSize(LI.getType());
+ if (StoreSize == 0)
return true;
ArrayRef<Register> Regs = getOrCreateVRegs(LI);
ArrayRef<uint64_t> Offsets = *VMap.getOffsets(LI);
Register Base = getOrCreateVReg(*LI.getPointerOperand());
+ AAMDNodes AAInfo = LI.getAAMetadata();
- Type *OffsetIRTy = DL->getIntPtrType(LI.getPointerOperandType());
+ const Value *Ptr = LI.getPointerOperand();
+ Type *OffsetIRTy = DL->getIntPtrType(Ptr->getType());
LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
- if (CLI->supportSwiftError() && isSwiftError(LI.getPointerOperand())) {
+ if (CLI->supportSwiftError() && isSwiftError(Ptr)) {
assert(Regs.size() == 1 && "swifterror should be single pointer");
- Register VReg = SwiftError.getOrCreateVRegUseAt(&LI, &MIRBuilder.getMBB(),
- LI.getPointerOperand());
+ Register VReg =
+ SwiftError.getOrCreateVRegUseAt(&LI, &MIRBuilder.getMBB(), Ptr);
MIRBuilder.buildCopy(Regs[0], VReg);
return true;
}
auto &TLI = *MF->getSubtarget().getTargetLowering();
MachineMemOperand::Flags Flags = TLI.getLoadMemOperandFlags(LI, *DL);
+ if (AA && !(Flags & MachineMemOperand::MOInvariant)) {
+ if (AA->pointsToConstantMemory(
+ MemoryLocation(Ptr, LocationSize::precise(StoreSize), AAInfo))) {
+ Flags |= MachineMemOperand::MOInvariant;
+
+ // FIXME: pointsToConstantMemory probably does not imply dereferenceable,
+ // but the previous usage implied it did. Probably should check
+ // isDereferenceableAndAlignedPointer.
+ Flags |= MachineMemOperand::MODereferenceable;
+ }
+ }
const MDNode *Ranges =
Regs.size() == 1 ? LI.getMetadata(LLVMContext::MD_range) : nullptr;
@@ -1306,7 +1324,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
Align BaseAlign = getMemOpAlign(LI);
auto MMO = MF->getMachineMemOperand(
Ptr, Flags, MRI->getType(Regs[i]),
- commonAlignment(BaseAlign, Offsets[i] / 8), LI.getAAMetadata(), Ranges,
+ commonAlignment(BaseAlign, Offsets[i] / 8), AAInfo, Ranges,
LI.getSyncScopeID(), LI.getOrdering());
MIRBuilder.buildLoad(Regs[i], Addr, *MMO);
}
@@ -1400,7 +1418,7 @@ bool IRTranslator::translateInsertValue(const User &U,
ArrayRef<uint64_t> DstOffsets = *VMap.getOffsets(U);
ArrayRef<Register> SrcRegs = getOrCreateVRegs(*Src);
ArrayRef<Register> InsertedRegs = getOrCreateVRegs(*U.getOperand(1));
- auto InsertedIt = InsertedRegs.begin();
+ auto *InsertedIt = InsertedRegs.begin();
for (unsigned i = 0; i < DstRegs.size(); ++i) {
if (DstOffsets[i] >= Offset && InsertedIt != InsertedRegs.end())
@@ -1563,9 +1581,9 @@ bool IRTranslator::translateGetElementPtr(const User &U,
bool IRTranslator::translateMemFunc(const CallInst &CI,
MachineIRBuilder &MIRBuilder,
unsigned Opcode) {
-
+ const Value *SrcPtr = CI.getArgOperand(1);
// If the source is undef, then just emit a nop.
- if (isa<UndefValue>(CI.getArgOperand(1)))
+ if (isa<UndefValue>(SrcPtr))
return true;
SmallVector<Register, 3> SrcRegs;
@@ -1595,15 +1613,20 @@ bool IRTranslator::translateMemFunc(const CallInst &CI,
unsigned IsVol =
cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1))->getZExtValue();
+ ConstantInt *CopySize = nullptr;
+
if (auto *MCI = dyn_cast<MemCpyInst>(&CI)) {
DstAlign = MCI->getDestAlign().valueOrOne();
SrcAlign = MCI->getSourceAlign().valueOrOne();
+ CopySize = dyn_cast<ConstantInt>(MCI->getArgOperand(2));
} else if (auto *MCI = dyn_cast<MemCpyInlineInst>(&CI)) {
DstAlign = MCI->getDestAlign().valueOrOne();
SrcAlign = MCI->getSourceAlign().valueOrOne();
+ CopySize = dyn_cast<ConstantInt>(MCI->getArgOperand(2));
} else if (auto *MMI = dyn_cast<MemMoveInst>(&CI)) {
DstAlign = MMI->getDestAlign().valueOrOne();
SrcAlign = MMI->getSourceAlign().valueOrOne();
+ CopySize = dyn_cast<ConstantInt>(MMI->getArgOperand(2));
} else {
auto *MSI = cast<MemSetInst>(&CI);
DstAlign = MSI->getDestAlign().valueOrOne();
@@ -1617,14 +1640,31 @@ bool IRTranslator::translateMemFunc(const CallInst &CI,
}
// Create mem operands to store the alignment and volatile info.
- auto VolFlag = IsVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
- ICall.addMemOperand(MF->getMachineMemOperand(
- MachinePointerInfo(CI.getArgOperand(0)),
- MachineMemOperand::MOStore | VolFlag, 1, DstAlign));
+ MachineMemOperand::Flags LoadFlags = MachineMemOperand::MOLoad;
+ MachineMemOperand::Flags StoreFlags = MachineMemOperand::MOStore;
+ if (IsVol) {
+ LoadFlags |= MachineMemOperand::MOVolatile;
+ StoreFlags |= MachineMemOperand::MOVolatile;
+ }
+
+ AAMDNodes AAInfo = CI.getAAMetadata();
+ if (AA && CopySize &&
+ AA->pointsToConstantMemory(MemoryLocation(
+ SrcPtr, LocationSize::precise(CopySize->getZExtValue()), AAInfo))) {
+ LoadFlags |= MachineMemOperand::MOInvariant;
+
+ // FIXME: pointsToConstantMemory probably does not imply dereferenceable,
+ // but the previous usage implied it did. Probably should check
+ // isDereferenceableAndAlignedPointer.
+ LoadFlags |= MachineMemOperand::MODereferenceable;
+ }
+
+ ICall.addMemOperand(
+ MF->getMachineMemOperand(MachinePointerInfo(CI.getArgOperand(0)),
+ StoreFlags, 1, DstAlign, AAInfo));
if (Opcode != TargetOpcode::G_MEMSET)
ICall.addMemOperand(MF->getMachineMemOperand(
- MachinePointerInfo(CI.getArgOperand(1)),
- MachineMemOperand::MOLoad | VolFlag, 1, SrcAlign));
+ MachinePointerInfo(SrcPtr), LoadFlags, 1, SrcAlign, AAInfo));
return true;
}
@@ -1785,7 +1825,7 @@ bool IRTranslator::translateSimpleIntrinsic(const CallInst &CI,
// Yes. Let's translate it.
SmallVector<llvm::SrcOp, 4> VRegs;
- for (auto &Arg : CI.args())
+ for (const auto &Arg : CI.args())
VRegs.push_back(getOrCreateVReg(*Arg));
MIRBuilder.buildInstr(Op, {getOrCreateVReg(CI)}, VRegs,
@@ -2305,7 +2345,7 @@ bool IRTranslator::translateCallBase(const CallBase &CB,
SmallVector<ArrayRef<Register>, 8> Args;
Register SwiftInVReg = 0;
Register SwiftErrorVReg = 0;
- for (auto &Arg : CB.args()) {
+ for (const auto &Arg : CB.args()) {
if (CLI->supportSwiftError() && isSwiftError(Arg)) {
assert(SwiftInVReg == 0 && "Expected only one swift error argument");
LLT Ty = getLLTForType(*Arg->getType(), *DL);
@@ -2394,7 +2434,7 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
if (isa<FPMathOperator>(CI))
MIB->copyIRFlags(CI);
- for (auto &Arg : enumerate(CI.args())) {
+ for (const auto &Arg : enumerate(CI.args())) {
// If this is required to be an immediate, don't materialize it in a
// register.
if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) {
@@ -2947,7 +2987,7 @@ void IRTranslator::finishPendingPhis() {
for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) {
auto IRPred = PI->getIncomingBlock(i);
ArrayRef<Register> ValRegs = getOrCreateVRegs(*PI->getIncomingValue(i));
- for (auto Pred : getMachinePredBBs({IRPred, PI->getParent()})) {
+ for (auto *Pred : getMachinePredBBs({IRPred, PI->getParent()})) {
if (SeenPreds.count(Pred) || !PhiMBB->isPredecessor(Pred))
continue;
SeenPreds.insert(Pred);
@@ -3347,10 +3387,13 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
TM.resetTargetOptions(F);
EnableOpts = OptLevel != CodeGenOpt::None && !skipFunction(F);
FuncInfo.MF = MF;
- if (EnableOpts)
+ if (EnableOpts) {
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
FuncInfo.BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
- else
+ } else {
+ AA = nullptr;
FuncInfo.BPI = nullptr;
+ }
FuncInfo.CanLowerReturn = CLI->checkReturnTypeForCallConv(*MF);
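Both translateLoad and translateMemFunc above consult alias analysis the same way: if AA proves the accessed bytes are constant memory, the machine memory operand is marked MOInvariant (and, per the FIXME, MODereferenceable). A hedged sketch of that flag derivation as a standalone helper (the helper name is invented; AAResults, MemoryLocation and MachineMemOperand are the standard APIs):

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include <cstdint>

// Only add the flags when AA proves the accessed bytes are constant memory.
// Adding MODereferenceable mirrors the FIXME in the patch, not a guarantee
// made by pointsToConstantMemory itself.
static llvm::MachineMemOperand::Flags
addConstantMemFlags(llvm::MachineMemOperand::Flags Flags, llvm::AAResults *AA,
                    const llvm::Value *Ptr, uint64_t Size,
                    const llvm::AAMDNodes &AAInfo) {
  if (!AA || (Flags & llvm::MachineMemOperand::MOInvariant))
    return Flags;
  llvm::MemoryLocation Loc(Ptr, llvm::LocationSize::precise(Size), AAInfo);
  if (AA->pointsToConstantMemory(Loc))
    Flags |= llvm::MachineMemOperand::MOInvariant |
             llvm::MachineMemOperand::MODereferenceable;
  return Flags;
}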
diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
index 95ae8383b6fa..e0357c50e555 100644
--- a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -332,6 +332,8 @@ bool InlineAsmLowering::lowerInlineAsm(
}
++ResNo;
} else {
+ assert(OpInfo.Type != InlineAsm::isLabel &&
+ "GlobalISel currently doesn't support callbr");
OpInfo.ConstraintVT = MVT::Other;
}
@@ -427,7 +429,8 @@ bool InlineAsmLowering::lowerInlineAsm(
}
break;
- case InlineAsm::isInput: {
+ case InlineAsm::isInput:
+ case InlineAsm::isLabel: {
if (OpInfo.isMatchingInputConstraint()) {
unsigned DefIdx = OpInfo.getMatchedOperand();
// Find operand with register def that corresponds to DefIdx.
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index fb046d519ac8..52ee13757f27 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -2393,30 +2393,14 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
return Legalized;
}
case TargetOpcode::G_FCONSTANT: {
+ // To avoid changing the bits of the constant due to extension to a larger
+ // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
MachineOperand &SrcMO = MI.getOperand(1);
- LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
- APFloat Val = SrcMO.getFPImm()->getValueAPF();
- bool LosesInfo;
- switch (WideTy.getSizeInBits()) {
- case 32:
- Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
- &LosesInfo);
- break;
- case 64:
- Val.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
- &LosesInfo);
- break;
- default:
- return UnableToLegalize;
- }
-
- assert(!LosesInfo && "extend should always be lossless");
-
- Observer.changingInstr(MI);
- SrcMO.setFPImm(ConstantFP::get(Ctx, Val));
-
- widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
- Observer.changedInstr(MI);
+ APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
+ MIRBuilder.setInstrAndDebugLoc(MI);
+ auto IntCst = MIRBuilder.buildConstant(MI.getOperand(0).getReg(), Val);
+ widenScalarDst(*IntCst, WideTy, 0, TargetOpcode::G_TRUNC);
+ MI.eraseFromParent();
return Legalized;
}
case TargetOpcode::G_IMPLICIT_DEF: {
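The new G_FCONSTANT widening above treats the constant as its raw bit pattern (bitcastToAPInt), materializes it as an integer G_CONSTANT in the wide type, and truncates back, instead of converting the APFloat to a wider format and relying on G_FPTRUNC. A tiny standalone illustration (not LLVM code) of why handling the bits as an integer is exactly lossless:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  float F = 0.1f; // not exactly representable in binary floating point
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof Bits);        // bitcastToAPInt() analogue
  uint64_t Widened = Bits;                    // G_CONSTANT in the wider type
  uint32_t Truncated = (uint32_t)Widened;     // G_TRUNC back down
  std::printf("%08x %08x\n", (unsigned)Bits, (unsigned)Truncated);
  return Bits == Truncated ? 0 : 1;           // identical bit patterns
}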
diff --git a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
index d4fbf7d15089..be1bc865d1e1 100644
--- a/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LoadStoreOpt.cpp
@@ -298,7 +298,7 @@ bool LoadStoreOpt::mergeStores(SmallVectorImpl<GStore *> &StoresToMerge) {
const auto &LegalSizes = LegalStoreSizes[AS];
#ifndef NDEBUG
- for (auto StoreMI : StoresToMerge)
+ for (auto *StoreMI : StoresToMerge)
assert(MRI->getType(StoreMI->getValueReg()) == OrigTy);
#endif
@@ -366,7 +366,7 @@ bool LoadStoreOpt::doSingleStoreMerge(SmallVectorImpl<GStore *> &Stores) {
// directly. Otherwise, we need to generate some instructions to merge the
// existing values together into a wider type.
SmallVector<APInt, 8> ConstantVals;
- for (auto Store : Stores) {
+ for (auto *Store : Stores) {
auto MaybeCst =
getIConstantVRegValWithLookThrough(Store->getValueReg(), *MRI);
if (!MaybeCst) {
@@ -415,7 +415,7 @@ bool LoadStoreOpt::doSingleStoreMerge(SmallVectorImpl<GStore *> &Stores) {
return R;
});
- for (auto MI : Stores)
+ for (auto *MI : Stores)
InstsToErase.insert(MI);
return true;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 0d9580e25606..2e22dae35e5a 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -646,7 +646,7 @@ MachineIRBuilder::buildBuildVectorConstant(const DstOp &Res,
SmallVector<SrcOp> TmpVec;
TmpVec.reserve(Ops.size());
LLT EltTy = Res.getLLTTy(*getMRI()).getElementType();
- for (auto &Op : Ops)
+ for (const auto &Op : Ops)
TmpVec.push_back(buildConstant(EltTy, Op));
return buildInstr(TargetOpcode::G_BUILD_VECTOR, Res, TmpVec);
}
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 7781761bc131..013c8700e8ae 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -228,7 +228,7 @@ bool llvm::isTriviallyDead(const MachineInstr &MI,
return false;
// Instructions without side-effects are dead iff they only define dead vregs.
- for (auto &MO : MI.operands()) {
+ for (const auto &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp
index 67d6a3df7807..258ad1931b12 100644
--- a/llvm/lib/CodeGen/HardwareLoops.cpp
+++ b/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -332,7 +332,7 @@ void HardwareLoop::Create() {
// Run through the basic blocks of the loop and see if any of them have dead
// PHIs that can be removed.
- for (auto I : L->blocks())
+ for (auto *I : L->blocks())
DeleteDeadPHIs(I);
}
@@ -407,13 +407,13 @@ Value *HardwareLoop::InitLoopCount() {
BasicBlock *Predecessor = BB->getSinglePredecessor();
// If it's not safe to create a while loop then don't force it and create a
// do-while loop instead
- if (!isSafeToExpandAt(ExitCount, Predecessor->getTerminator(), SE))
+ if (!SCEVE.isSafeToExpandAt(ExitCount, Predecessor->getTerminator()))
UseLoopGuard = false;
else
BB = Predecessor;
}
- if (!isSafeToExpandAt(ExitCount, BB->getTerminator(), SE)) {
+ if (!SCEVE.isSafeToExpandAt(ExitCount, BB->getTerminator())) {
LLVM_DEBUG(dbgs() << "- Bailing, unsafe to expand ExitCount "
<< *ExitCount << "\n");
return nullptr;
diff --git a/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/llvm/lib/CodeGen/ImplicitNullChecks.cpp
index fc97938ccd3e..da6ec76bd770 100644
--- a/llvm/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/llvm/lib/CodeGen/ImplicitNullChecks.cpp
@@ -758,7 +758,7 @@ void ImplicitNullChecks::rewriteNullChecks(
ArrayRef<ImplicitNullChecks::NullCheck> NullCheckList) {
DebugLoc DL;
- for (auto &NC : NullCheckList) {
+ for (const auto &NC : NullCheckList) {
// Remove the conditional branch dependent on the null check.
unsigned BranchesRemoved = TII->removeBranch(*NC.getCheckBlock());
(void)BranchesRemoved;
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 06c660807c5c..3ea1d6c7f1ef 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -86,7 +86,6 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate {
MachineFunction &MF;
LiveIntervals &LIS;
LiveStacks &LSS;
- AliasAnalysis *AA;
MachineDominatorTree &MDT;
MachineLoopInfo &Loops;
VirtRegMap &VRM;
@@ -140,7 +139,6 @@ public:
VirtRegMap &vrm)
: MF(mf), LIS(pass.getAnalysis<LiveIntervals>()),
LSS(pass.getAnalysis<LiveStacks>()),
- AA(&pass.getAnalysis<AAResultsWrapperPass>().getAAResults()),
MDT(pass.getAnalysis<MachineDominatorTree>()),
Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm),
MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()),
@@ -159,7 +157,6 @@ class InlineSpiller : public Spiller {
MachineFunction &MF;
LiveIntervals &LIS;
LiveStacks &LSS;
- AliasAnalysis *AA;
MachineDominatorTree &MDT;
MachineLoopInfo &Loops;
VirtRegMap &VRM;
@@ -200,7 +197,6 @@ public:
VirtRegAuxInfo &VRAI)
: MF(MF), LIS(Pass.getAnalysis<LiveIntervals>()),
LSS(Pass.getAnalysis<LiveStacks>()),
- AA(&Pass.getAnalysis<AAResultsWrapperPass>().getAAResults()),
MDT(Pass.getAnalysis<MachineDominatorTree>()),
Loops(Pass.getAnalysis<MachineLoopInfo>()), VRM(VRM),
MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()),
@@ -659,7 +655,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
/// reMaterializeAll - Try to rematerialize as many uses as possible,
/// and trim the live ranges after.
void InlineSpiller::reMaterializeAll() {
- if (!Edit->anyRematerializable(AA))
+ if (!Edit->anyRematerializable())
return;
UsedValues.clear();
@@ -702,7 +698,7 @@ void InlineSpiller::reMaterializeAll() {
if (DeadDefs.empty())
return;
LLVM_DEBUG(dbgs() << "Remat created " << DeadDefs.size() << " dead defs.\n");
- Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA);
+ Edit->eliminateDeadDefs(DeadDefs, RegsToSpill);
// LiveRangeEdit::eliminateDeadDef is used to remove dead define instructions
// after rematerialization. To remove a VNI for a vreg from its LiveInterval,
@@ -1180,7 +1176,7 @@ void InlineSpiller::spillAll() {
// Hoisted spills may cause dead code.
if (!DeadDefs.empty()) {
LLVM_DEBUG(dbgs() << "Eliminating " << DeadDefs.size() << " dead defs\n");
- Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA);
+ Edit->eliminateDeadDefs(DeadDefs, RegsToSpill);
}
// Finally delete the SnippetCopies.
@@ -1298,7 +1294,7 @@ void HoistSpillHelper::rmRedundantSpills(
// For each spill saw, check SpillBBToSpill[] and see if its BB already has
// another spill inside. If a BB contains more than one spill, only keep the
// earlier spill with smaller SlotIndex.
- for (const auto CurrentSpill : Spills) {
+ for (auto *const CurrentSpill : Spills) {
MachineBasicBlock *Block = CurrentSpill->getParent();
MachineDomTreeNode *Node = MDT.getBase().getNode(Block);
MachineInstr *PrevSpill = SpillBBToSpill[Node];
@@ -1313,7 +1309,7 @@ void HoistSpillHelper::rmRedundantSpills(
SpillBBToSpill[MDT.getBase().getNode(Block)] = CurrentSpill;
}
}
- for (const auto SpillToRm : SpillsToRm)
+ for (auto *const SpillToRm : SpillsToRm)
Spills.erase(SpillToRm);
}
@@ -1347,7 +1343,7 @@ void HoistSpillHelper::getVisitOrders(
// the path starting from the first node with non-redundant spill to the Root
// node will be added to the WorkSet, which will contain all the possible
// locations where spills may be hoisted to after the loop below is done.
- for (const auto Spill : Spills) {
+ for (auto *const Spill : Spills) {
MachineBasicBlock *Block = Spill->getParent();
MachineDomTreeNode *Node = MDT[Block];
MachineInstr *SpillToRm = nullptr;
@@ -1492,7 +1488,7 @@ void HoistSpillHelper::runHoistSpills(
: BranchProbability(1, 1);
if (SubTreeCost > MBFI.getBlockFreq(Block) * MarginProb) {
// Hoist: Move spills to current Block.
- for (const auto SpillBB : SpillsInSubTree) {
+ for (auto *const SpillBB : SpillsInSubTree) {
// When SpillBB is a BB contains original spill, insert the spill
// to SpillsToRm.
if (SpillsToKeep.find(SpillBB) != SpillsToKeep.end() &&
@@ -1609,7 +1605,7 @@ void HoistSpillHelper::hoistAllSpills() {
// Remove redundant spills or change them to dead instructions.
NumSpills -= SpillsToRm.size();
- for (auto const RMEnt : SpillsToRm) {
+ for (auto *const RMEnt : SpillsToRm) {
RMEnt->setDesc(TII.get(TargetOpcode::KILL));
for (unsigned i = RMEnt->getNumOperands(); i; --i) {
MachineOperand &MO = RMEnt->getOperand(i - 1);
@@ -1617,7 +1613,7 @@ void HoistSpillHelper::hoistAllSpills() {
RMEnt->removeOperand(i - 1);
}
}
- Edit.eliminateDeadDefs(SpillsToRm, None, AA);
+ Edit.eliminateDeadDefs(SpillsToRm, None);
}
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 55f3ad796291..0582378be4cd 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -541,7 +541,7 @@ bool InterleavedAccess::runOnFunction(Function &F) {
Changed |= lowerInterleavedStore(SI, DeadInsts);
}
- for (auto I : DeadInsts)
+ for (auto *I : DeadInsts)
I->eraseFromParent();
return Changed;
diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
index 43858071025a..a0f304659bca 100644
--- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -528,8 +528,8 @@ public:
if (B.size() != o.B.size())
return false;
- auto ob = o.B.begin();
- for (auto &b : B) {
+ auto *ob = o.B.begin();
+ for (const auto &b : B) {
if (b != *ob)
return false;
ob++;
@@ -1154,7 +1154,7 @@ bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
// Test if all participating instruction will be dead after the
// transformation. If intermediate results are used, no performance gain can
// be expected. Also sum the cost of the Instructions being left dead.
- for (auto &I : Is) {
+ for (const auto &I : Is) {
// Compute the old cost
InstructionCost += TTI.getInstructionCost(I, CostKind);
@@ -1182,7 +1182,7 @@ bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
// that the corresponding defining access dominates first LI. This guarantees
// that there are no aliasing stores in between the loads.
auto FMA = MSSA.getMemoryAccess(First);
- for (auto LI : LIs) {
+ for (auto *LI : LIs) {
auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
if (!MSSA.dominates(MADef, FMA))
return false;
diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
index 43c12c67939e..ef49d3888f2b 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp
@@ -284,7 +284,7 @@ public:
// Initialized the preferred-location map with illegal locations, to be
// filled in later.
- for (auto &VLoc : VLocs)
+ for (const auto &VLoc : VLocs)
if (VLoc.second.Kind == DbgValue::Def)
ValueToLoc.insert({VLoc.second.ID, LocIdx::MakeIllegalLoc()});
@@ -507,7 +507,7 @@ public:
// date. Wipe old tracking data for the location if it's been clobbered in
// the meantime.
if (MTracker->readMLoc(NewLoc) != VarLocs[NewLoc.asU64()]) {
- for (auto &P : ActiveMLocs[NewLoc]) {
+ for (const auto &P : ActiveMLocs[NewLoc]) {
ActiveVLocs.erase(P);
}
ActiveMLocs[NewLoc.asU64()].clear();
@@ -560,7 +560,7 @@ public:
// explicitly undef, then stop here.
if (!NewLoc && !MakeUndef) {
// Try and recover a few more locations with entry values.
- for (auto &Var : ActiveMLocIt->second) {
+ for (const auto &Var : ActiveMLocIt->second) {
auto &Prop = ActiveVLocs.find(Var)->second.Properties;
recoverAsEntryValue(Var, Prop, OldValue);
}
@@ -570,7 +570,7 @@ public:
// Examine all the variables based on this location.
DenseSet<DebugVariable> NewMLocs;
- for (auto &Var : ActiveMLocIt->second) {
+ for (const auto &Var : ActiveMLocIt->second) {
auto ActiveVLocIt = ActiveVLocs.find(Var);
// Re-state the variable location: if there's no replacement then NewLoc
// is None and a $noreg DBG_VALUE will be created. Otherwise, a DBG_VALUE
@@ -623,7 +623,7 @@ public:
VarLocs[Dst.asU64()] = VarLocs[Src.asU64()];
// For each variable based on Src; create a location at Dst.
- for (auto &Var : MovingVars) {
+ for (const auto &Var : MovingVars) {
auto ActiveVLocIt = ActiveVLocs.find(Var);
assert(ActiveVLocIt != ActiveVLocs.end());
ActiveVLocIt->second.Loc = Dst;
@@ -1224,7 +1224,7 @@ bool InstrRefBasedLDV::transferDebugInstrRef(MachineInstr &MI,
// FIXME: no index for this?
Register Reg = MTracker->LocIdxToLocID[L];
const TargetRegisterClass *TRC = nullptr;
- for (auto *TRCI : TRI->regclasses())
+ for (const auto *TRCI : TRI->regclasses())
if (TRCI->contains(Reg))
TRC = TRCI;
assert(TRC && "Couldn't find target register class?");
@@ -1454,7 +1454,7 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) {
for (uint32_t DeadReg : DeadRegs)
MTracker->defReg(DeadReg, CurBB, CurInst);
- for (auto *MO : RegMaskPtrs)
+ for (const auto *MO : RegMaskPtrs)
MTracker->writeRegMask(MO, CurBB, CurInst);
// If this instruction writes to a spill slot, def that slot.
@@ -1493,7 +1493,7 @@ void InstrRefBasedLDV::transferRegisterDef(MachineInstr &MI) {
if (IgnoreSPAlias(Reg))
continue;
- for (auto *MO : RegMaskPtrs)
+ for (const auto *MO : RegMaskPtrs)
if (MO->clobbersPhysReg(Reg))
TTracker->clobberMloc(L.Idx, MI.getIterator(), false);
}
@@ -1822,7 +1822,7 @@ void InstrRefBasedLDV::accumulateFragmentMap(MachineInstr &MI) {
// Otherwise, examine all other seen fragments for this variable, with "this"
// fragment being a previously unseen fragment. Record any pair of
// overlapping fragments.
- for (auto &ASeenFragment : AllSeenFragments) {
+ for (const auto &ASeenFragment : AllSeenFragments) {
// Does this previously seen fragment overlap?
if (DIExpression::fragmentsOverlap(ThisFragment, ASeenFragment)) {
// Yes: Mark the current fragment as being overlapped.
@@ -1993,7 +1993,7 @@ bool InstrRefBasedLDV::mlocJoin(
// redundant PHI that we can eliminate.
SmallVector<const MachineBasicBlock *, 8> BlockOrders;
- for (auto Pred : MBB.predecessors())
+ for (auto *Pred : MBB.predecessors())
BlockOrders.push_back(Pred);
// Visit predecessors in RPOT order.
@@ -2313,7 +2313,7 @@ void InstrRefBasedLDV::buildMLocValueMap(
// All successors should be visited: put any back-edges on the pending
// list for the next pass-through, and any other successors to be
// visited this pass, if they're not going to be already.
- for (auto s : MBB->successors()) {
+ for (auto *s : MBB->successors()) {
// Does branching to this successor represent a back-edge?
if (BBToOrder[s] > BBToOrder[MBB]) {
// No: visit it during this dataflow iteration.
@@ -2367,7 +2367,7 @@ Optional<ValueIDNum> InstrRefBasedLDV::pickVPHILoc(
if (BlockOrders.empty())
return None;
- for (auto p : BlockOrders) {
+ for (const auto *p : BlockOrders) {
unsigned ThisBBNum = p->getNumber();
auto OutValIt = LiveOuts.find(p);
if (OutValIt == LiveOuts.end())
@@ -2422,7 +2422,7 @@ Optional<ValueIDNum> InstrRefBasedLDV::pickVPHILoc(
// Check that all properties are the same. We can't pick a location if they're
// not.
const DbgValueProperties *Properties0 = Properties[0];
- for (auto *Prop : Properties)
+ for (const auto *Prop : Properties)
if (*Prop != *Properties0)
return None;
@@ -2472,7 +2472,7 @@ bool InstrRefBasedLDV::vlocJoin(
SmallVector<InValueT, 8> Values;
bool Bail = false;
int BackEdgesStart = 0;
- for (auto p : BlockOrders) {
+ for (auto *p : BlockOrders) {
// If the predecessor isn't in scope / to be explored, we'll never be
// able to join any locations.
if (!BlocksToExplore.contains(p)) {
@@ -2577,7 +2577,7 @@ void InstrRefBasedLDV::getBlocksForScope(
// instructions in scope at all. To accurately replicate VarLoc
// LiveDebugValues, this means exploring all artificial successors too.
// Perform a depth-first-search to enumerate those blocks.
- for (auto *MBB : BlocksToExplore) {
+ for (const auto *MBB : BlocksToExplore) {
// Depth-first-search state: each node is a block and which successor
// we're currently exploring.
SmallVector<std::pair<const MachineBasicBlock *,
@@ -2662,7 +2662,7 @@ void InstrRefBasedLDV::buildVLocValueMap(
MutBlocksToExplore.insert(const_cast<MachineBasicBlock *>(MBB));
  // Pick out relevant blocks in RPO order and sort them.
- for (auto *MBB : BlocksToExplore)
+ for (const auto *MBB : BlocksToExplore)
BlockOrders.push_back(const_cast<MachineBasicBlock *>(MBB));
llvm::sort(BlockOrders, Cmp);
@@ -2696,7 +2696,7 @@ void InstrRefBasedLDV::buildVLocValueMap(
  // between blocks. This keeps the locality of working on one lexical scope at
  // a time, but avoids re-processing variable values because some other
// variable has been assigned.
- for (auto &Var : VarsWeCareAbout) {
+ for (const auto &Var : VarsWeCareAbout) {
// Re-initialize live-ins and live-outs, to clear the remains of previous
// variables live-ins / live-outs.
for (unsigned int I = 0; I < NumBlocks; ++I) {
@@ -2823,7 +2823,7 @@ void InstrRefBasedLDV::buildVLocValueMap(
// We should visit all successors. Ensure we'll visit any non-backedge
// successors during this dataflow iteration; book backedge successors
// to be visited next time around.
- for (auto s : MBB->successors()) {
+ for (auto *s : MBB->successors()) {
// Ignore out of scope / not-to-be-explored successors.
if (LiveInIdx.find(s) == LiveInIdx.end())
continue;
@@ -2906,7 +2906,7 @@ void InstrRefBasedLDV::placePHIsForSingleVarDefinition(
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void InstrRefBasedLDV::dump_mloc_transfer(
const MLocTransferMap &mloc_transfer) const {
- for (auto &P : mloc_transfer) {
+ for (const auto &P : mloc_transfer) {
std::string foo = MTracker->LocIdxToName(P.first);
std::string bar = MTracker->IDAsString(P.second);
dbgs() << "Loc " << foo << " --> " << bar << "\n";
@@ -2993,7 +2993,7 @@ void InstrRefBasedLDV::makeDepthFirstEjectionMap(
if (DILocationIt != ScopeToDILocation.end()) {
getBlocksForScope(DILocationIt->second, BlocksToExplore,
ScopeToAssignBlocks.find(WS)->second);
- for (auto *MBB : BlocksToExplore) {
+ for (const auto *MBB : BlocksToExplore) {
unsigned BBNum = MBB->getNumber();
if (EjectionMap[BBNum] == 0)
EjectionMap[BBNum] = WS->getDFSOut();
@@ -3100,7 +3100,7 @@ bool InstrRefBasedLDV::depthFirstVLocAndEmit(
getBlocksForScope(DILocationIt->second, BlocksToExplore,
ScopeToAssignBlocks.find(WS)->second);
- for (auto *MBB : BlocksToExplore)
+ for (const auto *MBB : BlocksToExplore)
if (WS->getDFSOut() == EjectionMap[MBB->getNumber()])
EjectBlock(const_cast<MachineBasicBlock &>(*MBB));
@@ -3709,10 +3709,9 @@ Optional<ValueIDNum> InstrRefBasedLDV::resolveDbgPHIsImpl(
for (auto &PHI : CreatedPHIs)
SortedPHIs.push_back(PHI);
- std::sort(
- SortedPHIs.begin(), SortedPHIs.end(), [&](LDVSSAPhi *A, LDVSSAPhi *B) {
- return BBToOrder[&A->getParent()->BB] < BBToOrder[&B->getParent()->BB];
- });
+ llvm::sort(SortedPHIs, [&](LDVSSAPhi *A, LDVSSAPhi *B) {
+ return BBToOrder[&A->getParent()->BB] < BBToOrder[&B->getParent()->BB];
+ });
for (auto &PHI : SortedPHIs) {
ValueIDNum ThisBlockValueNum =
diff --git a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
index 24c00b8a10ec..32e07eb77efe 100644
--- a/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues/VarLocBasedImpl.cpp
@@ -1874,7 +1874,7 @@ void VarLocBasedLDV::accumulateFragmentMap(MachineInstr &MI,
// Otherwise, examine all other seen fragments for this variable, with "this"
// fragment being a previously unseen fragment. Record any pair of
// overlapping fragments.
- for (auto &ASeenFragment : AllSeenFragments) {
+ for (const auto &ASeenFragment : AllSeenFragments) {
// Does this previously seen fragment overlap?
if (DIExpression::fragmentsOverlap(ThisFragment, ASeenFragment)) {
// Yes: Mark the current fragment as being overlapped.
@@ -1922,7 +1922,7 @@ bool VarLocBasedLDV::join(
// For all predecessors of this MBB, find the set of VarLocs that
// can be joined.
int NumVisited = 0;
- for (auto p : MBB.predecessors()) {
+ for (auto *p : MBB.predecessors()) {
// Ignore backedges if we have not visited the predecessor yet. As the
// predecessor hasn't yet had locations propagated into it, most locations
// will not yet be valid, so treat them as all being uninitialized and
@@ -2246,7 +2246,7 @@ bool VarLocBasedLDV::ExtendRanges(MachineFunction &MF,
if (OLChanged) {
OLChanged = false;
- for (auto s : MBB->successors())
+ for (auto *s : MBB->successors())
if (OnPending.insert(s).second) {
Pending.push(BBToOrder[s]);
}
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp
index 35cf25330186..574c0f98161e 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -1891,7 +1891,7 @@ void LDVImpl::emitDebugValues(VirtRegMap *VRM) {
// insert position, insert all instructions at the same SlotIdx. They are
// guaranteed to appear in-sequence in StashedDebugInstrs because we insert
// them in order.
- for (auto StashIt = StashedDebugInstrs.begin();
+ for (auto *StashIt = StashedDebugInstrs.begin();
StashIt != StashedDebugInstrs.end(); ++StashIt) {
SlotIndex Idx = StashIt->Idx;
MachineBasicBlock *MBB = StashIt->MBB;
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 1242ce20b732..8a76048bb8c4 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -19,7 +19,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalCalc.h"
#include "llvm/CodeGen/LiveVariables.h"
@@ -60,9 +59,8 @@ using namespace llvm;
char LiveIntervals::ID = 0;
char &llvm::LiveIntervalsID = LiveIntervals::ID;
-INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals",
- "Live Interval Analysis", false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals", "Live Interval Analysis",
+ false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_END(LiveIntervals, "liveintervals",
@@ -87,8 +85,6 @@ cl::opt<bool> UseSegmentSetForPhysRegs(
void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
AU.addPreserved<LiveVariables>();
AU.addPreservedID(MachineLoopInfoID);
AU.addRequiredTransitiveID(MachineDominatorsID);
@@ -126,7 +122,6 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
MRI = &MF->getRegInfo();
TRI = MF->getSubtarget().getRegisterInfo();
TII = MF->getSubtarget().getInstrInfo();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
Indexes = &getAnalysis<SlotIndexes>();
DomTree = &getAnalysis<MachineDominatorTree>();
@@ -1417,7 +1412,7 @@ private:
NewIdxDef.getRegSlot(), (NewIdxOut + 1)->end, OldIdxVNI);
OldIdxVNI->def = NewIdxDef;
// Modify subsequent segments to be defined by the moved def OldIdxVNI.
- for (auto Idx = NewIdxOut + 2; Idx <= OldIdxOut; ++Idx)
+ for (auto *Idx = NewIdxOut + 2; Idx <= OldIdxOut; ++Idx)
Idx->valno = OldIdxVNI;
// Aggressively remove all dead flags from the former dead definition.
// Kill/dead flags shouldn't be used while live intervals exist; they
@@ -1662,7 +1657,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
ArrayRef<Register> OrigRegs) {
// Find anchor points, which are at the beginning/end of blocks or at
// instructions that already have indexes.
- while (Begin != MBB->begin() && !Indexes->hasIndex(*Begin))
+ while (Begin != MBB->begin() && !Indexes->hasIndex(*std::prev(Begin)))
--Begin;
while (End != MBB->end() && !Indexes->hasIndex(*End))
++End;
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 58eb4110f153..2aafb746aa2c 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -68,17 +68,16 @@ Register LiveRangeEdit::createFrom(Register OldReg) {
}
bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
- const MachineInstr *DefMI,
- AAResults *aa) {
+ const MachineInstr *DefMI) {
assert(DefMI && "Missing instruction");
ScannedRemattable = true;
- if (!TII.isTriviallyReMaterializable(*DefMI, aa))
+ if (!TII.isTriviallyReMaterializable(*DefMI))
return false;
Remattable.insert(VNI);
return true;
}
-void LiveRangeEdit::scanRemattable(AAResults *aa) {
+void LiveRangeEdit::scanRemattable() {
for (VNInfo *VNI : getParent().valnos) {
if (VNI->isUnused())
continue;
@@ -90,14 +89,14 @@ void LiveRangeEdit::scanRemattable(AAResults *aa) {
MachineInstr *DefMI = LIS.getInstructionFromIndex(OrigVNI->def);
if (!DefMI)
continue;
- checkRematerializable(OrigVNI, DefMI, aa);
+ checkRematerializable(OrigVNI, DefMI);
}
ScannedRemattable = true;
}
-bool LiveRangeEdit::anyRematerializable(AAResults *aa) {
+bool LiveRangeEdit::anyRematerializable() {
if (!ScannedRemattable)
- scanRemattable(aa);
+ scanRemattable();
return !Remattable.empty();
}
@@ -274,8 +273,7 @@ bool LiveRangeEdit::useIsKill(const LiveInterval &LI,
}
/// Find all live intervals that need to shrink, then remove the instruction.
-void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
- AAResults *AA) {
+void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
assert(MI->allDefsAreDead() && "Def isn't really dead");
SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot();
@@ -384,7 +382,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
// register uses. That may provoke RA to split an interval at the KILL
// and later result in an invalid live segment end.
if (isOrigDef && DeadRemats && !HasLiveVRegUses &&
- TII.isTriviallyReMaterializable(*MI, AA)) {
+ TII.isTriviallyReMaterializable(*MI)) {
LiveInterval &NewLI = createEmptyIntervalFrom(Dest, false);
VNInfo *VNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
NewLI.addSegment(LiveInterval::Segment(Idx, Idx.getDeadSlot(), VNI));
@@ -414,14 +412,13 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
}
void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
- ArrayRef<Register> RegsBeingSpilled,
- AAResults *AA) {
+ ArrayRef<Register> RegsBeingSpilled) {
ToShrinkSet ToShrink;
for (;;) {
// Erase all dead defs.
while (!Dead.empty())
- eliminateDeadDef(Dead.pop_back_val(), ToShrink, AA);
+ eliminateDeadDef(Dead.pop_back_val(), ToShrink);
if (ToShrink.empty())
break;
diff --git a/llvm/lib/CodeGen/LiveVariables.cpp b/llvm/lib/CodeGen/LiveVariables.cpp
index 94bdfab5e5e0..40250171fe1e 100644
--- a/llvm/lib/CodeGen/LiveVariables.cpp
+++ b/llvm/lib/CodeGen/LiveVariables.cpp
@@ -758,8 +758,7 @@ void LiveVariables::replaceKillInstruction(Register Reg, MachineInstr &OldMI,
/// removeVirtualRegistersKilled - Remove all killed info for the specified
/// instruction.
void LiveVariables::removeVirtualRegistersKilled(MachineInstr &MI) {
- for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- MachineOperand &MO = MI.getOperand(i);
+ for (MachineOperand &MO : MI.operands()) {
if (MO.isReg() && MO.isKill()) {
MO.setIsKill(false);
Register Reg = MO.getReg();
diff --git a/llvm/lib/CodeGen/LowerEmuTLS.cpp b/llvm/lib/CodeGen/LowerEmuTLS.cpp
index 984dc452fbfd..a517ee3794ca 100644
--- a/llvm/lib/CodeGen/LowerEmuTLS.cpp
+++ b/llvm/lib/CodeGen/LowerEmuTLS.cpp
@@ -78,7 +78,7 @@ bool LowerEmuTLS::runOnModule(Module &M) {
if (G.isThreadLocal())
TlsVars.append({&G});
}
- for (const auto G : TlsVars)
+ for (const auto *const G : TlsVars)
Changed |= addEmuTlsVar(M, G);
return Changed;
}
diff --git a/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp b/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
index eea24d8e9353..3e7b4dbc9d71 100644
--- a/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
+++ b/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
@@ -129,7 +129,7 @@ static bool rescheduleCanonically(unsigned &PseudoIdempotentInstCount,
// Calculates the distance of MI from the beginning of its parent BB.
auto getInstrIdx = [](const MachineInstr &MI) {
unsigned i = 0;
- for (auto &CurMI : *MI.getParent()) {
+ for (const auto &CurMI : *MI.getParent()) {
if (&CurMI == &MI)
return i;
i++;
@@ -416,7 +416,7 @@ bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
MachineRegisterInfo &MRI = MF.getRegInfo();
VRegRenamer Renamer(MRI);
- for (auto MBB : RPOList)
+ for (auto *MBB : RPOList)
Changed |= runOnBasicBlock(MBB, BBNum++, Renamer);
return Changed;
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 0c94e1f7e474..e3d6b59c5077 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -3383,7 +3383,7 @@ static void initSlots2BasicBlocks(
DenseMap<unsigned, const BasicBlock *> &Slots2BasicBlocks) {
ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false);
MST.incorporateFunction(F);
- for (auto &BB : F) {
+ for (const auto &BB : F) {
if (BB.hasName())
continue;
int Slot = MST.getLocalSlot(&BB);
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 4944cb46c5b5..aa9522bc3459 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -437,7 +437,7 @@ void MIRParserImpl::setupDebugValueTracking(
MF.setDebugInstrNumberingCount(MaxInstrNum);
// Load any substitutions.
- for (auto &Sub : YamlMF.DebugValueSubstitutions) {
+ for (const auto &Sub : YamlMF.DebugValueSubstitutions) {
MF.makeDebugValueSubstitution({Sub.SrcInst, Sub.SrcOp},
{Sub.DstInst, Sub.DstOp}, Sub.Subreg);
}
@@ -975,7 +975,7 @@ bool MIRParserImpl::parseMachineMetadata(PerFunctionMIParsingState &PFS,
bool MIRParserImpl::parseMachineMetadataNodes(
PerFunctionMIParsingState &PFS, MachineFunction &MF,
const yaml::MachineFunction &YMF) {
- for (auto &MDS : YMF.MachineMetadataNodes) {
+ for (const auto &MDS : YMF.MachineMetadataNodes) {
if (parseMachineMetadata(PFS, MDS))
return true;
}
diff --git a/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp
index 7daf9025d303..d21d552227cf 100644
--- a/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp
+++ b/llvm/lib/CodeGen/MLRegallocEvictAdvisor.cpp
@@ -13,10 +13,9 @@
#include "AllocationOrder.h"
#include "RegAllocEvictionAdvisor.h"
#include "RegAllocGreedy.h"
-#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MLModelRunner.h"
#include "llvm/Analysis/TensorSpec.h"
-#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) || defined(LLVM_HAVE_TF_API)
+#if defined(LLVM_HAVE_TF_AOT_REGALLOCEVICTMODEL) || defined(LLVM_HAVE_TF_API)
#include "llvm/Analysis/ModelUnderTrainingRunner.h"
#include "llvm/Analysis/NoInferenceModelRunner.h"
#endif
@@ -91,7 +90,6 @@ public:
AU.setPreservesAll();
AU.addRequired<RegAllocEvictionAdvisorAnalysis>();
AU.addRequired<MachineBlockFrequencyInfo>();
- AU.addRequired<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -891,9 +889,7 @@ bool RegAllocScoring::runOnMachineFunction(MachineFunction &MF) {
&getAnalysis<RegAllocEvictionAdvisorAnalysis>()))
if (auto *Log = DevModeAnalysis->getLogger(MF))
Log->logFloatFinalReward(static_cast<float>(
- calculateRegAllocScore(
- MF, getAnalysis<MachineBlockFrequencyInfo>(),
- getAnalysis<AAResultsWrapperPass>().getAAResults())
+ calculateRegAllocScore(MF, getAnalysis<MachineBlockFrequencyInfo>())
.getScore()));
return false;
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index 02c44fa85cd9..7381c7e6b09c 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -1436,7 +1436,7 @@ MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
  // distribute the complement of the sum to each unknown probability.
unsigned KnownProbNum = 0;
auto Sum = BranchProbability::getZero();
- for (auto &P : Probs) {
+ for (const auto &P : Probs) {
if (!P.isUnknown()) {
Sum += P;
KnownProbNum++;
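A worked example of what this loop computes (numbers hypothetical): with four successors of which only two probabilities are known, say 1/2 and 1/4, Sum ends up as 3/4 and KnownProbNum as 2, so each of the two unknown successors is presumably assigned an even share of the complement, (1 - 3/4) / 2 = 1/8, keeping the outgoing probabilities summing to 1.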
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 4cc84f22bdde..9ff5c37627b4 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -965,7 +965,7 @@ bool MachineBlockPlacement::isTrellis(
for (MachineBasicBlock *Succ : ViableSuccs) {
int PredCount = 0;
- for (auto SuccPred : Succ->predecessors()) {
+ for (auto *SuccPred : Succ->predecessors()) {
// Allow triangle successors, but don't count them.
if (Successors.count(SuccPred)) {
// Make sure that it is actually a triangle.
@@ -1063,7 +1063,7 @@ MachineBlockPlacement::getBestTrellisSuccessor(
// Collect the edge frequencies of all edges that form the trellis.
SmallVector<WeightedEdge, 8> Edges[2];
int SuccIndex = 0;
- for (auto Succ : ViableSuccs) {
+ for (auto *Succ : ViableSuccs) {
for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
// Skip any placed predecessors that are not BB
if (SuccPred != BB)
@@ -2451,7 +2451,7 @@ void MachineBlockPlacement::rotateLoopWithProfile(
// as the sum of frequencies of exit edges we collect here, excluding the exit
// edge from the tail of the loop chain.
SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq;
- for (auto BB : LoopChain) {
+ for (auto *BB : LoopChain) {
auto LargestExitEdgeProb = BranchProbability::getZero();
for (auto *Succ : BB->successors()) {
BlockChain *SuccChain = BlockToChain[Succ];
@@ -2561,7 +2561,7 @@ MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
// profile data is available.
if (F->getFunction().hasProfileData() || ForceLoopColdBlock) {
BlockFrequency LoopFreq(0);
- for (auto LoopPred : L.getHeader()->predecessors())
+ for (auto *LoopPred : L.getHeader()->predecessors())
if (!L.contains(LoopPred))
LoopFreq += MBFI->getBlockFreq(LoopPred) *
MBPI->getEdgeProbability(LoopPred, L.getHeader());
diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp
index e60fd9f7883a..c6756b1d3737 100644
--- a/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/llvm/lib/CodeGen/MachineCSE.cpp
@@ -415,7 +415,7 @@ bool MachineCSE::isCSECandidate(MachineInstr *MI) {
// Okay, this instruction does a load. As a refinement, we allow the target
// to decide whether the loaded value is actually a constant. If so, we can
// actually use it as a load.
- if (!MI->isDereferenceableInvariantLoad(AA))
+ if (!MI->isDereferenceableInvariantLoad())
// FIXME: we should be able to hoist loads with no other side effects if
// there are no other instructions which can change memory in this loop.
// This is a trivial form of alias analysis.
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index 722a709af240..57e2cd20bdd0 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -92,6 +92,7 @@ private:
bool doSubstitute(unsigned NewSize, unsigned OldSize, bool OptForSize);
bool combineInstructions(MachineBasicBlock *);
MachineInstr *getOperandDef(const MachineOperand &MO);
+ bool isTransientMI(const MachineInstr *MI);
unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
MachineTraceMetrics::Trace BlockTrace);
@@ -158,6 +159,43 @@ MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
return DefInstr;
}
+/// Return true if MI is unlikely to generate an actual target instruction.
+bool MachineCombiner::isTransientMI(const MachineInstr *MI) {
+ if (!MI->isCopy())
+ return MI->isTransient();
+
+ // If MI is a COPY, check if its src and dst registers can be coalesced.
+ Register Dst = MI->getOperand(0).getReg();
+ Register Src = MI->getOperand(1).getReg();
+
+ if (!MI->isFullCopy()) {
+ // If src RC contains super registers of dst RC, it can also be coalesced.
+ if (MI->getOperand(0).getSubReg() || Src.isPhysical() || Dst.isPhysical())
+ return false;
+
+ auto SrcSub = MI->getOperand(1).getSubReg();
+ auto SrcRC = MRI->getRegClass(Src);
+ auto DstRC = MRI->getRegClass(Dst);
+ return TRI->getMatchingSuperRegClass(SrcRC, DstRC, SrcSub) != nullptr;
+ }
+
+ if (Src.isPhysical() && Dst.isPhysical())
+ return Src == Dst;
+
+ if (Src.isVirtual() && Dst.isVirtual()) {
+ auto SrcRC = MRI->getRegClass(Src);
+ auto DstRC = MRI->getRegClass(Dst);
+ return SrcRC->hasSuperClassEq(DstRC) || SrcRC->hasSubClassEq(DstRC);
+ }
+
+ if (Src.isVirtual())
+ std::swap(Src, Dst);
+
+ // Now Src is physical register, Dst is virtual register.
+ auto DstRC = MRI->getRegClass(Dst);
+ return DstRC->contains(Src);
+}
+
/// Computes depth of instructions in vector \InsInstr.
///
/// \param InsInstrs is a vector of machine instructions
@@ -204,9 +242,10 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
MachineInstr *DefInstr = getOperandDef(MO);
if (DefInstr) {
DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
- LatencyOp = TSchedModel.computeOperandLatency(
- DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
- InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
+ if (!isTransientMI(DefInstr))
+ LatencyOp = TSchedModel.computeOperandLatency(
+ DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
+ InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
}
}
IDepth = std::max(IDepth, DepthOp + LatencyOp);
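The two hunks above work together: isTransientMI() treats a COPY that the register allocator is expected to coalesce away as free, and getDepth() then skips the operand latency of such defs. A minimal standalone sketch of that accounting (toy types and hypothetical latencies, not LLVM API):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct ToyInstr {
      int Latency;            // cycles, hypothetical
      bool Transient;         // e.g. a COPY expected to be coalesced away
      std::vector<int> Deps;  // indices of defining instructions
    };

    // Depth = longest chain of operand latencies feeding instruction I,
    // ignoring the latency of transient defs (mirrors the getDepth change).
    static int depth(const std::vector<ToyInstr> &Chain, int I) {
      int D = 0;
      for (int Dep : Chain[I].Deps) {
        int LatencyOp = Chain[Dep].Transient ? 0 : Chain[Dep].Latency;
        D = std::max(D, depth(Chain, Dep) + LatencyOp);
      }
      return D;
    }

    int main() {
      // mul (3 cy) -> copy (coalescable) -> add (1 cy)
      std::vector<ToyInstr> Chain = {
          {3, false, {}},   // 0: mul
          {1, true, {0}},   // 1: copy of mul's result
          {1, false, {1}},  // 2: add using the copy
      };
      std::printf("depth(add) = %d\n", depth(Chain, 2)); // prints 3, not 4
    }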
@@ -305,7 +344,7 @@ std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
unsigned RootLatency = 0;
- for (auto I : DelInstrs)
+ for (auto *I : DelInstrs)
RootLatency += TSchedModel.computeInstrLatency(I);
return {NewRootLatency, RootLatency};
@@ -488,7 +527,7 @@ static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
for (auto *InstrPtr : DelInstrs) {
InstrPtr->eraseFromParent();
// Erase all LiveRegs defined by the removed instruction
- for (auto I = RegUnits.begin(); I != RegUnits.end(); ) {
+ for (auto *I = RegUnits.begin(); I != RegUnits.end();) {
if (I->MI == InstrPtr)
I = RegUnits.erase(I);
else
diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp
index ca5936a14779..f0190812389f 100644
--- a/llvm/lib/CodeGen/MachineFrameInfo.cpp
+++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -127,7 +127,7 @@ BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const {
BV.set(*CSR);
// Saved CSRs are not pristine.
- for (auto &I : getCalleeSavedInfo())
+ for (const auto &I : getCalleeSavedInfo())
for (MCSubRegIterator S(I.getReg(), TRI, true); S.isValid(); ++S)
BV.reset(*S);
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index f58996ea90c6..6b481a374382 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -911,8 +911,8 @@ static const MachineInstr *getCallInstr(const MachineInstr *MI) {
if (!MI->isBundle())
return MI;
- for (auto &BMI : make_range(getBundleStart(MI->getIterator()),
- getBundleEnd(MI->getIterator())))
+ for (const auto &BMI : make_range(getBundleStart(MI->getIterator()),
+ getBundleEnd(MI->getIterator())))
if (BMI.isCandidateForCallSiteEntry())
return &BMI;
diff --git a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
index 867a7ed584b2..3e1aace855a5 100644
--- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
+++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp
@@ -146,7 +146,7 @@ bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) {
return X.getSectionID().Type < Y.getSectionID().Type;
};
llvm::sortBasicBlocksAndUpdateBranches(MF, Comparator);
-
+ llvm::avoidZeroOffsetLandingPad(MF);
return true;
}
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index 31f45e194a97..e92dec5bea48 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -1203,7 +1203,7 @@ bool MachineInstr::isSafeToMove(AAResults *AA, bool &SawStore) const {
// destination. The check for isInvariantLoad gives the target the chance to
// classify the load as always returning a constant, e.g. a constant pool
// load.
- if (mayLoad() && !isDereferenceableInvariantLoad(AA))
+ if (mayLoad() && !isDereferenceableInvariantLoad())
// Otherwise, this is a real load. If there is a store between the load and
// end of block, we can't move it.
return !SawStore;
@@ -1348,7 +1348,7 @@ bool MachineInstr::hasOrderedMemoryRef() const {
/// isDereferenceableInvariantLoad - Return true if this instruction will never
/// trap and is loading from a location whose value is invariant across a run of
/// this function.
-bool MachineInstr::isDereferenceableInvariantLoad(AAResults *AA) const {
+bool MachineInstr::isDereferenceableInvariantLoad() const {
// If the instruction doesn't load at all, it isn't an invariant load.
if (!mayLoad())
return false;
@@ -1374,12 +1374,6 @@ bool MachineInstr::isDereferenceableInvariantLoad(AAResults *AA) const {
if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
if (PSV->isConstant(&MFI))
continue;
- } else if (const Value *V = MMO->getValue()) {
- // If we have an AliasAnalysis, ask it whether the memory is constant.
- if (AA &&
- AA->pointsToConstantMemory(
- MemoryLocation(V, MMO->getSize(), MMO->getAAInfo())))
- continue;
}
// Otherwise assume conservatively.
@@ -2273,7 +2267,7 @@ using MMOList = SmallVector<const MachineMemOperand *, 2>;
static unsigned getSpillSlotSize(const MMOList &Accesses,
const MachineFrameInfo &MFI) {
unsigned Size = 0;
- for (auto A : Accesses)
+ for (const auto *A : Accesses)
if (MFI.isSpillSlotObjectIndex(
cast<FixedStackPseudoSourceValue>(A->getPseudoValue())
->getFrameIndex()))
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 00d75f8231c7..df7b6c782b91 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -230,8 +230,7 @@ namespace {
bool IsGuaranteedToExecute(MachineBasicBlock *BB);
- bool isTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const;
+ bool isTriviallyReMaterializable(const MachineInstr &MI) const;
void EnterScope(MachineBasicBlock *MBB);
@@ -666,9 +665,9 @@ bool MachineLICMBase::IsGuaranteedToExecute(MachineBasicBlock *BB) {
/// virtual register uses. Even though rematerializable, RA might not actually
/// rematerialize it in this scenario. In that case we do not want to hoist such
/// an instruction out of the loop in the belief that RA will sink it back if needed.
-bool MachineLICMBase::isTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const {
- if (!TII->isTriviallyReMaterializable(MI, AA))
+bool MachineLICMBase::isTriviallyReMaterializable(
+ const MachineInstr &MI) const {
+ if (!TII->isTriviallyReMaterializable(MI))
return false;
for (const MachineOperand &MO : MI.operands()) {
@@ -1174,7 +1173,7 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI) {
// Rematerializable instructions should always be hoisted providing the
// register allocator can just pull them down again when needed.
- if (isTriviallyReMaterializable(MI, AA))
+ if (isTriviallyReMaterializable(MI))
return true;
// FIXME: If there are long latency loop-invariant instructions inside the
@@ -1227,8 +1226,8 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI) {
// High register pressure situation, only hoist if the instruction is going
// to be remat'ed.
- if (!isTriviallyReMaterializable(MI, AA) &&
- !MI.isDereferenceableInvariantLoad(AA)) {
+ if (!isTriviallyReMaterializable(MI) &&
+ !MI.isDereferenceableInvariantLoad()) {
LLVM_DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI);
return false;
}
@@ -1247,7 +1246,7 @@ MachineInstr *MachineLICMBase::ExtractHoistableLoad(MachineInstr *MI) {
// If not, we may be able to unfold a load and hoist that.
// First test whether the instruction is loading from an amenable
// memory location.
- if (!MI->isDereferenceableInvariantLoad(AA))
+ if (!MI->isDereferenceableInvariantLoad())
return nullptr;
// Next determine the register class for a temporary register.
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 8d500398f55e..52501ca7c871 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -219,7 +219,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
TII = MF->getSubtarget().getInstrInfo();
RegClassInfo.runOnMachineFunction(*MF);
- for (auto &L : *MLI)
+ for (const auto &L : *MLI)
scheduleLoop(*L);
return false;
@@ -231,7 +231,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
/// the loop.
bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
bool Changed = false;
- for (auto &InnerLoop : L)
+ for (const auto &InnerLoop : L)
Changed |= scheduleLoop(*InnerLoop);
#ifndef NDEBUG
@@ -689,7 +689,7 @@ static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
Worklist.push_back(SUa);
while (!Worklist.empty()) {
const SUnit *SU = Worklist.pop_back_val();
- for (auto &SI : SU->Succs) {
+ for (const auto &SI : SU->Succs) {
SUnit *SuccSU = SI.getSUnit();
if (SI.getKind() == SDep::Order) {
if (Visited.count(SuccSU))
@@ -706,11 +706,11 @@ static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
/// Return true if the instruction causes a chain between memory
/// references before and after it.
-static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
+static bool isDependenceBarrier(MachineInstr &MI) {
return MI.isCall() || MI.mayRaiseFPException() ||
MI.hasUnmodeledSideEffects() ||
(MI.hasOrderedMemoryRef() &&
- (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad(AA)));
+ (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
}
/// Return the underlying objects for the memory references of an instruction.
@@ -743,14 +743,14 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
for (auto &SU : SUnits) {
MachineInstr &MI = *SU.getInstr();
- if (isDependenceBarrier(MI, AA))
+ if (isDependenceBarrier(MI))
PendingLoads.clear();
else if (MI.mayLoad()) {
SmallVector<const Value *, 4> Objs;
::getUnderlyingObjects(&MI, Objs);
if (Objs.empty())
Objs.push_back(UnknownValue);
- for (auto V : Objs) {
+ for (const auto *V : Objs) {
SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
SUs.push_back(&SU);
}
@@ -759,12 +759,12 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
::getUnderlyingObjects(&MI, Objs);
if (Objs.empty())
Objs.push_back(UnknownValue);
- for (auto V : Objs) {
+ for (const auto *V : Objs) {
MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
PendingLoads.find(V);
if (I == PendingLoads.end())
continue;
- for (auto Load : I->second) {
+ for (auto *Load : I->second) {
if (isSuccOrder(Load, &SU))
continue;
MachineInstr &LdMI = *Load->getInstr();
@@ -1407,8 +1407,8 @@ void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(DAG);
// Add the artificial dependencies if it does not form a cycle.
- for (auto I : UseSUs) {
- for (auto Src : SrcSUs) {
+ for (auto *I : UseSUs) {
+ for (auto *Src : SrcSUs) {
if (!SDAG->Topo.IsReachable(I, Src) && Src != I) {
Src->addPred(SDep(I, SDep::Artificial));
SDAG->Topo.AddPred(Src, I);
@@ -1878,7 +1878,7 @@ void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
Order = TopDown;
LLVM_DEBUG(dbgs() << " Top down (intersect) ");
} else if (NodeSets.size() == 1) {
- for (auto &N : Nodes)
+ for (const auto &N : Nodes)
if (N->Succs.size() == 0)
R.insert(N);
Order = BottomUp;
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 5f80445a5a34..96131dc2983e 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1698,7 +1698,7 @@ void BaseMemOpClusterMutation::collectMemOpRecords(
<< ", Width: " << Width << "\n");
}
#ifndef NDEBUG
- for (auto *Op : BaseOps)
+ for (const auto *Op : BaseOps)
assert(Op);
#endif
}
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 006ba9273dfb..0568bc6a4600 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -446,7 +446,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
MadeChange |= ProcessBlock(MBB);
// If we have anything we marked as toSplit, split it now.
- for (auto &Pair : ToSplit) {
+ for (const auto &Pair : ToSplit) {
auto NewSucc = Pair.first->SplitCriticalEdge(Pair.second, *this);
if (NewSucc != nullptr) {
LLVM_DEBUG(dbgs() << " *** Splitting critical edge: "
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp
index a85dbf1de1ee..b546a5082b07 100644
--- a/llvm/lib/CodeGen/MachineStableHash.cpp
+++ b/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -200,7 +200,7 @@ stable_hash llvm::stableHashValue(const MachineInstr &MI, bool HashVRegs,
stable_hash llvm::stableHashValue(const MachineBasicBlock &MBB) {
SmallVector<stable_hash> HashComponents;
// TODO: Hash more stuff like block alignment and branch probabilities.
- for (auto &MI : MBB)
+ for (const auto &MI : MBB)
HashComponents.push_back(stableHashValue(MI));
return stable_hash_combine_range(HashComponents.begin(),
HashComponents.end());
@@ -209,7 +209,7 @@ stable_hash llvm::stableHashValue(const MachineBasicBlock &MBB) {
stable_hash llvm::stableHashValue(const MachineFunction &MF) {
SmallVector<stable_hash> HashComponents;
// TODO: Hash lots more stuff like function alignment and stack objects.
- for (auto &MBB : MF)
+ for (const auto &MBB : MF)
HashComponents.push_back(stableHashValue(MBB));
return stable_hash_combine_range(HashComponents.begin(),
HashComponents.end());
diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
index 0a5ff276fedc..715e5da26989 100644
--- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -484,7 +484,7 @@ void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
// Run an upwards post-order search for the trace start.
Bounds.Downward = false;
Bounds.Visited.clear();
- for (auto I : inverse_post_order_ext(MBB, Bounds)) {
+ for (const auto *I : inverse_post_order_ext(MBB, Bounds)) {
LLVM_DEBUG(dbgs() << " pred for " << printMBBReference(*I) << ": ");
TraceBlockInfo &TBI = BlockInfo[I->getNumber()];
// All the predecessors have been visited, pick the preferred one.
@@ -502,7 +502,7 @@ void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
// Run a downwards post-order search for the trace end.
Bounds.Downward = true;
Bounds.Visited.clear();
- for (auto I : post_order_ext(MBB, Bounds)) {
+ for (const auto *I : post_order_ext(MBB, Bounds)) {
LLVM_DEBUG(dbgs() << " succ for " << printMBBReference(*I) << ": ");
TraceBlockInfo &TBI = BlockInfo[I->getNumber()];
// All the successors have been visited, pick the preferred one.
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 7a008bae726e..93e68918b632 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -2802,8 +2802,8 @@ void MachineVerifier::visitMachineFunctionAfter() {
// tracking numbers.
if (MF->getFunction().getSubprogram()) {
DenseSet<unsigned> SeenNumbers;
- for (auto &MBB : *MF) {
- for (auto &MI : MBB) {
+ for (const auto &MBB : *MF) {
+ for (const auto &MI : MBB) {
if (auto Num = MI.peekDebugInstrNum()) {
auto Result = SeenNumbers.insert((unsigned)Num);
if (!Result.second)
diff --git a/llvm/lib/CodeGen/RDFGraph.cpp b/llvm/lib/CodeGen/RDFGraph.cpp
index ec383b9b1c65..51de99b81057 100644
--- a/llvm/lib/CodeGen/RDFGraph.cpp
+++ b/llvm/lib/CodeGen/RDFGraph.cpp
@@ -1395,7 +1395,7 @@ void DataFlowGraph::recordDefsForDF(BlockRefsMap &PhiM,
// Finally, add the set of defs to each block in the iterated dominance
// frontier.
- for (auto DB : IDF) {
+ for (auto *DB : IDF) {
NodeAddr<BlockNode*> DBA = findBlock(DB);
PhiM[DBA.Id].insert(Defs.begin(), Defs.end());
}
@@ -1657,7 +1657,7 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
// Recursively process all children in the dominator tree.
MachineDomTreeNode *N = MDT.getNode(BA.Addr->getCode());
- for (auto I : *N) {
+ for (auto *I : *N) {
MachineBasicBlock *SB = I->getBlock();
NodeAddr<BlockNode*> SBA = findBlock(SB);
linkBlockRefs(DefM, SBA);
diff --git a/llvm/lib/CodeGen/RDFLiveness.cpp b/llvm/lib/CodeGen/RDFLiveness.cpp
index 2fd947086b4d..d8eac20d16b6 100644
--- a/llvm/lib/CodeGen/RDFLiveness.cpp
+++ b/llvm/lib/CodeGen/RDFLiveness.cpp
@@ -61,7 +61,7 @@ namespace rdf {
raw_ostream &operator<< (raw_ostream &OS, const Print<Liveness::RefMap> &P) {
OS << '{';
- for (auto &I : P.Obj) {
+ for (const auto &I : P.Obj) {
OS << ' ' << printReg(I.first, &P.G.getTRI()) << '{';
for (auto J = I.second.begin(), E = I.second.end(); J != E; ) {
OS << Print<NodeId>(J->first, P.G) << PrintLaneMaskOpt(J->second);
@@ -767,7 +767,7 @@ void Liveness::computeLiveIns() {
}
for (auto I : IDF)
- for (auto S : I.second)
+ for (auto *S : I.second)
IIDF[S].insert(I.first);
computePhiInfo();
@@ -926,7 +926,7 @@ void Liveness::resetKills(MachineBasicBlock *B) {
BitVector LiveIn(TRI.getNumRegs()), Live(TRI.getNumRegs());
CopyLiveIns(B, LiveIn);
- for (auto SI : B->successors())
+ for (auto *SI : B->successors())
CopyLiveIns(SI, Live);
for (MachineInstr &MI : llvm::reverse(*B)) {
@@ -1003,7 +1003,7 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
// Go up the dominator tree (depth-first).
MachineDomTreeNode *N = MDT.getNode(B);
- for (auto I : *N) {
+ for (auto *I : *N) {
RefMap L;
MachineBasicBlock *SB = I->getBlock();
traverse(SB, L);
@@ -1015,7 +1015,7 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
if (Trace) {
dbgs() << "\n-- " << printMBBReference(*B) << ": " << __func__
<< " after recursion into: {";
- for (auto I : *N)
+ for (auto *I : *N)
dbgs() << ' ' << I->getBlock()->getNumber();
dbgs() << " }\n";
dbgs() << " LiveIn: " << Print<RefMap>(LiveIn, DFG) << '\n';
@@ -1155,7 +1155,7 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
dbgs() << " Local: " << Print<RegisterAggr>(Local, DFG) << '\n';
}
- for (auto C : IIDF[B]) {
+ for (auto *C : IIDF[B]) {
RegisterAggr &LiveC = LiveMap[C];
for (const std::pair<const RegisterId, NodeRefSet> &S : LiveIn)
for (auto R : S.second)
diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
index 69db8bad54f9..d9ced9191fae 100644
--- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -635,7 +635,7 @@ ReachingDefAnalysis::isSafeToRemove(MachineInstr *MI, InstSet &Visited,
SmallPtrSet<MachineInstr*, 4> Uses;
getGlobalUses(MI, MO.getReg(), Uses);
- for (auto I : Uses) {
+ for (auto *I : Uses) {
if (Ignore.count(I) || ToRemove.count(I))
continue;
if (!isSafeToRemove(I, Visited, ToRemove, Ignore))
diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp
index 0c18814189eb..990dd84c829d 100644
--- a/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -166,7 +166,7 @@ void RegAllocBase::allocatePhysRegs() {
void RegAllocBase::postOptimization() {
spiller().postOptimization();
- for (auto DeadInst : DeadRemats) {
+ for (auto *DeadInst : DeadRemats) {
LIS->RemoveMachineInstrFromMaps(*DeadInst);
DeadInst->eraseFromParent();
}
diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp
index 7defdf04aec8..91795f3d27fe 100644
--- a/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -135,6 +135,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer)
INITIALIZE_PASS_DEPENDENCY(MachineScheduler)
INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 72ceaa768803..9e4e26f1392e 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -1478,7 +1478,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
RegUnitStates.assign(TRI->getNumRegUnits(), regFree);
assert(LiveVirtRegs.empty() && "Mapping not cleared from last block?");
- for (auto &LiveReg : MBB.liveouts())
+ for (const auto &LiveReg : MBB.liveouts())
setPhysRegState(LiveReg.PhysReg, regPreAssigned);
Coalesced.clear();
@@ -1580,8 +1580,7 @@ FunctionPass *llvm::createFastRegisterAllocator() {
return new RegAllocFast();
}
-FunctionPass *llvm::createFastRegisterAllocator(
- std::function<bool(const TargetRegisterInfo &TRI,
- const TargetRegisterClass &RC)> Ftor, bool ClearVirtRegs) {
+FunctionPass *llvm::createFastRegisterAllocator(RegClassFilterFunc Ftor,
+ bool ClearVirtRegs) {
return new RegAllocFast(Ftor, ClearVirtRegs);
}
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 2efb98ae200d..4a54d7ebf8a9 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -180,16 +180,7 @@ FunctionPass* llvm::createGreedyRegisterAllocator() {
return new RAGreedy();
}
-namespace llvm {
-FunctionPass* createGreedyRegisterAllocator(
- std::function<bool(const TargetRegisterInfo &TRI,
- const TargetRegisterClass &RC)> Ftor);
-
-}
-
-FunctionPass* llvm::createGreedyRegisterAllocator(
- std::function<bool(const TargetRegisterInfo &TRI,
- const TargetRegisterClass &RC)> Ftor) {
+FunctionPass *llvm::createGreedyRegisterAllocator(RegClassFilterFunc Ftor) {
return new RAGreedy(Ftor);
}
@@ -202,8 +193,6 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addRequired<MachineBlockFrequencyInfo>();
AU.addPreserved<MachineBlockFrequencyInfo>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
AU.addRequired<LiveIntervals>();
AU.addPreserved<LiveIntervals>();
AU.addRequired<SlotIndexes>();
@@ -2530,7 +2519,6 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
Bundles = &getAnalysis<EdgeBundles>();
SpillPlacer = &getAnalysis<SpillPlacement>();
DebugVars = &getAnalysis<LiveDebugVariables>();
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
initializeCSRCost();
@@ -2552,7 +2540,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
LLVM_DEBUG(LIS->dump());
SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
- SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI, *VRAI));
+ SE.reset(new SplitEditor(*SA, *LIS, *VRM, *DomTree, *MBFI, *VRAI));
IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI);
GlobalCand.resize(32); // This will grow as needed.
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h
index 358e74541a54..316b12d0213b 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.h
+++ b/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -25,7 +25,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
@@ -54,7 +53,6 @@ class MachineLoop;
class MachineLoopInfo;
class MachineOptimizationRemarkEmitter;
class MachineOptimizationRemarkMissed;
-class SlotIndex;
class SlotIndexes;
class TargetInstrInfo;
class VirtRegMap;
@@ -174,7 +172,6 @@ private:
EdgeBundles *Bundles;
SpillPlacement *SpillPlacer;
LiveDebugVariables *DebugVars;
- AliasAnalysis *AA;
// state
std::unique_ptr<Spiller> SpillerInstance;
diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp
index 8c262130fb70..b3d926eeb552 100644
--- a/llvm/lib/CodeGen/RegAllocPBQP.cpp
+++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -783,7 +783,7 @@ void RegAllocPBQP::finalizeAlloc(MachineFunction &MF,
void RegAllocPBQP::postOptimization(Spiller &VRegSpiller, LiveIntervals &LIS) {
VRegSpiller.postOptimization();
/// Remove dead defs because of rematerialization.
- for (auto DeadInst : DeadRemats) {
+ for (auto *DeadInst : DeadRemats) {
LIS.RemoveMachineInstrFromMaps(*DeadInst);
DeadInst->eraseFromParent();
}
diff --git a/llvm/lib/CodeGen/RegAllocScore.cpp b/llvm/lib/CodeGen/RegAllocScore.cpp
index 32fa5e07dd16..17e3eeef664b 100644
--- a/llvm/lib/CodeGen/RegAllocScore.cpp
+++ b/llvm/lib/CodeGen/RegAllocScore.cpp
@@ -74,8 +74,7 @@ double RegAllocScore::getScore() const {
RegAllocScore
llvm::calculateRegAllocScore(const MachineFunction &MF,
- const MachineBlockFrequencyInfo &MBFI,
- AAResults &AAResults) {
+ const MachineBlockFrequencyInfo &MBFI) {
return calculateRegAllocScore(
MF,
[&](const MachineBasicBlock &MBB) {
@@ -83,7 +82,7 @@ llvm::calculateRegAllocScore(const MachineFunction &MF,
},
[&](const MachineInstr &MI) {
return MF.getSubtarget().getInstrInfo()->isTriviallyReMaterializable(
- MI, &AAResults);
+ MI);
});
}
diff --git a/llvm/lib/CodeGen/RegAllocScore.h b/llvm/lib/CodeGen/RegAllocScore.h
index 2bcd0b5895bf..b80adae29f23 100644
--- a/llvm/lib/CodeGen/RegAllocScore.h
+++ b/llvm/lib/CodeGen/RegAllocScore.h
@@ -19,7 +19,6 @@
namespace llvm {
-class AAResults;
class MachineBasicBlock;
class MachineBlockFrequencyInfo;
class MachineFunction;
@@ -62,8 +61,7 @@ public:
/// different policies, the better policy would have a smaller score.
/// The implementation is the overload below (which is also easily unittestable)
RegAllocScore calculateRegAllocScore(const MachineFunction &MF,
- const MachineBlockFrequencyInfo &MBFI,
- AAResults &AAResults);
+ const MachineBlockFrequencyInfo &MBFI);
/// Implementation of the above, which is also more easily unittestable.
RegAllocScore calculateRegAllocScore(
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index 930d05324440..8a6f823c8a0c 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -1148,7 +1148,7 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
// we need to keep the copy of B = A at the end of Pred if we remove
// B = A from MBB.
bool ValB_Changed = false;
- for (auto VNI : IntB.valnos) {
+ for (auto *VNI : IntB.valnos) {
if (VNI->isUnused())
continue;
if (PVal->def < VNI->def && VNI->def < LIS->getMBBEndIdx(Pred)) {
@@ -1306,7 +1306,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
}
if (!TII->isAsCheapAsAMove(*DefMI))
return false;
- if (!TII->isTriviallyReMaterializable(*DefMI, AA))
+ if (!TII->isTriviallyReMaterializable(*DefMI))
return false;
if (!definesFullReg(*DefMI, SrcReg))
return false;
diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp
index 62a459fca611..b14a36e4eeb4 100644
--- a/llvm/lib/CodeGen/RegisterPressure.cpp
+++ b/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -581,7 +581,7 @@ void RegisterOperands::collect(const MachineInstr &MI,
void RegisterOperands::detectDeadDefs(const MachineInstr &MI,
const LiveIntervals &LIS) {
SlotIndex SlotIdx = LIS.getInstructionIndex(MI);
- for (auto RI = Defs.begin(); RI != Defs.end(); /*empty*/) {
+ for (auto *RI = Defs.begin(); RI != Defs.end(); /*empty*/) {
Register Reg = RI->RegUnit;
const LiveRange *LR = getLiveRange(LIS, Reg);
if (LR != nullptr) {
@@ -602,7 +602,7 @@ void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS,
const MachineRegisterInfo &MRI,
SlotIndex Pos,
MachineInstr *AddFlagsMI) {
- for (auto I = Defs.begin(); I != Defs.end(); ) {
+ for (auto *I = Defs.begin(); I != Defs.end();) {
LaneBitmask LiveAfter = getLiveLanesAt(LIS, MRI, true, I->RegUnit,
Pos.getDeadSlot());
// If the def is all that is live after the instruction, then in case
@@ -620,7 +620,7 @@ void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS,
++I;
}
}
- for (auto I = Uses.begin(); I != Uses.end(); ) {
+ for (auto *I = Uses.begin(); I != Uses.end();) {
LaneBitmask LiveBefore = getLiveLanesAt(LIS, MRI, true, I->RegUnit,
Pos.getBaseIndex());
LaneBitmask LaneMask = I->LaneMask & LiveBefore;
diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index e7116ec3ea28..00a551ade213 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -340,7 +340,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
// analysis here, which would look at all uses of an argument inside
// the function being called.
auto B = CS.arg_begin(), E = CS.arg_end();
- for (auto A = B; A != E; ++A)
+ for (const auto *A = B; A != E; ++A)
if (A->get() == V)
if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) ||
CS.doesNotAccessMemory()))) {
@@ -498,7 +498,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
if (ClColoring)
SSC.run();
- for (auto *I : SSC.getMarkers()) {
+ for (const auto *I : SSC.getMarkers()) {
auto *Op = dyn_cast<Instruction>(I->getOperand(1));
const_cast<IntrinsicInst *>(I)->eraseFromParent();
// Remove the operand bitcast, too, if it has no more uses left.
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index 07dcc34fbf15..4fc9399c2b9e 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -530,9 +530,9 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
/// Returns true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
-static inline bool isGlobalMemoryObject(AAResults *AA, MachineInstr *MI) {
+static inline bool isGlobalMemoryObject(MachineInstr *MI) {
return MI->isCall() || MI->hasUnmodeledSideEffects() ||
- (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA));
+ (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad());
}
void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
@@ -880,7 +880,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA,
// actual addresses).
// This is a barrier event that acts as a pivotal node in the DAG.
- if (isGlobalMemoryObject(AA, &MI)) {
+ if (isGlobalMemoryObject(&MI)) {
// Become the barrier chain.
if (BarrierChain)
@@ -917,7 +917,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA,
// If it's not a store or a variant load, we're done.
if (!MI.mayStore() &&
- !(MI.mayLoad() && !MI.isDereferenceableInvariantLoad(AA)))
+ !(MI.mayLoad() && !MI.isDereferenceableInvariantLoad()))
continue;
      // Always add a dependency edge to BarrierChain if present.
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index d627519a34aa..011f55efce1d 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -433,7 +433,7 @@ void SelectOptimize::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
DebugPseudoINS.push_back(&*DIt);
DIt++;
}
- for (auto DI : DebugPseudoINS) {
+ for (auto *DI : DebugPseudoINS) {
DI->moveBefore(&*EndBlock->getFirstInsertionPt());
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2654c00929d8..edb0756e8c3b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1868,8 +1868,7 @@ SDValue DAGCombiner::combine(SDNode *N) {
// If N is a commutative binary node, try to eliminate it if the commuted
// version is already present in the DAG.
- if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) &&
- N->getNumValues() == 1) {
+ if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode())) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -4159,6 +4158,10 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags()))
return RMUL;
+ // Simplify the operands using demanded-bits information.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
return SDValue();
}
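The SimplifyDemandedBits call added here is sound for MUL because a product's low bits depend only on the operands' low bits; a quick standalone check of that identity in plain C++ (illustrative only, not SelectionDAG code):

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    int main() {
      const uint32_t Mask = 0xFF; // pretend only the low 8 bits are demanded
      for (uint32_t X : {3u, 200u, 0x12345678u})
        for (uint32_t Y : {7u, 99u, 0xCAFEBABEu})
          // Masking the operands first never changes the demanded bits of X * Y.
          assert(((X * Y) & Mask) == (((X & Mask) * (Y & Mask)) & Mask));
      return 0;
    }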
@@ -5978,44 +5981,64 @@ static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
if (!TLI.isTypeLegal(VT))
return SDValue();
- // Look through an optional extension and find a 'not'.
- // TODO: Should we favor test+set even without the 'not' op?
- SDValue Not = And->getOperand(0), And1 = And->getOperand(1);
- if (Not.getOpcode() == ISD::ANY_EXTEND)
- Not = Not.getOperand(0);
- if (!isBitwiseNot(Not) || !Not.hasOneUse() || !isOneConstant(And1))
+ // Look through an optional extension.
+ SDValue And0 = And->getOperand(0), And1 = And->getOperand(1);
+ if (And0.getOpcode() == ISD::ANY_EXTEND && And0.hasOneUse())
+ And0 = And0.getOperand(0);
+ if (!isOneConstant(And1) || !And0.hasOneUse())
return SDValue();
- // Look though an optional truncation. The source operand may not be the same
- // type as the original 'and', but that is ok because we are masking off
- // everything but the low bit.
- SDValue Srl = Not.getOperand(0);
- if (Srl.getOpcode() == ISD::TRUNCATE)
- Srl = Srl.getOperand(0);
+ SDValue Src = And0;
+
+ // Attempt to find a 'not' op.
+ // TODO: Should we favor test+set even without the 'not' op?
+ bool FoundNot = false;
+ if (isBitwiseNot(Src)) {
+ FoundNot = true;
+ Src = Src.getOperand(0);
+
+ // Look through an optional truncation. The source operand may not be the
+ // same type as the original 'and', but that is ok because we are masking
+ // off everything but the low bit.
+ if (Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse())
+ Src = Src.getOperand(0);
+ }
// Match a shift-right by constant.
- if (Srl.getOpcode() != ISD::SRL || !Srl.hasOneUse() ||
- !isa<ConstantSDNode>(Srl.getOperand(1)))
+ if (Src.getOpcode() != ISD::SRL || !Src.hasOneUse())
return SDValue();
// We might have looked through casts that make this transform invalid.
// TODO: If the source type is wider than the result type, do the mask and
// compare in the source type.
- const APInt &ShiftAmt = Srl.getConstantOperandAPInt(1);
- unsigned VTBitWidth = VT.getSizeInBits();
- if (ShiftAmt.uge(VTBitWidth))
+ unsigned VTBitWidth = VT.getScalarSizeInBits();
+ SDValue ShiftAmt = Src.getOperand(1);
+ auto *ShiftAmtC = dyn_cast<ConstantSDNode>(ShiftAmt);
+ if (!ShiftAmtC || !ShiftAmtC->getAPIntValue().ult(VTBitWidth))
return SDValue();
- if (!TLI.hasBitTest(Srl.getOperand(0), Srl.getOperand(1)))
+ // Set source to shift source.
+ Src = Src.getOperand(0);
+
+ // Try again to find a 'not' op.
+ // TODO: Should we favor test+set even with two 'not' ops?
+ if (!FoundNot) {
+ if (!isBitwiseNot(Src))
+ return SDValue();
+ Src = Src.getOperand(0);
+ }
+
+ if (!TLI.hasBitTest(Src, ShiftAmt))
return SDValue();
// Turn this into a bit-test pattern using mask op + setcc:
// and (not (srl X, C)), 1 --> (and X, 1<<C) == 0
+ // and (srl (not X), C), 1 --> (and X, 1<<C) == 0
SDLoc DL(And);
- SDValue X = DAG.getZExtOrTrunc(Srl.getOperand(0), DL, VT);
+ SDValue X = DAG.getZExtOrTrunc(Src, DL, VT);
EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
SDValue Mask = DAG.getConstant(
- APInt::getOneBitSet(VTBitWidth, ShiftAmt.getZExtValue()), DL, VT);
+ APInt::getOneBitSet(VTBitWidth, ShiftAmtC->getZExtValue()), DL, VT);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, Mask);
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue Setcc = DAG.getSetCC(DL, CCVT, NewAnd, Zero, ISD::SETEQ);
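The bit-test rewrite now matches the 'not' on either side of the shift; both forms reduce to the same mask-and-compare. A small self-contained check of the two identities (illustrative C++, not the combiner itself):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < (1u << 12); ++x)
    for (unsigned c = 0; c < 32; ++c) {
      uint32_t bitClear = (x & (1u << c)) == 0 ? 1u : 0u; // (and X, 1<<C) == 0
      assert(((~(x >> c)) & 1u) == bitClear); // and (not (srl X, C)), 1
      assert((((~x) >> c) & 1u) == bitClear); // and (srl (not X), C), 1
    }
  return 0;
}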
@@ -6229,7 +6252,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// This can be a pure constant or a vector splat, in which case we treat the
// vector as a scalar and use the splat value.
APInt Constant = APInt::getZero(1);
- if (const ConstantSDNode *C = isConstOrConstSplat(N1)) {
+ if (const ConstantSDNode *C = isConstOrConstSplat(
+ N1, /*AllowUndef=*/false, /*AllowTruncation=*/true)) {
Constant = C->getAPIntValue();
} else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
APInt SplatValue, SplatUndef;
@@ -6339,18 +6363,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// fold (and (load x), 255) -> (zextload x, i8)
// fold (and (extload x, i16), 255) -> (zextload x, i8)
- // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
- if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD ||
- (N0.getOpcode() == ISD::ANY_EXTEND &&
- N0.getOperand(0).getOpcode() == ISD::LOAD))) {
- if (SDValue Res = reduceLoadWidth(N)) {
- LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND
- ? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0);
- AddToWorklist(N);
- DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 0), Res);
- return SDValue(N, 0);
- }
- }
+ if (N1C && N0.getOpcode() == ISD::LOAD && !VT.isVector())
+ if (SDValue Res = reduceLoadWidth(N))
+ return Res;
if (LegalTypes) {
// Attempt to propagate the AND back up to the leaves which, if they're
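The simplified guard keeps the core load-narrowing fold: masking a wider load with 255 reads the same byte that a zero-extending i8 load would. A standalone sketch with hypothetical memory contents, asserting only on a little-endian host since which byte is selected depends on endianness:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  unsigned char mem[4] = {0xAB, 0xCD, 0xEF, 0x12}; // hypothetical memory
  uint32_t wide;
  std::memcpy(&wide, mem, sizeof(wide)); // the original i32 load
  uint32_t narrow = mem[0];              // a zero-extending i8 load
  uint32_t probe = 1;
  bool littleEndian = *reinterpret_cast<unsigned char *>(&probe) == 1;
  if (littleEndian)
    assert((wide & 0xFFu) == narrow); // (and (load x), 255) == (zextload x, i8)
  return 0;
}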
@@ -6856,20 +6871,23 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
}
/// OR combines for which the commuted variant will be tried as well.
-static SDValue visitORCommutative(
- SelectionDAG &DAG, SDValue N0, SDValue N1, SDNode *N) {
+static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
+ SDNode *N) {
EVT VT = N0.getValueType();
if (N0.getOpcode() == ISD::AND) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+
// fold (or (and X, (xor Y, -1)), Y) -> (or X, Y)
// TODO: Set AllowUndefs = true.
- if (getBitwiseNotOperand(N0.getOperand(1), N0.getOperand(0),
+ if (getBitwiseNotOperand(N01, N00,
/* AllowUndefs */ false) == N1)
- return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(0), N1);
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, N00, N1);
// fold (or (and (xor Y, -1), X), Y) -> (or X, Y)
- if (getBitwiseNotOperand(N0.getOperand(0), N0.getOperand(1),
+ if (getBitwiseNotOperand(N00, N01,
/* AllowUndefs */ false) == N1)
- return DAG.getNode(ISD::OR, SDLoc(N), VT, N0.getOperand(1), N1);
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, N01, N1);
}
if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG))
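The renamed N00/N01 operands feed the same two folds; the underlying Boolean identity is easy to verify exhaustively on small values (illustrative C++):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 256; ++x)
    for (uint32_t y = 0; y < 256; ++y) {
      assert(((x & ~y) | y) == (x | y)); // (or (and X, (xor Y, -1)), Y)
      assert(((~y & x) | y) == (x | y)); // (or (and (xor Y, -1), X), Y)
    }
  return 0;
}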
@@ -7915,7 +7933,7 @@ SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
int64_t FirstOffset = INT64_MAX;
StoreSDNode *FirstStore = nullptr;
Optional<BaseIndexOffset> Base;
- for (auto Store : Stores) {
+ for (auto *Store : Stores) {
// All the stores store different parts of the CombinedValue. A truncate is
// required to get the partial value.
SDValue Trunc = Store->getValue();
@@ -8488,28 +8506,6 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
return DAG.getNode(ISD::AND, DL, VT, NotX, N1);
}
- if ((N0Opcode == ISD::SRL || N0Opcode == ISD::SHL) && N0.hasOneUse()) {
- ConstantSDNode *XorC = isConstOrConstSplat(N1);
- ConstantSDNode *ShiftC = isConstOrConstSplat(N0.getOperand(1));
- unsigned BitWidth = VT.getScalarSizeInBits();
- if (XorC && ShiftC) {
- // Don't crash on an oversized shift. We can not guarantee that a bogus
- // shift has been simplified to undef.
- uint64_t ShiftAmt = ShiftC->getLimitedValue();
- if (ShiftAmt < BitWidth) {
- APInt Ones = APInt::getAllOnes(BitWidth);
- Ones = N0Opcode == ISD::SHL ? Ones.shl(ShiftAmt) : Ones.lshr(ShiftAmt);
- if (XorC->getAPIntValue() == Ones) {
- // If the xor constant is a shifted -1, do a 'not' before the shift:
- // xor (X << ShiftC), XorC --> (not X) << ShiftC
- // xor (X >> ShiftC), XorC --> (not X) >> ShiftC
- SDValue Not = DAG.getNOT(DL, N0.getOperand(0), VT);
- return DAG.getNode(N0Opcode, DL, VT, Not, N0.getOperand(1));
- }
- }
- }
- }
-
// fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
SDValue A = N0Opcode == ISD::ADD ? N0 : N1;
@@ -11817,6 +11813,9 @@ SDValue DAGCombiner::foldSextSetcc(SDNode *N) {
EVT N00VT = N00.getValueType();
SDLoc DL(N);
+ // Propagate fast-math-flags.
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
+
// On some architectures (such as SSE/NEON/etc) the SETCC result type is
// the same size as the compared operands. Try to optimize sext(setcc())
// if this is the case.
@@ -12358,6 +12357,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
return V;
if (N0.getOpcode() == ISD::SETCC) {
+ // Propagate fast-math-flags.
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
+
// Only do this before legalize for now.
if (!LegalOperations && VT.isVector() &&
N0.getValueType().getVectorElementType() == MVT::i1) {
@@ -12549,6 +12551,9 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
}
if (N0.getOpcode() == ISD::SETCC) {
+ // Propagate fast-math-flags.
+ SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
+
// For vectors:
// aext(setcc) -> vsetcc
// aext(setcc) -> truncate(vsetcc)
@@ -13155,6 +13160,19 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
return N0.getOperand(0);
}
+ // Try to narrow a truncate-of-sext_in_reg to the destination type:
+ // trunc (sign_ext_inreg X, iM) to iN --> sign_ext_inreg (trunc X to iN), iM
+ if (!LegalTypes && N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ N0.hasOneUse()) {
+ SDValue X = N0.getOperand(0);
+ SDValue ExtVal = N0.getOperand(1);
+ EVT ExtVT = cast<VTSDNode>(ExtVal)->getVT();
+ if (ExtVT.bitsLT(VT)) {
+ SDValue TrX = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, X);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, TrX, ExtVal);
+ }
+ }
+
// If this is anyext(trunc), don't fold it, allow ourselves to be folded.
if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND))
return SDValue();
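The new truncate-of-sign_ext_inreg narrowing is sound because only the low M bits matter on both sides of the identity. A small sketch with M = 8 inside i64/i32 (hypothetical sample values, not DAG code):

#include <cassert>
#include <cstdint>

// trunc (sign_ext_inreg X, i8) to i32  ==  sign_ext_inreg (trunc X to i32), i8
static uint32_t lhsForm(uint64_t x) {
  int64_t sext = int64_t(int8_t(x));    // sign_ext_inreg X, i8 (in 64 bits)
  return uint32_t(sext);                // trunc to i32
}
static uint32_t rhsForm(uint64_t x) {
  uint32_t tr = uint32_t(x);            // trunc X to i32
  return uint32_t(int32_t(int8_t(tr))); // sign_ext_inreg in 32 bits
}

int main() {
  uint64_t samples[] = {0, 1, 0x7F, 0x80, 0xFF, 0x1234567890ABCDEFull, ~0ull};
  for (uint64_t x : samples)
    assert(lhsForm(x) == rhsForm(x));
  return 0;
}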
@@ -19478,7 +19496,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
return Shuf;
// Handle <1 x ???> vector insertion special cases.
- if (VT.getVectorNumElements() == 1) {
+ if (NumElts == 1) {
// insert_vector_elt(x, extract_vector_elt(y, 0), 0) -> y
if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
InVal.getOperand(0).getValueType() == VT &&
@@ -19506,80 +19524,77 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
}
}
- // Attempt to fold the insertion into a legal BUILD_VECTOR.
+ // Attempt to convert an insert_vector_elt chain into a legal build_vector.
if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
- auto UpdateBuildVector = [&](SmallVectorImpl<SDValue> &Ops) {
- assert(Ops.size() == NumElts && "Unexpected vector size");
-
- // Insert the element
- if (Elt < Ops.size()) {
- // All the operands of BUILD_VECTOR must have the same type;
- // we enforce that here.
- EVT OpVT = Ops[0].getValueType();
- Ops[Elt] =
- OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal;
+ // vXi1 vector - we don't need to recurse.
+ if (NumElts == 1)
+ return DAG.getBuildVector(VT, DL, {InVal});
+
+ // If we haven't already collected the element, insert into the op list.
+ EVT MaxEltVT = InVal.getValueType();
+ auto AddBuildVectorOp = [&](SmallVectorImpl<SDValue> &Ops, SDValue Elt,
+ unsigned Idx) {
+ if (!Ops[Idx]) {
+ Ops[Idx] = Elt;
+ if (VT.isInteger()) {
+ EVT EltVT = Elt.getValueType();
+ MaxEltVT = MaxEltVT.bitsGE(EltVT) ? MaxEltVT : EltVT;
+ }
}
+ };
- // Return the new vector
+ // Ensure all the operands have the same value type, fill any missing
+ // operands with UNDEF, and create the BUILD_VECTOR.
+ auto CanonicalizeBuildVector = [&](SmallVectorImpl<SDValue> &Ops) {
+ assert(Ops.size() == NumElts && "Unexpected vector size");
+ for (SDValue &Op : Ops) {
+ if (Op)
+ Op = VT.isInteger() ? DAG.getAnyExtOrTrunc(Op, DL, MaxEltVT) : Op;
+ else
+ Op = DAG.getUNDEF(MaxEltVT);
+ }
return DAG.getBuildVector(VT, DL, Ops);
};
- // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
- // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
- // vector elements.
- SmallVector<SDValue, 8> Ops;
+ SmallVector<SDValue, 8> Ops(NumElts, SDValue());
+ Ops[Elt] = InVal;
- // Do not combine these two vectors if the output vector will not replace
- // the input vector.
- if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) {
- Ops.append(InVec->op_begin(), InVec->op_end());
- return UpdateBuildVector(Ops);
- }
+ // Recurse up an INSERT_VECTOR_ELT chain to build a BUILD_VECTOR.
+ for (SDValue CurVec = InVec; CurVec;) {
+ // UNDEF - build new BUILD_VECTOR from already inserted operands.
+ if (CurVec.isUndef())
+ return CanonicalizeBuildVector(Ops);
- if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR && InVec.hasOneUse()) {
- Ops.push_back(InVec.getOperand(0));
- Ops.append(NumElts - 1, DAG.getUNDEF(InVec.getOperand(0).getValueType()));
- return UpdateBuildVector(Ops);
- }
+ // BUILD_VECTOR - insert unused operands and build new BUILD_VECTOR.
+ if (CurVec.getOpcode() == ISD::BUILD_VECTOR && CurVec.hasOneUse()) {
+ for (unsigned I = 0; I != NumElts; ++I)
+ AddBuildVectorOp(Ops, CurVec.getOperand(I), I);
+ return CanonicalizeBuildVector(Ops);
+ }
- if (InVec.isUndef()) {
- Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType()));
- return UpdateBuildVector(Ops);
- }
+ // SCALAR_TO_VECTOR - insert unused scalar and build new BUILD_VECTOR.
+ if (CurVec.getOpcode() == ISD::SCALAR_TO_VECTOR && CurVec.hasOneUse()) {
+ AddBuildVectorOp(Ops, CurVec.getOperand(0), 0);
+ return CanonicalizeBuildVector(Ops);
+ }
- // If we're inserting into the end of a vector as part of an sequence, see
- // if we can create a BUILD_VECTOR by following the sequence back up the
- // chain.
- if (Elt == (NumElts - 1)) {
- SmallVector<SDValue> ReverseInsertions;
- ReverseInsertions.push_back(InVal);
-
- EVT MaxEltVT = InVal.getValueType();
- SDValue CurVec = InVec;
- for (unsigned I = 1; I != NumElts; ++I) {
- if (CurVec.getOpcode() != ISD::INSERT_VECTOR_ELT || !CurVec.hasOneUse())
- break;
+ // INSERT_VECTOR_ELT - insert operand and continue up the chain.
+ if (CurVec.getOpcode() == ISD::INSERT_VECTOR_ELT && CurVec.hasOneUse())
+ if (auto *CurIdx = dyn_cast<ConstantSDNode>(CurVec.getOperand(2)))
+ if (CurIdx->getAPIntValue().ult(NumElts)) {
+ unsigned Idx = CurIdx->getZExtValue();
+ AddBuildVectorOp(Ops, CurVec.getOperand(1), Idx);
- auto *CurIdx = dyn_cast<ConstantSDNode>(CurVec.getOperand(2));
- if (!CurIdx || CurIdx->getAPIntValue() != ((NumElts - 1) - I))
- break;
- SDValue CurVal = CurVec.getOperand(1);
- ReverseInsertions.push_back(CurVal);
- if (VT.isInteger()) {
- EVT CurValVT = CurVal.getValueType();
- MaxEltVT = MaxEltVT.bitsGE(CurValVT) ? MaxEltVT : CurValVT;
- }
- CurVec = CurVec.getOperand(0);
- }
+ // Found entire BUILD_VECTOR.
+ if (all_of(Ops, [](SDValue Op) { return !!Op; }))
+ return CanonicalizeBuildVector(Ops);
- if (ReverseInsertions.size() == NumElts) {
- for (unsigned I = 0; I != NumElts; ++I) {
- SDValue Val = ReverseInsertions[(NumElts - 1) - I];
- Val = VT.isInteger() ? DAG.getAnyExtOrTrunc(Val, DL, MaxEltVT) : Val;
- Ops.push_back(Val);
- }
- return DAG.getBuildVector(VT, DL, Ops);
- }
+ CurVec = CurVec->getOperand(0);
+ continue;
+ }
+
+ // Failed to find a match in the chain - bail.
+ break;
}
}
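The rewritten walk records, per lane, the newest insert it sees while climbing the chain, and finishes once every lane is covered or an UNDEF/BUILD_VECTOR/SCALAR_TO_VECTOR base is reached. A toy model of the per-lane bookkeeping (plain C++ with hypothetical lane/value pairs, not DAG code):

#include <array>
#include <cassert>
#include <optional>
#include <utility>
#include <vector>

int main() {
  // Insert chain, outermost (newest) first: lane 2 = 30 over (lane 0 = 99 over
  // (lane 0 = 10 over (lane 1 = 20 over undef))).
  std::vector<std::pair<unsigned, int>> chain = {{2, 30}, {0, 99}, {0, 10}, {1, 20}};
  std::array<std::optional<int>, 3> ops; // one slot per vector lane
  for (auto [lane, val] : chain)
    if (!ops[lane])
      ops[lane] = val; // only the newest insert per lane survives
  // The collapsed "build_vector" holds the newest value written to each lane.
  assert(ops[0] == 99 && ops[1] == 20 && ops[2] == 30);
  return 0;
}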
@@ -22643,6 +22658,56 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
}
}
+ // If we're not performing a select/blend shuffle, see if we can convert the
+ // shuffle into an AND node, where all the out-of-lane elements are known zero.
+ if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
+ bool IsInLaneMask = true;
+ ArrayRef<int> Mask = SVN->getMask();
+ SmallVector<int, 16> ClearMask(NumElts, -1);
+ APInt DemandedLHS = APInt::getNullValue(NumElts);
+ APInt DemandedRHS = APInt::getNullValue(NumElts);
+ for (int I = 0; I != (int)NumElts; ++I) {
+ int M = Mask[I];
+ if (M < 0)
+ continue;
+ ClearMask[I] = M == I ? I : (I + NumElts);
+ IsInLaneMask &= (M == I) || (M == (int)(I + NumElts));
+ if (M != I) {
+ APInt &Demanded = M < (int)NumElts ? DemandedLHS : DemandedRHS;
+ Demanded.setBit(M % NumElts);
+ }
+ }
+ // TODO: Should we try to mask with N1 as well?
+ if (!IsInLaneMask &&
+ (!DemandedLHS.isNullValue() || !DemandedRHS.isNullValue()) &&
+ (DemandedLHS.isNullValue() ||
+ DAG.MaskedVectorIsZero(N0, DemandedLHS)) &&
+ (DemandedRHS.isNullValue() ||
+ DAG.MaskedVectorIsZero(N1, DemandedRHS))) {
+ SDLoc DL(N);
+ EVT IntVT = VT.changeVectorElementTypeToInteger();
+ EVT IntSVT = VT.getVectorElementType().changeTypeToInteger();
+ SDValue ZeroElt = DAG.getConstant(0, DL, IntSVT);
+ SDValue AllOnesElt = DAG.getAllOnesConstant(DL, IntSVT);
+ SmallVector<SDValue, 16> AndMask(NumElts, DAG.getUNDEF(IntSVT));
+ for (int I = 0; I != (int)NumElts; ++I)
+ if (0 <= Mask[I])
+ AndMask[I] = Mask[I] == I ? AllOnesElt : ZeroElt;
+
+ // See if a clear mask is legal instead of going via
+ // XformToShuffleWithZero which loses UNDEF mask elements.
+ if (TLI.isVectorClearMaskLegal(ClearMask, IntVT))
+ return DAG.getBitcast(
+ VT, DAG.getVectorShuffle(IntVT, DL, DAG.getBitcast(IntVT, N0),
+ DAG.getConstant(0, DL, IntVT), ClearMask));
+
+ if (TLI.isOperationLegalOrCustom(ISD::AND, IntVT))
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::AND, DL, IntVT, DAG.getBitcast(IntVT, N0),
+ DAG.getBuildVector(IntVT, DL, AndMask)));
+ }
+ }
+
// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
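The shuffle-to-AND conversion rests on a per-lane identity: if every out-of-place lane pulls a value proven to be zero, the shuffle behaves like an AND with a per-lane all-ones/zero mask. A standalone sketch with made-up lane values:

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<uint32_t, 4> n0 = {0x11111111u, 0x22222222u, 0x33333333u, 0x44444444u};
  std::array<uint32_t, 4> zero = {0, 0, 0, 0}; // second operand, lanes proven zero
  // Mask: a lane either stays in place (I) or pulls a zero lane (I + 4).
  std::array<int, 4> mask = {0, 5, 2, 7};
  std::array<uint32_t, 4> shuffled{}, anded{};
  for (int i = 0; i < 4; ++i) {
    shuffled[i] = mask[i] < 4 ? n0[mask[i]] : zero[mask[i] - 4];
    uint32_t andMask = (mask[i] == i) ? 0xFFFFFFFFu : 0u; // all-ones or zero
    anded[i] = n0[i] & andMask;
  }
  assert(shuffled == anded);
  return 0;
}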
@@ -23385,10 +23450,14 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
int Index0, Index1;
SDValue Src0 = DAG.getSplatSourceVector(N0, Index0);
SDValue Src1 = DAG.getSplatSourceVector(N1, Index1);
+ // Extract element from splat_vector should be free.
+ // TODO: use DAG.isSplatValue instead?
+ bool IsBothSplatVector = N0.getOpcode() == ISD::SPLAT_VECTOR &&
+ N1.getOpcode() == ISD::SPLAT_VECTOR;
if (!Src0 || !Src1 || Index0 != Index1 ||
Src0.getValueType().getVectorElementType() != EltVT ||
Src1.getValueType().getVectorElementType() != EltVT ||
- !TLI.isExtractVecEltCheap(VT, Index0) ||
+ !(IsBothSplatVector || TLI.isExtractVecEltCheap(VT, Index0)) ||
!TLI.isOperationLegalOrCustom(Opcode, EltVT))
return SDValue();
@@ -23410,6 +23479,8 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
}
// bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index
+ if (VT.isScalableVector())
+ return DAG.getSplatVector(VT, DL, ScalarBO);
SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), ScalarBO);
return DAG.getBuildVector(VT, DL, Ops);
}
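The splat-aware path uses the usual scalarization identity: applying a binary op lane-wise to two splats equals splatting the scalar result. A minimal check (plain C++, with addition chosen as the example op):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t x = 7, y = 13;
  std::array<uint32_t, 4> splatX, splatY, lanewise, splatOfSum;
  splatX.fill(x);
  splatY.fill(y);
  for (int i = 0; i < 4; ++i)
    lanewise[i] = splatX[i] + splatY[i]; // bo (splat X), (splat Y)
  splatOfSum.fill(x + y);                // splat (bo X, Y)
  assert(lanewise == splatOfSum);
  return 0;
}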
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 8bdc9410d131..56d35dfe8701 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1404,17 +1404,21 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
}
SDValue NewLoad;
+ Align ElementAlignment =
+ std::min(cast<StoreSDNode>(Ch)->getAlign(),
+ DAG.getDataLayout().getPrefTypeAlign(
+ Op.getValueType().getTypeForEVT(*DAG.getContext())));
if (Op.getValueType().isVector()) {
StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, VecVT,
Op.getValueType(), Idx);
- NewLoad =
- DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, MachinePointerInfo());
+ NewLoad = DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr,
+ MachinePointerInfo(), ElementAlignment);
} else {
StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr,
- MachinePointerInfo(),
- VecVT.getVectorElementType());
+ MachinePointerInfo(), VecVT.getVectorElementType(),
+ ElementAlignment);
}
// Replace the chain going out of the store, by the one out of the load.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 6c136bdfc652..b2df67f45c72 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -2918,6 +2918,9 @@ bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) {
case ISD::STACKMAP:
Res = SoftPromoteHalfOp_STACKMAP(N, OpNo);
break;
+ case ISD::PATCHPOINT:
+ Res = SoftPromoteHalfOp_PATCHPOINT(N, OpNo);
+ break;
}
if (!Res.getNode())
@@ -3059,3 +3062,18 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfOp_STACKMAP(SDNode *N, unsigned OpNo) {
return SDValue(); // Signal that we replaced the node ourselves.
}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_PATCHPOINT(SDNode *N,
+ unsigned OpNo) {
+ assert(OpNo >= 7);
+ SmallVector<SDValue> NewOps(N->ops().begin(), N->ops().end());
+ SDValue Op = N->getOperand(OpNo);
+ NewOps[OpNo] = GetSoftPromotedHalf(Op);
+ SDValue NewNode =
+ DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), NewOps);
+
+ for (unsigned ResNum = 0; ResNum < N->getNumValues(); ResNum++)
+ ReplaceValueWith(SDValue(N, ResNum), NewNode.getValue(ResNum));
+
+ return SDValue(); // Signal that we replaced the node ourselves.
+}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 343722a97c3c..228d4a43ccde 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -1727,6 +1727,13 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::STACKMAP:
Res = PromoteIntOp_STACKMAP(N, OpNo);
break;
+ case ISD::PATCHPOINT:
+ Res = PromoteIntOp_PATCHPOINT(N, OpNo);
+ break;
+ case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
+ case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
+ Res = PromoteIntOp_VP_STRIDED(N, OpNo);
+ break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -2341,6 +2348,25 @@ SDValue DAGTypeLegalizer::PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo) {
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
}
+SDValue DAGTypeLegalizer::PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
+ assert(OpNo >= 7);
+ SmallVector<SDValue> NewOps(N->ops().begin(), N->ops().end());
+ SDValue Operand = N->getOperand(OpNo);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Operand.getValueType());
+ NewOps[OpNo] = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), NVT, Operand);
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) {
+ assert((N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD && OpNo == 3) ||
+ (N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE && OpNo == 4));
+
+ SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
+ NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo));
+
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
//===----------------------------------------------------------------------===//
// Integer Result Expansion
//===----------------------------------------------------------------------===//
@@ -2886,11 +2912,15 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
if (N->getOpcode() == ISD::ADD) {
Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps);
HiOps[2] = Lo.getValue(1);
- Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, HiOps);
+ Hi = DAG.computeKnownBits(HiOps[2]).isZero()
+ ? DAG.getNode(ISD::UADDO, dl, VTList, makeArrayRef(HiOps, 2))
+ : DAG.getNode(ISD::ADDCARRY, dl, VTList, HiOps);
} else {
Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps);
HiOps[2] = Lo.getValue(1);
- Hi = DAG.getNode(ISD::SUBCARRY, dl, VTList, HiOps);
+ Hi = DAG.computeKnownBits(HiOps[2]).isZero()
+ ? DAG.getNode(ISD::USUBO, dl, VTList, makeArrayRef(HiOps, 2))
+ : DAG.getNode(ISD::SUBCARRY, dl, VTList, HiOps);
}
return;
}
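Relaxing ADDCARRY/SUBCARRY to UADDO/USUBO is valid when the incoming carry/borrow is known to be zero, because the carry input then contributes nothing to the high half. A sketch of the high-half computation under that assumption (illustrative helper functions, not the actual DAG nodes):

#include <cassert>
#include <cstdint>

// Model of addcarry: sum plus carry-in, reporting the outgoing carry.
static uint64_t addcarry(uint64_t a, uint64_t b, uint64_t carryIn, uint64_t &carryOut) {
  uint64_t sum = a + b;
  carryOut = (sum < a) || (sum + carryIn < sum);
  return sum + carryIn;
}
// Model of uaddo: plain add, reporting unsigned overflow.
static uint64_t uaddo(uint64_t a, uint64_t b, uint64_t &carryOut) {
  uint64_t sum = a + b;
  carryOut = sum < a;
  return sum;
}

int main() {
  uint64_t hiA = 0x0123456789ABCDEFull, hiB = 0xFEDCBA9876543210ull;
  uint64_t c1, c2;
  // With a known-zero carry-in, both expansions of the high half agree.
  assert(addcarry(hiA, hiB, /*carryIn=*/0, c1) == uaddo(hiA, hiB, c2));
  assert(c1 == c2);
  return 0;
}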
@@ -4693,6 +4723,13 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::STACKMAP:
Res = ExpandIntOp_STACKMAP(N, OpNo);
break;
+ case ISD::PATCHPOINT:
+ Res = ExpandIntOp_PATCHPOINT(N, OpNo);
+ break;
+ case ISD::EXPERIMENTAL_VP_STRIDED_LOAD:
+ case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
+ Res = ExpandIntOp_VP_STRIDED(N, OpNo);
+ break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -5108,6 +5145,17 @@ SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
return Swap.getValue(1);
}
+SDValue DAGTypeLegalizer::ExpandIntOp_VP_STRIDED(SDNode *N, unsigned OpNo) {
+ assert((N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_LOAD && OpNo == 3) ||
+ (N->getOpcode() == ISD::EXPERIMENTAL_VP_STRIDED_STORE && OpNo == 4));
+
+ SDValue Hi; // The upper half is discarded.
+ SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
+ GetExpandedInteger(NewOps[OpNo], NewOps[OpNo], Hi);
+
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) {
SDLoc dl(N);
@@ -5253,21 +5301,28 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) {
assert(NOutVT.isVector() && "This type must be promoted to a vector type");
unsigned NumElems = N->getNumOperands();
EVT NOutVTElem = NOutVT.getVectorElementType();
-
+ TargetLoweringBase::BooleanContent NOutBoolType = TLI.getBooleanContents(NOutVT);
+ unsigned NOutExtOpc = TargetLowering::getExtendForContent(NOutBoolType);
SDLoc dl(N);
SmallVector<SDValue, 8> Ops;
Ops.reserve(NumElems);
for (unsigned i = 0; i != NumElems; ++i) {
- SDValue Op;
+ SDValue Op = N->getOperand(i);
+ EVT OpVT = Op.getValueType();
// BUILD_VECTOR integer operand types are allowed to be larger than the
// result's element type. This may still be true after the promotion. For
// example, we might be promoting (<v?i1> = BV <i32>, <i32>, ...) to
// (v?i16 = BV <i32>, <i32>, ...), and we can't any_extend <i32> to <i16>.
- if (N->getOperand(i).getValueType().bitsLT(NOutVTElem))
- Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(i));
- else
- Op = N->getOperand(i);
+ if (OpVT.bitsLT(NOutVTElem)) {
+ unsigned ExtOpc = ISD::ANY_EXTEND;
+ // Attempt to extend constant bool vectors to match target's BooleanContent.
+ // While not necessary, this improves chances of the constant correctly
+ // folding with compare results (e.g. for NOT patterns).
+ if (OpVT == MVT::i1 && Op.getOpcode() == ISD::Constant)
+ ExtOpc = NOutExtOpc;
+ Op = DAG.getNode(ExtOpc, dl, NOutVTElem, Op);
+ }
Ops.push_back(Op);
}
@@ -5524,30 +5579,67 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
SDValue DAGTypeLegalizer::ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo) {
assert(OpNo > 1);
-
SDValue Op = N->getOperand(OpNo);
- SDLoc DL = SDLoc(N);
+
+ // FIXME: Non-constant operands are not yet handled:
+ // - https://github.com/llvm/llvm-project/issues/26431
+ // - https://github.com/llvm/llvm-project/issues/55957
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op);
+ if (!CN)
+ return SDValue();
+
+ // Copy operands before the one being expanded.
SmallVector<SDValue> NewOps;
+ for (unsigned I = 0; I < OpNo; I++)
+ NewOps.push_back(N->getOperand(I));
+
+ EVT Ty = Op.getValueType();
+ SDLoc DL = SDLoc(N);
+ if (CN->getConstantIntValue()->getValue().getActiveBits() < 64) {
+ NewOps.push_back(
+ DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
+ NewOps.push_back(DAG.getTargetConstant(CN->getZExtValue(), DL, Ty));
+ } else {
+ // FIXME: https://github.com/llvm/llvm-project/issues/55609
+ return SDValue();
+ }
+
+ // Copy remaining operands.
+ for (unsigned I = OpNo + 1; I < N->getNumOperands(); I++)
+ NewOps.push_back(N->getOperand(I));
+
+ SDValue NewNode = DAG.getNode(N->getOpcode(), DL, N->getVTList(), NewOps);
+
+ for (unsigned ResNum = 0; ResNum < N->getNumValues(); ResNum++)
+ ReplaceValueWith(SDValue(N, ResNum), NewNode.getValue(ResNum));
+
+ return SDValue(); // Signal that we have replaced the node already.
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
+ assert(OpNo >= 7);
+ SDValue Op = N->getOperand(OpNo);
+
+ // FIXME: Non-constant operands are not yet handled:
+ // - https://github.com/llvm/llvm-project/issues/26431
+ // - https://github.com/llvm/llvm-project/issues/55957
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op);
+ if (!CN)
+ return SDValue();
// Copy operands before the one being expanded.
+ SmallVector<SDValue> NewOps;
for (unsigned I = 0; I < OpNo; I++)
NewOps.push_back(N->getOperand(I));
- if (Op->getOpcode() == ISD::Constant) {
- ConstantSDNode *CN = cast<ConstantSDNode>(Op);
- EVT Ty = Op.getValueType();
- if (CN->getConstantIntValue()->getValue().getActiveBits() < 64) {
- NewOps.push_back(
- DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
- NewOps.push_back(DAG.getTargetConstant(CN->getZExtValue(), DL, Ty));
- } else {
- // FIXME: https://github.com/llvm/llvm-project/issues/55609
- return SDValue();
- }
+ EVT Ty = Op.getValueType();
+ SDLoc DL = SDLoc(N);
+ if (CN->getConstantIntValue()->getValue().getActiveBits() < 64) {
+ NewOps.push_back(
+ DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
+ NewOps.push_back(DAG.getTargetConstant(CN->getZExtValue(), DL, Ty));
} else {
- // FIXME: Non-constant operands are not yet handled:
- // - https://github.com/llvm/llvm-project/issues/26431
- // - https://github.com/llvm/llvm-project/issues/55957
+ // FIXME: https://github.com/llvm/llvm-project/issues/55609
return SDValue();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 2807b7f5ae68..6696b79cf885 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -403,6 +403,8 @@ private:
SDValue PromoteIntOp_VP_REDUCE(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_SET_ROUNDING(SDNode *N);
SDValue PromoteIntOp_STACKMAP(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_PATCHPOINT(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
@@ -495,6 +497,8 @@ private:
SDValue ExpandIntOp_ATOMIC_STORE(SDNode *N);
SDValue ExpandIntOp_SPLAT_VECTOR(SDNode *N);
SDValue ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo);
+ SDValue ExpandIntOp_PATCHPOINT(SDNode *N, unsigned OpNo);
+ SDValue ExpandIntOp_VP_STRIDED(SDNode *N, unsigned OpNo);
void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
ISD::CondCode &CCCode, const SDLoc &dl);
@@ -744,6 +748,7 @@ private:
SDValue SoftPromoteHalfOp_SELECT_CC(SDNode *N, unsigned OpNo);
SDValue SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo);
SDValue SoftPromoteHalfOp_STACKMAP(SDNode *N, unsigned OpNo);
+ SDValue SoftPromoteHalfOp_PATCHPOINT(SDNode *N, unsigned OpNo);
//===--------------------------------------------------------------------===//
// Scalarization Support: LegalizeVectorTypes.cpp
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 842ffa2aa23e..f5a1eae1e7fe 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -737,6 +737,20 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::SELECT:
Results.push_back(ExpandSELECT(Node));
return;
+ case ISD::SELECT_CC: {
+ if (Node->getValueType(0).isScalableVector()) {
+ EVT CondVT = TLI.getSetCCResultType(
+ DAG.getDataLayout(), *DAG.getContext(), Node->getValueType(0));
+ SDValue SetCC =
+ DAG.getNode(ISD::SETCC, SDLoc(Node), CondVT, Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(4));
+ Results.push_back(DAG.getSelect(SDLoc(Node), Node->getValueType(0), SetCC,
+ Node->getOperand(2),
+ Node->getOperand(3)));
+ return;
+ }
+ break;
+ }
case ISD::FP_TO_UINT:
ExpandFP_TO_UINT(Node, Results);
return;
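The scalable-vector SELECT_CC expansion uses the standard decomposition into a compare followed by a select. A scalar sketch of the equivalence (SETLT chosen as an example condition code):

#include <cassert>

// select_cc (a, b, t, f, setlt)  ==  select (setcc (a, b, setlt), t, f)
static int select_cc_lt(int a, int b, int t, int f) { return a < b ? t : f; }
static int select_via_setcc(int a, int b, int t, int f) {
  bool cond = (a < b); // setcc a, b, setlt
  return cond ? t : f; // select cond, t, f
}

int main() {
  int vals[] = {-3, 0, 1, 7};
  for (int a : vals)
    for (int b : vals)
      assert(select_cc_lt(a, b, 10, 20) == select_via_setcc(a, b, 10, 20));
  return 0;
}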
@@ -833,6 +847,16 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
return;
}
break;
+ case ISD::FP_TO_SINT_SAT:
+ case ISD::FP_TO_UINT_SAT:
+ // Expand the FP_TO_*INT_SAT if it is scalable to prevent it from unrolling below.
+ if (Node->getValueType(0).isScalableVector()) {
+ if (SDValue Expanded = TLI.expandFP_TO_INT_SAT(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ }
+ break;
case ISD::SMULFIX:
case ISD::UMULFIX:
if (SDValue Expanded = TLI.expandFixedPointMul(Node, DAG)) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 78fc407e9573..3ac2a7bddc5a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -793,7 +793,7 @@ ScheduleDAGLinearize::EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
// Emit any debug values associated with the node.
if (N->getHasDebugValue()) {
MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
- for (auto DV : DAG->GetDbgValues(N)) {
+ for (auto *DV : DAG->GetDbgValues(N)) {
if (!DV->isEmitted())
if (auto *DbgMI = Emitter.EmitDbgValue(DV, VRBaseMap))
BB->insert(InsertPos, DbgMI);
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 2a10157b404e..5166db033c62 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -749,7 +749,7 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
// source order number as N.
MachineBasicBlock *BB = Emitter.getBlock();
MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
- for (auto DV : DAG->GetDbgValues(N)) {
+ for (auto *DV : DAG->GetDbgValues(N)) {
if (DV->isEmitted())
continue;
unsigned DVOrder = DV->getOrder();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c8d0f5faf647..441437351852 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -24,6 +24,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
@@ -602,7 +603,7 @@ static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
static void AddNodeIDOperands(FoldingSetNodeID &ID,
ArrayRef<SDValue> Ops) {
- for (auto& Op : Ops) {
+ for (const auto &Op : Ops) {
ID.AddPointer(Op.getNode());
ID.AddInteger(Op.getResNo());
}
@@ -611,7 +612,7 @@ static void AddNodeIDOperands(FoldingSetNodeID &ID,
/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
static void AddNodeIDOperands(FoldingSetNodeID &ID,
ArrayRef<SDUse> Ops) {
- for (auto& Op : Ops) {
+ for (const auto &Op : Ops) {
ID.AddPointer(Op.getNode());
ID.AddInteger(Op.getResNo());
}
@@ -2711,16 +2712,9 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
SubDemandedElts &= ScaledDemandedElts;
if (!isSplatValue(Src, SubDemandedElts, SubUndefElts, Depth + 1))
return false;
-
- // Here we can't do "MatchAnyBits" operation merge for undef bits.
- // Because some operation only use part value of the source.
- // Take llvm.fshl.* for example:
- // t1: v4i32 = Constant:i32<12>, undef:i32, Constant:i32<12>, undef:i32
- // t2: v2i64 = bitcast t1
- // t5: v2i64 = fshl t3, t4, t2
- // We can not convert t2 to {i64 undef, i64 undef}
- UndefElts |= APIntOps::ScaleBitMask(SubUndefElts, NumElts,
- /*MatchAllBits=*/true);
+ // TODO: Add support for merging sub undef elements.
+ if (!SubUndefElts.isZero())
+ return false;
}
return true;
}
@@ -2947,6 +2941,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
+ case ISD::MERGE_VALUES:
+ return computeKnownBits(Op.getOperand(Op.getResNo()), DemandedElts,
+ Depth + 1);
case ISD::BUILD_VECTOR:
// Collect the known bits that are shared by every demanded vector element.
Known.Zero.setAllBits(); Known.One.setAllBits();
@@ -3219,12 +3216,6 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known = KnownBits::mulhs(Known, Known2);
break;
}
- case ISD::UDIV: {
- Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
- Known = KnownBits::udiv(Known, Known2);
- break;
- }
case ISD::AVGCEILU: {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -3339,6 +3330,38 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known.Zero |= Known2.Zero;
}
break;
+ case ISD::SHL_PARTS:
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS: {
+ assert((Op.getResNo() == 0 || Op.getResNo() == 1) && "Unknown result");
+
+ // Collect lo/hi source values and concatenate.
+ // TODO: Would a KnownBits::concatBits helper be useful?
+ unsigned LoBits = Op.getOperand(0).getScalarValueSizeInBits();
+ unsigned HiBits = Op.getOperand(1).getScalarValueSizeInBits();
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = Known.anyext(LoBits + HiBits);
+ Known.insertBits(Known2, LoBits);
+
+ // Collect shift amount.
+ Known2 = computeKnownBits(Op.getOperand(2), DemandedElts, Depth + 1);
+
+ if (Opcode == ISD::SHL_PARTS)
+ Known = KnownBits::shl(Known, Known2);
+ else if (Opcode == ISD::SRA_PARTS)
+ Known = KnownBits::ashr(Known, Known2);
+ else // if (Opcode == ISD::SRL_PARTS)
+ Known = KnownBits::lshr(Known, Known2);
+
+ // TODO: Minimum shift low/high bits are known zero.
+
+ if (Op.getResNo() == 0)
+ Known = Known.extractBits(LoBits, 0);
+ else
+ Known = Known.extractBits(HiBits, LoBits);
+ break;
+ }
case ISD::SIGN_EXTEND_INREG: {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
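The new *_PARTS handling computes known bits by concatenating the two input halves, shifting the wide value, and extracting the requested half. The same concat/shift/extract identity on concrete values (illustrative only; the shift amount is assumed to be below the part width so the cross-check formula applies):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t lo = 0xDEADBEEFu, hi = 0x12345678u;
  unsigned amt = 13;
  uint64_t concat = (uint64_t(hi) << 32) | lo; // insertBits(hi, 32) over lo
  uint64_t shifted = concat << amt;            // SHL_PARTS on the wide value
  uint32_t resLo = uint32_t(shifted);          // extractBits(32, 0)
  uint32_t resHi = uint32_t(shifted >> 32);    // extractBits(32, 32)
  // Cross-check against the usual parts formulas for amt < 32.
  assert(resLo == (lo << amt));
  assert(resHi == ((hi << amt) | (lo >> (32 - amt))));
  return 0;
}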
@@ -3570,6 +3593,12 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known = KnownBits::computeForAddCarry(Known, Known2, Carry);
break;
}
+ case ISD::UDIV: {
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ Known = KnownBits::udiv(Known, Known2);
+ break;
+ }
case ISD::SREM: {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
@@ -3925,7 +3954,9 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
case ISD::AssertZext:
Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
return VTBits-Tmp;
-
+ case ISD::MERGE_VALUES:
+ return ComputeNumSignBits(Op.getOperand(Op.getResNo()), DemandedElts,
+ Depth + 1);
case ISD::BUILD_VECTOR:
Tmp = VTBits;
for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) {
@@ -6105,8 +6136,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(N1.getValueType().isVector() == VT.isVector() &&
"FP_TO_*INT_SAT type should be vector iff the operand type is "
"vector!");
- assert((!VT.isVector() || VT.getVectorNumElements() ==
- N1.getValueType().getVectorNumElements()) &&
+ assert((!VT.isVector() || VT.getVectorElementCount() ==
+ N1.getValueType().getVectorElementCount()) &&
"Vector element counts must match in FP_TO_*INT_SAT");
assert(!cast<VTSDNode>(N2)->getVT().isVector() &&
"Type to saturate to must be a scalar.");
@@ -6719,7 +6750,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
bool isVol, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo,
- const AAMDNodes &AAInfo) {
+ const AAMDNodes &AAInfo, AAResults *AA) {
// Turn a memcpy of undef to nop.
// FIXME: We need to honor volatile even if Src is undef.
if (Src.isUndef())
@@ -6782,6 +6813,11 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
AAMDNodes NewAAInfo = AAInfo;
NewAAInfo.TBAA = NewAAInfo.TBAAStruct = nullptr;
+ const Value *SrcVal = SrcPtrInfo.V.dyn_cast<const Value *>();
+ bool isConstant =
+ AA && SrcVal &&
+ AA->pointsToConstantMemory(MemoryLocation(SrcVal, Size, AAInfo));
+
MachineMemOperand::Flags MMOFlags =
isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
SmallVector<SDValue, 16> OutLoadChains;
@@ -6843,6 +6879,8 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
MachineMemOperand::Flags SrcMMOFlags = MMOFlags;
if (isDereferenceable)
SrcMMOFlags |= MachineMemOperand::MODereferenceable;
+ if (isConstant)
+ SrcMMOFlags |= MachineMemOperand::MOInvariant;
Value = DAG.getExtLoad(
ISD::EXTLOAD, dl, NVT, Chain,
@@ -7131,7 +7169,7 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
bool isVol, bool AlwaysInline, bool isTailCall,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo,
- const AAMDNodes &AAInfo) {
+ const AAMDNodes &AAInfo, AAResults *AA) {
// Check to see if we should lower the memcpy to loads and stores first.
// For cases within the target-specified limits, this is the best choice.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
@@ -7142,7 +7180,7 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
SDValue Result = getMemcpyLoadsAndStores(
*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Alignment,
- isVol, false, DstPtrInfo, SrcPtrInfo, AAInfo);
+ isVol, false, DstPtrInfo, SrcPtrInfo, AAInfo, AA);
if (Result.getNode())
return Result;
}
@@ -7161,9 +7199,9 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
// use a (potentially long) sequence of loads and stores.
if (AlwaysInline) {
assert(ConstantSize && "AlwaysInline requires a constant size!");
- return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
- ConstantSize->getZExtValue(), Alignment,
- isVol, true, DstPtrInfo, SrcPtrInfo, AAInfo);
+ return getMemcpyLoadsAndStores(
+ *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Alignment,
+ isVol, true, DstPtrInfo, SrcPtrInfo, AAInfo, AA);
}
checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
@@ -7245,7 +7283,7 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
bool isVol, bool isTailCall,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo,
- const AAMDNodes &AAInfo) {
+ const AAMDNodes &AAInfo, AAResults *AA) {
// Check to see if we should lower the memmove to loads and stores first.
// For cases within the target-specified limits, this is the best choice.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
@@ -8904,7 +8942,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
#ifndef NDEBUG
- for (auto &Op : Ops)
+ for (const auto &Op : Ops)
assert(Op.getOpcode() != ISD::DELETED_NODE &&
"Operand is DELETED_NODE!");
#endif
@@ -8928,6 +8966,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
"True and False arms of SelectCC must have same type!");
assert(Ops[2].getValueType() == VT &&
"select_cc node must be of same type as true and false value!");
+ assert((!Ops[0].getValueType().isVector() ||
+ Ops[0].getValueType().getVectorElementCount() ==
+ VT.getVectorElementCount()) &&
+ "Expected select_cc with vector result to have the same sized "
+ "comparison type!");
break;
case ISD::BR_CC:
assert(NumOps == 5 && "BR_CC takes 5 operands!");
@@ -9018,12 +9061,34 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
return getNode(Opcode, DL, VTList.VTs[0], Ops, Flags);
#ifndef NDEBUG
- for (auto &Op : Ops)
+ for (const auto &Op : Ops)
assert(Op.getOpcode() != ISD::DELETED_NODE &&
"Operand is DELETED_NODE!");
#endif
switch (Opcode) {
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO: {
+ assert(VTList.NumVTs == 2 && Ops.size() == 2 &&
+ "Invalid add/sub overflow op!");
+ assert(VTList.VTs[0].isInteger() && VTList.VTs[1].isInteger() &&
+ Ops[0].getValueType() == Ops[1].getValueType() &&
+ Ops[0].getValueType() == VTList.VTs[0] &&
+ "Binary operator types must match!");
+ SDValue N1 = Ops[0], N2 = Ops[1];
+ canonicalizeCommutativeBinop(Opcode, N1, N2);
+
+ // (X +- 0) -> X with zero-overflow.
+ ConstantSDNode *N2CV = isConstOrConstSplat(N2, /*AllowUndefs*/ false,
+ /*AllowTruncation*/ true);
+ if (N2CV && N2CV->isZero()) {
+ SDValue ZeroOverFlow = getConstant(0, DL, VTList.VTs[1]);
+ return getNode(ISD::MERGE_VALUES, DL, VTList, {N1, ZeroOverFlow}, Flags);
+ }
+ break;
+ }
case ISD::STRICT_FP_EXTEND:
assert(VTList.NumVTs == 2 && Ops.size() == 2 &&
"Invalid STRICT_FP_EXTEND!");
@@ -9914,7 +9979,7 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) {
return;
SmallVector<SDDbgValue *, 2> ClonedDVs;
- for (auto DV : GetDbgValues(&N)) {
+ for (auto *DV : GetDbgValues(&N)) {
if (DV->isInvalidated())
continue;
switch (N.getOpcode()) {
@@ -10268,7 +10333,7 @@ bool SelectionDAG::calculateDivergence(SDNode *N) {
}
if (TLI->isSDNodeSourceOfDivergence(N, FLI, DA))
return true;
- for (auto &Op : N->ops()) {
+ for (const auto &Op : N->ops()) {
if (Op.Val.getValueType() != MVT::Other && Op.getNode()->isDivergent())
return true;
}
@@ -10298,7 +10363,7 @@ void SelectionDAG::CreateTopologicalOrder(std::vector<SDNode *> &Order) {
}
for (size_t I = 0; I != Order.size(); ++I) {
SDNode *N = Order[I];
- for (auto U : N->uses()) {
+ for (auto *U : N->uses()) {
unsigned &UnsortedOps = Degree[U];
if (0 == --UnsortedOps)
Order.push_back(U);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index fe3c38ec590d..35650b9bd00e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1789,7 +1789,7 @@ static void findWasmUnwindDestinations(
UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
UnwindDests.back().first->setIsEHScopeEntry();
break;
- } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
+ } else if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
// Add the catchpad handlers to the possible destinations. We don't
// continue to the unwind destination of the catchswitch for wasm.
for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
@@ -1844,7 +1844,7 @@ static void findUnwindDestinations(
UnwindDests.back().first->setIsEHScopeEntry();
UnwindDests.back().first->setIsEHFuncletEntry();
break;
- } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
+ } else if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
// Add the catchpad handlers to the possible destinations.
for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
UnwindDests.emplace_back(FuncInfo.MBBMap[CatchPadBB], Prob);
@@ -2990,14 +2990,20 @@ void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) {
CopyToExportRegsIfNeeded(&I);
// Retrieve successors.
+ SmallPtrSet<BasicBlock *, 8> Dests;
+ Dests.insert(I.getDefaultDest());
MachineBasicBlock *Return = FuncInfo.MBBMap[I.getDefaultDest()];
// Update successor info.
addSuccessorWithProb(CallBrMBB, Return, BranchProbability::getOne());
for (unsigned i = 0, e = I.getNumIndirectDests(); i < e; ++i) {
- MachineBasicBlock *Target = FuncInfo.MBBMap[I.getIndirectDest(i)];
- addSuccessorWithProb(CallBrMBB, Target, BranchProbability::getZero());
+ BasicBlock *Dest = I.getIndirectDest(i);
+ MachineBasicBlock *Target = FuncInfo.MBBMap[Dest];
Target->setIsInlineAsmBrIndirectTarget();
+ Target->setHasAddressTaken();
+ // Don't add duplicate machine successors.
+ if (Dests.insert(Dest).second)
+ addSuccessorWithProb(CallBrMBB, Target, BranchProbability::getZero());
}
CallBrMBB->normalizeSuccProbs();
@@ -4075,6 +4081,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
return;
bool isVolatile = I.isVolatile();
+ MachineMemOperand::Flags MMOFlags =
+ TLI.getLoadMemOperandFlags(I, DAG.getDataLayout());
SDValue Root;
bool ConstantMemory = false;
@@ -4091,6 +4099,12 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
// Do not serialize (non-volatile) loads of constant memory with anything.
Root = DAG.getEntryNode();
ConstantMemory = true;
+ MMOFlags |= MachineMemOperand::MOInvariant;
+
+ // FIXME: pointsToConstantMemory probably does not imply dereferenceable,
+ // but the previous usage implied it did. Probably should check
+ // isDereferenceableAndAlignedPointer.
+ MMOFlags |= MachineMemOperand::MODereferenceable;
} else {
// Do not serialize non-volatile loads against each other.
Root = DAG.getRoot();
@@ -4110,9 +4124,6 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
EVT PtrVT = Ptr.getValueType();
- MachineMemOperand::Flags MMOFlags
- = TLI.getLoadMemOperandFlags(I, DAG.getDataLayout());
-
unsigned ChainI = 0;
for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
// Serializing loads here may result in excessive register pressure, and
@@ -5766,7 +5777,7 @@ static const CallBase *FindPreallocatedCall(const Value *PreallocatedSetup) {
->getCalledFunction()
->getIntrinsicID() == Intrinsic::call_preallocated_setup &&
"expected call_preallocated_setup Value");
- for (auto *U : PreallocatedSetup->users()) {
+ for (const auto *U : PreallocatedSetup->users()) {
auto *UseCall = cast<CallBase>(U);
const Function *Fn = UseCall->getCalledFunction();
if (!Fn || Fn->getIntrinsicID() != Intrinsic::call_preallocated_arg) {
@@ -5859,11 +5870,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
// FIXME: Support passing different dest/src alignments to the memcpy DAG
// node.
SDValue Root = isVol ? getRoot() : getMemoryRoot();
- SDValue MC = DAG.getMemcpy(Root, sdl, Op1, Op2, Op3, Alignment, isVol,
- /* AlwaysInline */ false, isTC,
- MachinePointerInfo(I.getArgOperand(0)),
- MachinePointerInfo(I.getArgOperand(1)),
- I.getAAMetadata());
+ SDValue MC = DAG.getMemcpy(
+ Root, sdl, Op1, Op2, Op3, Alignment, isVol,
+ /* AlwaysInline */ false, isTC, MachinePointerInfo(I.getArgOperand(0)),
+ MachinePointerInfo(I.getArgOperand(1)), I.getAAMetadata(), AA);
updateDAGForMaybeTailCall(MC);
return;
}
@@ -5881,11 +5891,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
// FIXME: Support passing different dest/src alignments to the memcpy DAG
// node.
- SDValue MC = DAG.getMemcpy(getRoot(), sdl, Dst, Src, Size, Alignment, isVol,
- /* AlwaysInline */ true, isTC,
- MachinePointerInfo(I.getArgOperand(0)),
- MachinePointerInfo(I.getArgOperand(1)),
- I.getAAMetadata());
+ SDValue MC = DAG.getMemcpy(
+ getRoot(), sdl, Dst, Src, Size, Alignment, isVol,
+ /* AlwaysInline */ true, isTC, MachinePointerInfo(I.getArgOperand(0)),
+ MachinePointerInfo(I.getArgOperand(1)), I.getAAMetadata(), AA);
updateDAGForMaybeTailCall(MC);
return;
}
@@ -5940,7 +5949,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
SDValue MM = DAG.getMemmove(Root, sdl, Op1, Op2, Op3, Alignment, isVol,
isTC, MachinePointerInfo(I.getArgOperand(0)),
MachinePointerInfo(I.getArgOperand(1)),
- I.getAAMetadata());
+ I.getAAMetadata(), AA);
updateDAGForMaybeTailCall(MM);
return;
}
@@ -8855,7 +8864,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
}
break;
- case InlineAsm::isInput: {
+ case InlineAsm::isInput:
+ case InlineAsm::isLabel: {
SDValue InOperandVal = OpInfo.CallOperand;
if (OpInfo.isMatchingInputConstraint()) {
@@ -9295,19 +9305,18 @@ void SelectionDAGBuilder::populateCallLoweringInfo(
static void addStackMapLiveVars(const CallBase &Call, unsigned StartIdx,
const SDLoc &DL, SmallVectorImpl<SDValue> &Ops,
SelectionDAGBuilder &Builder) {
- for (unsigned i = StartIdx, e = Call.arg_size(); i != e; ++i) {
- SDValue OpVal = Builder.getValue(Call.getArgOperand(i));
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpVal)) {
- Ops.push_back(
- Builder.DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
- Ops.push_back(
- Builder.DAG.getTargetConstant(C->getSExtValue(), DL, MVT::i64));
- } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(OpVal)) {
- const TargetLowering &TLI = Builder.DAG.getTargetLoweringInfo();
- Ops.push_back(Builder.DAG.getTargetFrameIndex(
- FI->getIndex(), TLI.getFrameIndexTy(Builder.DAG.getDataLayout())));
- } else
- Ops.push_back(OpVal);
+ SelectionDAG &DAG = Builder.DAG;
+ for (unsigned I = StartIdx; I < Call.arg_size(); I++) {
+ SDValue Op = Builder.getValue(Call.getArgOperand(I));
+
+ // Things on the stack are pointer-typed, meaning that they are already
+ // legal and can be emitted directly to target nodes.
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op)) {
+ Ops.push_back(DAG.getTargetFrameIndex(FI->getIndex(), Op.getValueType()));
+ } else {
+ // Otherwise emit a target independent node to be legalised.
+ Ops.push_back(Builder.getValue(Call.getArgOperand(I)));
+ }
}
}
@@ -9359,20 +9368,7 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
Ops.push_back(ShadConst);
// Add the live variables.
- for (unsigned I = 2; I < CI.arg_size(); I++) {
- SDValue Op = getValue(CI.getArgOperand(I));
-
- // Things on the stack are pointer-typed, meaning that they are already
- // legal and can be emitted directly to target nodes.
- if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op)) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- Ops.push_back(DAG.getTargetFrameIndex(
- FI->getIndex(), TLI.getFrameIndexTy(DAG.getDataLayout())));
- } else {
- // Otherwise emit a target independent node to be legalised.
- Ops.push_back(getValue(CI.getArgOperand(I)));
- }
- }
+ addStackMapLiveVars(CI, 2, DL, Ops, *this);
// Create the STACKMAP node.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
@@ -9449,6 +9445,19 @@ void SelectionDAGBuilder::visitPatchpoint(const CallBase &CB,
// Replace the target specific call node with the patchable intrinsic.
SmallVector<SDValue, 8> Ops;
+ // Push the chain.
+ Ops.push_back(*(Call->op_begin()));
+
+ // Optionally, push the glue (if any).
+ if (HasGlue)
+ Ops.push_back(*(Call->op_end() - 1));
+
+ // Push the register mask info.
+ if (HasGlue)
+ Ops.push_back(*(Call->op_end() - 2));
+ else
+ Ops.push_back(*(Call->op_end() - 1));
+
// Add the <id> and <numBytes> constants.
SDValue IDVal = getValue(CB.getArgOperand(PatchPointOpers::IDPos));
Ops.push_back(DAG.getTargetConstant(
@@ -9477,27 +9486,13 @@ void SelectionDAGBuilder::visitPatchpoint(const CallBase &CB,
for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i)
Ops.push_back(getValue(CB.getArgOperand(i)));
- // Push the arguments from the call instruction up to the register mask.
+ // Push the arguments from the call instruction.
SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1;
Ops.append(Call->op_begin() + 2, e);
// Push live variables for the stack map.
addStackMapLiveVars(CB, NumMetaOpers + NumArgs, dl, Ops, *this);
- // Push the register mask info.
- if (HasGlue)
- Ops.push_back(*(Call->op_end()-2));
- else
- Ops.push_back(*(Call->op_end()-1));
-
- // Push the chain (this is originally the first operand of the call, but
- // becomes now the last or second to last operand).
- Ops.push_back(*(Call->op_begin()));
-
- // Push the glue flag (last operand).
- if (HasGlue)
- Ops.push_back(*(Call->op_end()-1));
-
SDVTList NodeTys;
if (IsAnyRegCC && HasDef) {
// Create the return types based on the intrinsic definition
@@ -9514,13 +9509,12 @@ void SelectionDAGBuilder::visitPatchpoint(const CallBase &CB,
NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// Replace the target specific call node with a PATCHPOINT node.
- MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHPOINT,
- dl, NodeTys, Ops);
+ SDValue PPV = DAG.getNode(ISD::PATCHPOINT, dl, NodeTys, Ops);
// Update the NodeMap.
if (HasDef) {
if (IsAnyRegCC)
- setValue(&CB, SDValue(MN, 0));
+ setValue(&CB, SDValue(PPV.getNode(), 0));
else
setValue(&CB, Result.first);
}
@@ -9531,10 +9525,10 @@ void SelectionDAGBuilder::visitPatchpoint(const CallBase &CB,
// value.
if (IsAnyRegCC && HasDef) {
SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)};
- SDValue To[] = {SDValue(MN, 1), SDValue(MN, 2)};
+ SDValue To[] = {PPV.getValue(1), PPV.getValue(2)};
DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
} else
- DAG.ReplaceAllUsesWith(Call, MN);
+ DAG.ReplaceAllUsesWith(Call, PPV.getNode());
DAG.DeleteNode(Call);
// Inform the Frame Information that we have a patchpoint in this function.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 9df0b64c26c3..6ba01664e756 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -488,6 +488,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::VECREDUCE_FMIN: return "vecreduce_fmin";
case ISD::STACKMAP:
return "stackmap";
+ case ISD::PATCHPOINT:
+ return "patchpoint";
// Vector Predication
#define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 7f453f081982..d46a0a23cca3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -2193,8 +2193,27 @@ void SelectionDAGISel::Select_ARITH_FENCE(SDNode *N) {
N->getOperand(0));
}
+void SelectionDAGISel::pushStackMapLiveVariable(SmallVectorImpl<SDValue> &Ops,
+ SDValue OpVal, SDLoc DL) {
+ SDNode *OpNode = OpVal.getNode();
+
+ // FrameIndex nodes should have been directly emitted to TargetFrameIndex
+ // nodes at DAG-construction time.
+ assert(OpNode->getOpcode() != ISD::FrameIndex);
+
+ if (OpNode->getOpcode() == ISD::Constant) {
+ Ops.push_back(
+ CurDAG->getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
+ Ops.push_back(
+ CurDAG->getTargetConstant(cast<ConstantSDNode>(OpNode)->getZExtValue(),
+ DL, OpVal.getValueType()));
+ } else {
+ Ops.push_back(OpVal);
+ }
+}
+
void SelectionDAGISel::Select_STACKMAP(SDNode *N) {
- std::vector<SDValue> Ops;
+ SmallVector<SDValue, 32> Ops;
auto *It = N->op_begin();
SDLoc DL(N);
@@ -2213,24 +2232,8 @@ void SelectionDAGISel::Select_STACKMAP(SDNode *N) {
Ops.push_back(Shad);
// Live variable operands.
- for (; It != N->op_end(); It++) {
- SDNode *OpNode = It->getNode();
- SDValue O;
-
- // FrameIndex nodes should have been directly emitted to TargetFrameIndex
- // nodes at DAG-construction time.
- assert(OpNode->getOpcode() != ISD::FrameIndex);
-
- if (OpNode->getOpcode() == ISD::Constant) {
- Ops.push_back(
- CurDAG->getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
- O = CurDAG->getTargetConstant(
- cast<ConstantSDNode>(OpNode)->getZExtValue(), DL, It->getValueType());
- } else {
- O = *It;
- }
- Ops.push_back(O);
- }
+ for (; It != N->op_end(); It++)
+ pushStackMapLiveVariable(Ops, *It, DL);
Ops.push_back(Chain);
Ops.push_back(InFlag);
@@ -2239,6 +2242,57 @@ void SelectionDAGISel::Select_STACKMAP(SDNode *N) {
CurDAG->SelectNodeTo(N, TargetOpcode::STACKMAP, NodeTys, Ops);
}
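+// Select an ISD::PATCHPOINT node. Its operands arrive as (Chain, [Glue],
+// RegMask, ID, NumShadowBytes, Callee, NumArgs, CC, Args..., LiveVars...);
+// the target node expects RegMask, Chain and Glue at the end, so those three
+// are cached up front and re-appended after the live variables.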
+void SelectionDAGISel::Select_PATCHPOINT(SDNode *N) {
+ SmallVector<SDValue, 32> Ops;
+ auto *It = N->op_begin();
+ SDLoc DL(N);
+
+ // Cache arguments that will be moved to the end in the target node.
+ SDValue Chain = *It++;
+ Optional<SDValue> Glue;
+ if (It->getValueType() == MVT::Glue)
+ Glue = *It++;
+ SDValue RegMask = *It++;
+
+ // <id> operand.
+ SDValue ID = *It++;
+ assert(ID.getValueType() == MVT::i64);
+ Ops.push_back(ID);
+
+ // <numShadowBytes> operand.
+ SDValue Shad = *It++;
+ assert(Shad.getValueType() == MVT::i32);
+ Ops.push_back(Shad);
+
+ // Add the callee.
+ Ops.push_back(*It++);
+
+ // Add <numArgs>.
+ SDValue NumArgs = *It++;
+ assert(NumArgs.getValueType() == MVT::i32);
+ Ops.push_back(NumArgs);
+
+ // Calling convention.
+ Ops.push_back(*It++);
+
+ // Push the args for the call.
+ for (uint64_t I = cast<ConstantSDNode>(NumArgs)->getZExtValue(); I != 0; I--)
+ Ops.push_back(*It++);
+
+ // Now push the live variables.
+ for (; It != N->op_end(); It++)
+ pushStackMapLiveVariable(Ops, *It, DL);
+
+ // Finally, the regmask, chain and (if present) glue are moved to the end.
+ Ops.push_back(RegMask);
+ Ops.push_back(Chain);
+ if (Glue.has_value())
+ Ops.push_back(Glue.value());
+
+ SDVTList NodeTys = N->getVTList();
+ CurDAG->SelectNodeTo(N, TargetOpcode::PATCHPOINT, NodeTys, Ops);
+}
+
/// GetVBR - decode a vbr encoding whose top bit is set.
LLVM_ATTRIBUTE_ALWAYS_INLINE static uint64_t
GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) {
@@ -2796,6 +2850,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
case ISD::STACKMAP:
Select_STACKMAP(NodeToMatch);
return;
+ case ISD::PATCHPOINT:
+ Select_PATCHPOINT(NodeToMatch);
+ return;
}
assert(!NodeToMatch->isMachineOpcode() && "Node already selected!");
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 3061158eea30..c5c093ae228f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -169,8 +169,14 @@ static Optional<int> findPreviousSpillSlot(const Value *Val,
// Spill location is known for gc relocates
if (const auto *Relocate = dyn_cast<GCRelocateInst>(Val)) {
- const auto &RelocationMap =
- Builder.FuncInfo.StatepointRelocationMaps[Relocate->getStatepoint()];
+ const Value *Statepoint = Relocate->getStatepoint();
+ assert((isa<GCStatepointInst>(Statepoint) || isa<UndefValue>(Statepoint)) &&
+ "GetStatepoint must return one of two types");
+ if (isa<UndefValue>(Statepoint))
+ return None;
+
+ const auto &RelocationMap = Builder.FuncInfo.StatepointRelocationMaps
+ [cast<GCStatepointInst>(Statepoint)];
auto It = RelocationMap.find(Relocate);
if (It == RelocationMap.end())
@@ -193,7 +199,7 @@ static Optional<int> findPreviousSpillSlot(const Value *Val,
if (const PHINode *Phi = dyn_cast<PHINode>(Val)) {
Optional<int> MergedResult = None;
- for (auto &IncomingValue : Phi->incoming_values()) {
+ for (const auto &IncomingValue : Phi->incoming_values()) {
Optional<int> SpillSlot =
findPreviousSpillSlot(IncomingValue, Builder, LookUpDepth - 1);
if (!SpillSlot)
@@ -569,9 +575,10 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
// We cannot assign them to VRegs.
SmallSet<SDValue, 8> LPadPointers;
if (!UseRegistersForGCPointersInLandingPad)
- if (auto *StInvoke = dyn_cast_or_null<InvokeInst>(SI.StatepointInstr)) {
+ if (const auto *StInvoke =
+ dyn_cast_or_null<InvokeInst>(SI.StatepointInstr)) {
LandingPadInst *LPI = StInvoke->getLandingPadInst();
- for (auto *Relocate : SI.GCRelocates)
+ for (const auto *Relocate : SI.GCRelocates)
if (Relocate->getOperand(0) == LPI) {
LPadPointers.insert(Builder.getValue(Relocate->getBasePtr()));
LPadPointers.insert(Builder.getValue(Relocate->getDerivedPtr()));
@@ -739,7 +746,7 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
LLVM_DEBUG(dbgs() << "Lowering statepoint " << *SI.StatepointInstr << "\n");
#ifndef NDEBUG
- for (auto *Reloc : SI.GCRelocates)
+ for (const auto *Reloc : SI.GCRelocates)
if (Reloc->getParent() == SI.StatepointInstr->getParent())
StatepointLowering.scheduleRelocCall(*Reloc);
#endif
@@ -1017,7 +1024,7 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
static std::pair<const GCResultInst*, const GCResultInst*>
getGCResultLocality(const GCStatepointInst &S) {
std::pair<const GCResultInst *, const GCResultInst*> Res(nullptr, nullptr);
- for (auto *U : S.users()) {
+ for (const auto *U : S.users()) {
auto *GRI = dyn_cast<GCResultInst>(U);
if (!GRI)
continue;
@@ -1195,9 +1202,13 @@ void SelectionDAGBuilder::LowerCallSiteWithDeoptBundle(
void SelectionDAGBuilder::visitGCResult(const GCResultInst &CI) {
// The result value of the gc_result is simply the result of the actual
// call. We've already emitted this, so just grab the value.
- const GCStatepointInst *SI = CI.getStatepoint();
+ const Value *SI = CI.getStatepoint();
+ assert((isa<GCStatepointInst>(SI) || isa<UndefValue>(SI)) &&
+ "GetStatepoint must return one of two types");
+ if (isa<UndefValue>(SI))
+ return;
- if (SI->getParent() == CI.getParent()) {
+ if (cast<GCStatepointInst>(SI)->getParent() == CI.getParent()) {
setValue(&CI, getValue(SI));
return;
}
@@ -1215,12 +1226,18 @@ void SelectionDAGBuilder::visitGCResult(const GCResultInst &CI) {
}
void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
+ const Value *Statepoint = Relocate.getStatepoint();
#ifndef NDEBUG
// Consistency check
// We skip this check for relocates not in the same basic block as their
// statepoint. It would be too expensive to preserve validation info through
// different basic blocks.
- if (Relocate.getStatepoint()->getParent() == Relocate.getParent())
+ assert((isa<GCStatepointInst>(Statepoint) || isa<UndefValue>(Statepoint)) &&
+ "GetStatepoint must return one of two types");
+ if (isa<UndefValue>(Statepoint))
+ return;
+
+ if (cast<GCStatepointInst>(Statepoint)->getParent() == Relocate.getParent())
StatepointLowering.relocCallVisited(Relocate);
auto *Ty = Relocate.getType()->getScalarType();
@@ -1230,14 +1247,15 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
const Value *DerivedPtr = Relocate.getDerivedPtr();
auto &RelocationMap =
- FuncInfo.StatepointRelocationMaps[Relocate.getStatepoint()];
+ FuncInfo.StatepointRelocationMaps[cast<GCStatepointInst>(Statepoint)];
auto SlotIt = RelocationMap.find(&Relocate);
assert(SlotIt != RelocationMap.end() && "Relocating not lowered gc value");
const RecordType &Record = SlotIt->second;
// If relocation was done via virtual register..
if (Record.type == RecordType::SDValueNode) {
- assert(Relocate.getStatepoint()->getParent() == Relocate.getParent() &&
+ assert(cast<GCStatepointInst>(Statepoint)->getParent() ==
+ Relocate.getParent() &&
"Nonlocal gc.relocate mapped via SDValue");
SDValue SDV = StatepointLowering.getLocation(getValue(DerivedPtr));
assert(SDV.getNode() && "empty SDValue");
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 66389a57f780..cd4f0ae42bcd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1056,13 +1056,13 @@ bool TargetLowering::SimplifyDemandedBits(
// TODO: We can probably do more work on calculating the known bits and
// simplifying the operations for scalable vectors, but for now we just
// bail out.
- if (Op.getValueType().isScalableVector())
+ EVT VT = Op.getValueType();
+ if (VT.isScalableVector())
return false;
bool IsLE = TLO.DAG.getDataLayout().isLittleEndian();
unsigned NumElts = OriginalDemandedElts.getBitWidth();
- assert((!Op.getValueType().isVector() ||
- NumElts == Op.getValueType().getVectorNumElements()) &&
+ assert((!VT.isVector() || NumElts == VT.getVectorNumElements()) &&
"Unexpected vector size");
APInt DemandedBits = OriginalDemandedBits;
@@ -1088,7 +1088,6 @@ bool TargetLowering::SimplifyDemandedBits(
}
// Other users may use these bits.
- EVT VT = Op.getValueType();
if (!Op.getNode()->hasOneUse() && !AssumeSingleUse) {
if (Depth != 0) {
// If not at the root, Just compute the Known bits to
@@ -1468,6 +1467,33 @@ bool TargetLowering::SimplifyDemandedBits(
}
}
+ // (or (and X, C1), (and (or X, Y), C2)) -> (or (and X, C1|C2), (and Y, C2))
+ // TODO: Use SimplifyMultipleUseDemandedBits to peek through masks.
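+ // E.g. with C1 = 0x0f and C2 = 0xf0:
+ //   (X & 0x0f) | ((X | Y) & 0xf0) == (X & 0xff) | (Y & 0xf0)
+ // because (X | Y) & C2 == (X & C2) | (Y & C2), and C1 | C2 collects both
+ // masks applied to X.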
+ if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::AND &&
+ Op0->hasOneUse() && Op1->hasOneUse()) {
+ // Attempt to match all commutations - m_c_Or would've been useful!
+ for (int I = 0; I != 2; ++I) {
+ SDValue X = Op.getOperand(I).getOperand(0);
+ SDValue C1 = Op.getOperand(I).getOperand(1);
+ SDValue Alt = Op.getOperand(1 - I).getOperand(0);
+ SDValue C2 = Op.getOperand(1 - I).getOperand(1);
+ if (Alt.getOpcode() == ISD::OR) {
+ for (int J = 0; J != 2; ++J) {
+ if (X == Alt.getOperand(J)) {
+ SDValue Y = Alt.getOperand(1 - J);
+ if (SDValue C12 = TLO.DAG.FoldConstantArithmetic(ISD::OR, dl, VT,
+ {C1, C2})) {
+ SDValue MaskX = TLO.DAG.getNode(ISD::AND, dl, VT, X, C12);
+ SDValue MaskY = TLO.DAG.getNode(ISD::AND, dl, VT, Y, C2);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(ISD::OR, dl, VT, MaskX, MaskY));
+ }
+ }
+ }
+ }
+ }
+ }
+
Known |= Known2;
break;
}
@@ -1500,7 +1526,7 @@ bool TargetLowering::SimplifyDemandedBits(
if (DemandedBits.isSubsetOf(Known.Zero | Known2.Zero))
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, VT, Op0, Op1));
- ConstantSDNode* C = isConstOrConstSplat(Op1, DemandedElts);
+ ConstantSDNode *C = isConstOrConstSplat(Op1, DemandedElts);
if (C) {
// If one side is a constant, and all of the set bits in the constant are
// also known set on the other side, turn this into an AND, as we know
@@ -1521,6 +1547,32 @@ bool TargetLowering::SimplifyDemandedBits(
SDValue New = TLO.DAG.getNOT(dl, Op0, VT);
return TLO.CombineTo(Op, New);
}
+
+ unsigned Op0Opcode = Op0.getOpcode();
+ if ((Op0Opcode == ISD::SRL || Op0Opcode == ISD::SHL) && Op0.hasOneUse()) {
+ if (ConstantSDNode *ShiftC =
+ isConstOrConstSplat(Op0.getOperand(1), DemandedElts)) {
+ // Don't crash on an oversized shift. We cannot guarantee that a
+ // bogus shift has been simplified to undef.
+ if (ShiftC->getAPIntValue().ult(BitWidth)) {
+ uint64_t ShiftAmt = ShiftC->getZExtValue();
+ APInt Ones = APInt::getAllOnes(BitWidth);
+ Ones = Op0Opcode == ISD::SHL ? Ones.shl(ShiftAmt)
+ : Ones.lshr(ShiftAmt);
+ const TargetLowering &TLI = TLO.DAG.getTargetLoweringInfo();
+ if ((DemandedBits & C->getAPIntValue()) == (DemandedBits & Ones) &&
+ TLI.isDesirableToCommuteXorWithShift(Op.getNode())) {
+ // If the xor constant is a demanded mask, do a 'not' before the
+ // shift:
+ // xor (X << ShiftC), XorC --> (not X) << ShiftC
+ // xor (X >> ShiftC), XorC --> (not X) >> ShiftC
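+ // The guard above ensures XorC matches the post-shift all-ones mask on
+ // every demanded bit, so on those bits the xor acts as a plain 'not' and
+ // commutes with the shift.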
+ SDValue Not = TLO.DAG.getNOT(dl, Op0.getOperand(0), VT);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Op0Opcode, dl, VT, Not,
+ Op0.getOperand(1)));
+ }
+ }
+ }
+ }
}
// If we can't turn this into a 'not', try to shrink the constant.
@@ -1723,6 +1775,26 @@ bool TargetLowering::SimplifyDemandedBits(
if ((ShAmt < DemandedBits.getActiveBits()) &&
ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
return true;
+ } else {
+ // This is a variable shift, so we can't shift the demand mask by a known
+ // amount. But if we are not demanding high bits, then we are not
+ // demanding those bits from the pre-shifted operand either.
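+ // E.g. for this left shift, if only the low 16 bits of the result are
+ // demanded, no source bit above bit 15 can reach a demanded bit for any
+ // in-range shift amount.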
+ if (unsigned CTLZ = DemandedBits.countLeadingZeros()) {
+ APInt DemandedFromOp(APInt::getLowBitsSet(BitWidth, BitWidth - CTLZ));
+ if (SimplifyDemandedBits(Op0, DemandedFromOp, DemandedElts, Known, TLO,
+ Depth + 1)) {
+ SDNodeFlags Flags = Op.getNode()->getFlags();
+ if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) {
+ // Disable the nsw and nuw flags. We can no longer guarantee that we
+ // won't wrap after simplification.
+ Flags.setNoSignedWrap(false);
+ Flags.setNoUnsignedWrap(false);
+ Op->setFlags(Flags);
+ }
+ return true;
+ }
+ Known.resetAll();
+ }
}
// If we are only demanding sign bits then we can use the shift source
@@ -3292,6 +3364,12 @@ bool TargetLowering::SimplifyDemandedVectorElts(
TLO, Depth + 1))
return true;
+ // If every element pair has a zero/undef then just fold to zero.
+ // fold (and x, undef) -> 0 / (and x, 0) -> 0
+ // fold (mul x, undef) -> 0 / (mul x, 0) -> 0
+ if (DemandedElts.isSubsetOf(SrcZero | KnownZero | SrcUndef | KnownUndef))
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
// If either side has a zero element, then the result element is zero, even
// if the other is an UNDEF.
// TODO: Extend getKnownUndefForVectorBinop to also deal with known zeros
@@ -3301,7 +3379,6 @@ bool TargetLowering::SimplifyDemandedVectorElts(
KnownUndef &= ~KnownZero;
// Attempt to avoid multi-use ops if we don't need anything from them.
- // TODO - use KnownUndef to relax the demandedelts?
if (!DemandedElts.isAllOnes())
if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
return true;
@@ -5204,6 +5281,7 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
// ConstraintOperands list.
unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
unsigned ResNo = 0; // ResNo - The result number of the next output.
+ unsigned LabelNo = 0; // LabelNo - CallBr indirect dest number.
for (InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
ConstraintOperands.emplace_back(std::move(CI));
@@ -5240,6 +5318,14 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
case InlineAsm::isInput:
OpInfo.CallOperandVal = Call.getArgOperand(ArgNo);
break;
+ case InlineAsm::isLabel:
+ OpInfo.CallOperandVal =
+ cast<CallBrInst>(&Call)->getBlockAddressForIndirectDest(LabelNo);
+ OpInfo.ConstraintVT =
+ getAsmOperandValueType(DL, OpInfo.CallOperandVal->getType())
+ .getSimpleVT();
+ ++LabelNo;
+ continue;
case InlineAsm::isClobber:
// Nothing to do.
break;
@@ -5852,22 +5938,22 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
// FIXME: We should use a narrower constant when the upper
// bits are known to be zero.
const APInt& Divisor = C->getAPIntValue();
- UnsignedDivisonByConstantInfo magics = UnsignedDivisonByConstantInfo::get(Divisor);
+ UnsignedDivisionByConstantInfo magics =
+ UnsignedDivisionByConstantInfo::get(Divisor);
unsigned PreShift = 0, PostShift = 0;
// If the divisor is even, we can avoid using the expensive fixup by
// shifting the divided value upfront.
- if (magics.IsAdd != 0 && !Divisor[0]) {
+ if (magics.IsAdd && !Divisor[0]) {
PreShift = Divisor.countTrailingZeros();
// Get magic number for the shifted divisor.
- magics = UnsignedDivisonByConstantInfo::get(Divisor.lshr(PreShift), PreShift);
- assert(magics.IsAdd == 0 && "Should use cheap fixup now");
+ magics =
+ UnsignedDivisionByConstantInfo::get(Divisor.lshr(PreShift), PreShift);
+ assert(!magics.IsAdd && "Should use cheap fixup now");
}
- APInt Magic = magics.Magic;
-
unsigned SelNPQ;
- if (magics.IsAdd == 0 || Divisor.isOne()) {
+ if (!magics.IsAdd || Divisor.isOne()) {
assert(magics.ShiftAmount < Divisor.getBitWidth() &&
"We shouldn't generate an undefined shift!");
PostShift = magics.ShiftAmount;
@@ -5878,7 +5964,7 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
}
PreShifts.push_back(DAG.getConstant(PreShift, dl, ShSVT));
- MagicFactors.push_back(DAG.getConstant(Magic, dl, SVT));
+ MagicFactors.push_back(DAG.getConstant(magics.Magic, dl, SVT));
NPQFactors.push_back(
DAG.getConstant(SelNPQ ? APInt::getOneBitSet(EltBits, EltBits - 1)
: APInt::getZero(EltBits),
diff --git a/llvm/lib/CodeGen/SlotIndexes.cpp b/llvm/lib/CodeGen/SlotIndexes.cpp
index ffac68a223bf..ee3a0164564e 100644
--- a/llvm/lib/CodeGen/SlotIndexes.cpp
+++ b/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -179,21 +179,12 @@ void SlotIndexes::renumberIndexes(IndexList::iterator curItr) {
void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End) {
- // FIXME: Is this really necessary? The only caller repairIntervalsForRange()
- // does the same thing.
- // Find anchor points, which are at the beginning/end of blocks or at
- // instructions that already have indexes.
- while (Begin != MBB->begin() && !hasIndex(*Begin))
- --Begin;
- while (End != MBB->end() && !hasIndex(*End))
- ++End;
-
bool includeStart = (Begin == MBB->begin());
SlotIndex startIdx;
if (includeStart)
startIdx = getMBBStartIdx(MBB);
else
- startIdx = getInstructionIndex(*Begin);
+ startIdx = getInstructionIndex(*--Begin);
SlotIndex endIdx;
if (End == MBB->end())
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index 140a91ae342b..94149f56e703 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -347,13 +347,11 @@ void SplitAnalysis::analyze(const LiveInterval *li) {
//===----------------------------------------------------------------------===//
/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
-SplitEditor::SplitEditor(SplitAnalysis &SA, AliasAnalysis &AA,
- LiveIntervals &LIS, VirtRegMap &VRM,
+SplitEditor::SplitEditor(SplitAnalysis &SA, LiveIntervals &LIS, VirtRegMap &VRM,
MachineDominatorTree &MDT,
MachineBlockFrequencyInfo &MBFI, VirtRegAuxInfo &VRAI)
- : SA(SA), AA(AA), LIS(LIS), VRM(VRM),
- MRI(VRM.getMachineFunction().getRegInfo()), MDT(MDT),
- TII(*VRM.getMachineFunction().getSubtarget().getInstrInfo()),
+ : SA(SA), LIS(LIS), VRM(VRM), MRI(VRM.getMachineFunction().getRegInfo()),
+ MDT(MDT), TII(*VRM.getMachineFunction().getSubtarget().getInstrInfo()),
TRI(*VRM.getMachineFunction().getSubtarget().getRegisterInfo()),
MBFI(MBFI), VRAI(VRAI), RegAssign(Allocator) {}
@@ -371,9 +369,7 @@ void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
LICalc[1].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
&LIS.getVNInfoAllocator());
- // We don't need an AliasAnalysis since we will only be performing
- // cheap-as-a-copy remats anyway.
- Edit->anyRematerializable(nullptr);
+ Edit->anyRematerializable();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1454,7 +1450,7 @@ void SplitEditor::deleteRematVictims() {
if (Dead.empty())
return;
- Edit->eliminateDeadDefs(Dead, None, &AA);
+ Edit->eliminateDeadDefs(Dead, None);
}
void SplitEditor::forceRecomputeVNI(const VNInfo &ParentVNI) {
diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
index 4400a797d38e..556b022b93fb 100644
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -257,7 +257,6 @@ public:
///
class LLVM_LIBRARY_VISIBILITY SplitEditor {
SplitAnalysis &SA;
- AAResults &AA;
LiveIntervals &LIS;
VirtRegMap &VRM;
MachineRegisterInfo &MRI;
@@ -436,9 +435,9 @@ private:
public:
/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
/// Newly created intervals will be appended to newIntervals.
- SplitEditor(SplitAnalysis &SA, AAResults &AA, LiveIntervals &LIS,
- VirtRegMap &VRM, MachineDominatorTree &MDT,
- MachineBlockFrequencyInfo &MBFI, VirtRegAuxInfo &VRAI);
+ SplitEditor(SplitAnalysis &SA, LiveIntervals &LIS, VirtRegMap &VRM,
+ MachineDominatorTree &MDT, MachineBlockFrequencyInfo &MBFI,
+ VirtRegAuxInfo &VRAI);
/// reset - Prepare for a new split.
void reset(LiveRangeEdit&, ComplementSpillMode = SM_Partition);
diff --git a/llvm/lib/CodeGen/StackMaps.cpp b/llvm/lib/CodeGen/StackMaps.cpp
index 6757d6ca4f88..ccaff862fa3f 100644
--- a/llvm/lib/CodeGen/StackMaps.cpp
+++ b/llvm/lib/CodeGen/StackMaps.cpp
@@ -365,7 +365,7 @@ StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const {
});
for (auto I = LiveOuts.begin(), E = LiveOuts.end(); I != E; ++I) {
- for (auto II = std::next(I); II != E; ++II) {
+ for (auto *II = std::next(I); II != E; ++II) {
if (I->DwarfRegNum != II->DwarfRegNum) {
// Skip all the now invalid entries.
I = --II;
diff --git a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
index 4408011c95c0..2282d53e8ffd 100644
--- a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
+++ b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
@@ -267,7 +267,7 @@ void SwiftErrorValueTracking::preassignVRegs(
if (auto *CB = dyn_cast<CallBase>(&*It)) {
// A call-site with a swifterror argument is both use and def.
const Value *SwiftErrorAddr = nullptr;
- for (auto &Arg : CB->args()) {
+ for (const auto &Arg : CB->args()) {
if (!Arg->isSwiftError())
continue;
// Use of swifterror.
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index ba533a491b9c..18507b8fa84f 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -653,7 +653,7 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
// demonstrated by test/CodeGen/Hexagon/tail-dup-subreg-abort.ll.
// Disable tail duplication for this case for now, until the problem is
// fixed.
- for (auto SB : TailBB.successors()) {
+ for (auto *SB : TailBB.successors()) {
for (auto &I : *SB) {
if (!I.isPHI())
break;
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index 2a987ee3eedf..4116231c005f 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -916,7 +916,7 @@ void TargetInstrInfo::genAlternativeCodeSequence(
}
bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(
- const MachineInstr &MI, AAResults *AA) const {
+ const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -952,7 +952,7 @@ bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(
return false;
// Avoid instructions which load from potentially varying memory.
- if (MI.mayLoad() && !MI.isDereferenceableInvariantLoad(AA))
+ if (MI.mayLoad() && !MI.isDereferenceableInvariantLoad())
return false;
// If any of the registers accessed are non-constant, conservatively assume
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 6a595a4c748b..a342a4dd1e25 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1145,7 +1145,7 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT,
/// specified register class are all legal.
bool TargetLoweringBase::isLegalRC(const TargetRegisterInfo &TRI,
const TargetRegisterClass &RC) const {
- for (auto I = TRI.legalclasstypes_begin(RC); *I != MVT::Other; ++I)
+ for (const auto *I = TRI.legalclasstypes_begin(RC); *I != MVT::Other; ++I)
if (isTypeLegal(*I))
return true;
return false;
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index c44fd9f97383..17fe819fa900 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1450,9 +1450,9 @@ void
TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
TiedPairList &TiedPairs,
unsigned &Dist) {
- bool IsEarlyClobber = llvm::find_if(TiedPairs, [MI](auto const &TP) {
- return MI->getOperand(TP.second).isEarlyClobber();
- }) != TiedPairs.end();
+ bool IsEarlyClobber = llvm::any_of(TiedPairs, [MI](auto const &TP) {
+ return MI->getOperand(TP.second).isEarlyClobber();
+ });
bool RemovedKillFlag = false;
bool AllUsesCopied = true;
diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp
index 166a3c413f6a..8dc8d381ad16 100644
--- a/llvm/lib/CodeGen/TypePromotion.cpp
+++ b/llvm/lib/CodeGen/TypePromotion.cpp
@@ -446,7 +446,7 @@ void IRPromoter::ExtendSources() {
// Now, insert extending instructions between the sources and their users.
LLVM_DEBUG(dbgs() << "IR Promotion: Promoting sources:\n");
- for (auto V : Sources) {
+ for (auto *V : Sources) {
LLVM_DEBUG(dbgs() << " - " << *V << "\n");
if (auto *I = dyn_cast<Instruction>(V))
InsertZExt(I, I);
@@ -524,7 +524,7 @@ void IRPromoter::TruncateSinks() {
// Fix up any stores or returns that use the results of the promoted
// chain.
- for (auto I : Sinks) {
+ for (auto *I : Sinks) {
LLVM_DEBUG(dbgs() << "IR Promotion: For Sink: " << *I << "\n");
// Handle calls separately as we need to iterate over arg operands.
@@ -570,7 +570,7 @@ void IRPromoter::Cleanup() {
LLVM_DEBUG(dbgs() << "IR Promotion: Cleanup..\n");
// Some zexts will now have become redundant, along with their trunc
// operands, so remove them
- for (auto V : Visited) {
+ for (auto *V : Visited) {
if (!isa<ZExtInst>(V))
continue;
diff --git a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
index 8b5b585090f5..8225d4ea6996 100644
--- a/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
+++ b/llvm/lib/CodeGen/VLIWMachineScheduler.cpp
@@ -579,7 +579,7 @@ static inline bool isSingleUnscheduledSucc(SUnit *SU, SUnit *SU2) {
/// pressure, then return 0.
int ConvergingVLIWScheduler::pressureChange(const SUnit *SU, bool isBotUp) {
PressureDiff &PD = DAG->getPressureDiff(SU);
- for (auto &P : PD) {
+ for (const auto &P : PD) {
if (!P.isValid())
continue;
// The pressure differences are computed bottom-up, so the comparison for
diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp
index 298359dea9af..62b7f629f403 100644
--- a/llvm/lib/DWARFLinker/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp
@@ -504,9 +504,14 @@ unsigned DWARFLinker::shouldKeepSubprogramDIE(
&DIE);
return Flags;
}
+ if (*LowPc > *HighPc) {
+ reportWarning("low_pc greater than high_pc. Range will be discarded.\n",
+ File, &DIE);
+ return Flags;
+ }
// Replace the debug map range with a more accurate one.
- Ranges[*LowPc] = ObjFileAddressRange(*HighPc, MyInfo.AddrAdjust);
+ Ranges.insert({*LowPc, *HighPc}, MyInfo.AddrAdjust);
Unit.addFunctionRange(*LowPc, *HighPc, MyInfo.AddrAdjust);
return Flags;
}
@@ -1575,7 +1580,7 @@ void DWARFLinker::patchRangesForUnit(const CompileUnit &Unit,
DWARFDataExtractor RangeExtractor(OrigDwarf.getDWARFObj(),
OrigDwarf.getDWARFObj().getRangesSection(),
OrigDwarf.isLittleEndian(), AddressSize);
- auto InvalidRange = FunctionRanges.end(), CurrRange = InvalidRange;
+ Optional<std::pair<AddressRange, int64_t>> CurrRange;
DWARFUnit &OrigUnit = Unit.getOrigUnit();
auto OrigUnitDie = OrigUnit.getUnitDIE(false);
uint64_t OrigLowPc =
@@ -1598,12 +1603,11 @@ void DWARFLinker::patchRangesForUnit(const CompileUnit &Unit,
if (!Entries.empty()) {
const DWARFDebugRangeList::RangeListEntry &First = Entries.front();
- if (CurrRange == InvalidRange ||
- First.StartAddress + OrigLowPc < CurrRange.start() ||
- First.StartAddress + OrigLowPc >= CurrRange.stop()) {
- CurrRange = FunctionRanges.find(First.StartAddress + OrigLowPc);
- if (CurrRange == InvalidRange ||
- CurrRange.start() > First.StartAddress + OrigLowPc) {
+ if (!CurrRange ||
+ !CurrRange->first.contains(First.StartAddress + OrigLowPc)) {
+ CurrRange = FunctionRanges.getRangeValueThatContains(
+ First.StartAddress + OrigLowPc);
+ if (!CurrRange) {
reportWarning("no mapping for range.", File);
continue;
}
@@ -1710,7 +1714,7 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit,
// in NewRows.
std::vector<DWARFDebugLine::Row> Seq;
const auto &FunctionRanges = Unit.getFunctionRanges();
- auto InvalidRange = FunctionRanges.end(), CurrRange = InvalidRange;
+ Optional<std::pair<AddressRange, int64_t>> CurrRange;
// FIXME: This logic is meant to generate exactly the same output as
// Darwin's classic dsymutil. There is a nicer way to implement this
@@ -1729,19 +1733,14 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit,
// it is marked as end_sequence in the input (because in that
// case, the relocation offset is accurate and that entry won't
// serve as the start of another function).
- if (CurrRange == InvalidRange || Row.Address.Address < CurrRange.start() ||
- Row.Address.Address > CurrRange.stop() ||
- (Row.Address.Address == CurrRange.stop() && !Row.EndSequence)) {
+ if (!CurrRange || !CurrRange->first.contains(Row.Address.Address) ||
+ (Row.Address.Address == CurrRange->first.end() && !Row.EndSequence)) {
// We just stepped out of a known range. Insert an end_sequence
// corresponding to the end of the range.
- uint64_t StopAddress = CurrRange != InvalidRange
- ? CurrRange.stop() + CurrRange.value()
- : -1ULL;
- CurrRange = FunctionRanges.find(Row.Address.Address);
- bool CurrRangeValid =
- CurrRange != InvalidRange && CurrRange.start() <= Row.Address.Address;
- if (!CurrRangeValid) {
- CurrRange = InvalidRange;
+ uint64_t StopAddress =
+ CurrRange ? CurrRange->first.end() + CurrRange->second : -1ULL;
+ CurrRange = FunctionRanges.getRangeValueThatContains(Row.Address.Address);
+ if (!CurrRange) {
if (StopAddress != -1ULL) {
// Try harder by looking in the Address ranges map.
// There are corner cases where this finds a
@@ -1749,14 +1748,9 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit,
// for now do as dsymutil.
// FIXME: Understand exactly what cases this addresses and
// potentially remove it along with the Ranges map.
- auto Range = Ranges.lower_bound(Row.Address.Address);
- if (Range != Ranges.begin() && Range != Ranges.end())
- --Range;
-
- if (Range != Ranges.end() && Range->first <= Row.Address.Address &&
- Range->second.HighPC >= Row.Address.Address) {
- StopAddress = Row.Address.Address + Range->second.Offset;
- }
+ if (Optional<std::pair<AddressRange, int64_t>> Range =
+ Ranges.getRangeValueThatContains(Row.Address.Address))
+ StopAddress = Row.Address.Address + (*Range).second;
}
}
if (StopAddress != -1ULL && !Seq.empty()) {
@@ -1772,7 +1766,7 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit,
insertLineSequence(Seq, NewRows);
}
- if (!CurrRangeValid)
+ if (!CurrRange)
continue;
}
@@ -1781,7 +1775,7 @@ void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit,
continue;
// Relocate row address and add it to the current sequence.
- Row.Address.Address += CurrRange.value();
+ Row.Address.Address += CurrRange->second;
Seq.emplace_back(Row);
if (Row.EndSequence)
@@ -1921,11 +1915,9 @@ void DWARFLinker::patchFrameInfoForObject(const DWARFFile &File,
// the function entry point, thus we can't just lookup the address
// in the debug map. Use the AddressInfo's range map to see if the FDE
// describes something that we can relocate.
- auto Range = Ranges.upper_bound(Loc);
- if (Range != Ranges.begin())
- --Range;
- if (Range == Ranges.end() || Range->first > Loc ||
- Range->second.HighPC <= Loc) {
+ Optional<std::pair<AddressRange, int64_t>> Range =
+ Ranges.getRangeValueThatContains(Loc);
+ if (!Range) {
// The +4 is to account for the size of the InitialLength field itself.
InputOffset = EntryOffset + InitialLength + 4;
continue;
@@ -1953,7 +1945,7 @@ void DWARFLinker::patchFrameInfoForObject(const DWARFFile &File,
// fields that will get reconstructed by emitFDE().
unsigned FDERemainingBytes = InitialLength - (4 + AddrSize);
TheDwarfEmitter->emitFDE(IteratorInserted.first->getValue(), AddrSize,
- Loc + Range->second.Offset,
+ Loc + Range->second,
FrameData.substr(InputOffset, FDERemainingBytes));
InputOffset += FDERemainingBytes;
}
diff --git a/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp
index e9e8be7fd008..1cb20c0bb948 100644
--- a/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp
@@ -105,11 +105,7 @@ void CompileUnit::addLabelLowPc(uint64_t LabelLowPc, int64_t PcOffset) {
void CompileUnit::addFunctionRange(uint64_t FuncLowPc, uint64_t FuncHighPc,
int64_t PcOffset) {
- // Don't add empty ranges to the interval map. They are a problem because
- // the interval map expects half open intervals. This is safe because they
- // are empty anyway.
- if (FuncHighPc != FuncLowPc)
- Ranges.insert(FuncLowPc, FuncHighPc, PcOffset);
+ Ranges.insert({FuncLowPc, FuncHighPc}, PcOffset);
this->LowPc = std::min(LowPc, FuncLowPc + PcOffset);
this->HighPc = std::max(HighPc, FuncHighPc + PcOffset);
}
diff --git a/llvm/lib/DWARFLinker/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/DWARFStreamer.cpp
index 55ff6b14f945..a00e51fcf135 100644
--- a/llvm/lib/DWARFLinker/DWARFStreamer.cpp
+++ b/llvm/lib/DWARFLinker/DWARFStreamer.cpp
@@ -321,13 +321,14 @@ void DwarfStreamer::emitSwiftReflectionSection(
/// sized addresses describing the ranges.
void DwarfStreamer::emitRangesEntries(
int64_t UnitPcOffset, uint64_t OrigLowPc,
- const FunctionIntervals::const_iterator &FuncRange,
+ Optional<std::pair<AddressRange, int64_t>> FuncRange,
const std::vector<DWARFDebugRangeList::RangeListEntry> &Entries,
unsigned AddressSize) {
MS->switchSection(MC->getObjectFileInfo()->getDwarfRangesSection());
// Offset each range by the right amount.
- int64_t PcOffset = Entries.empty() ? 0 : FuncRange.value() + UnitPcOffset;
+ int64_t PcOffset =
+ (Entries.empty() || !FuncRange) ? 0 : FuncRange->second + UnitPcOffset;
for (const auto &Range : Entries) {
if (Range.isBaseAddressSelectionEntry(AddressSize)) {
warn("unsupported base address selection operation",
@@ -339,8 +340,7 @@ void DwarfStreamer::emitRangesEntries(
continue;
// All range entries should lie in the function range.
- if (!(Range.StartAddress + OrigLowPc >= FuncRange.start() &&
- Range.EndAddress + OrigLowPc <= FuncRange.stop()))
+ if (!FuncRange->first.contains(Range.StartAddress + OrigLowPc))
warn("inconsistent range data.", "emitting debug_ranges");
MS->emitIntValue(Range.StartAddress + PcOffset, AddressSize);
MS->emitIntValue(Range.EndAddress + PcOffset, AddressSize);
@@ -365,11 +365,13 @@ void DwarfStreamer::emitUnitRangesEntries(CompileUnit &Unit,
// IntervalMap will have coalesced the non-linked ranges, but here
// we want to coalesce the linked addresses.
std::vector<std::pair<uint64_t, uint64_t>> Ranges;
- const auto &FunctionRanges = Unit.getFunctionRanges();
- for (auto Range = FunctionRanges.begin(), End = FunctionRanges.end();
- Range != End; ++Range)
- Ranges.push_back(std::make_pair(Range.start() + Range.value(),
- Range.stop() + Range.value()));
+ const RangesTy &FunctionRanges = Unit.getFunctionRanges();
+ for (size_t Idx = 0; Idx < FunctionRanges.size(); Idx++) {
+ std::pair<AddressRange, int64_t> CurRange = FunctionRanges[Idx];
+
+ Ranges.push_back(std::make_pair(CurRange.first.start() + CurRange.second,
+ CurRange.first.end() + CurRange.second));
+ }
// The object addresses where sorted, but again, the linked
// addresses might end up in a different order.
diff --git a/llvm/lib/DWP/DWP.cpp b/llvm/lib/DWP/DWP.cpp
index 34615a73e328..44e39c019e0c 100644
--- a/llvm/lib/DWP/DWP.cpp
+++ b/llvm/lib/DWP/DWP.cpp
@@ -16,6 +16,7 @@
#include "llvm/MC/MCObjectFileInfo.h"
#include "llvm/MC/MCTargetOptionsCommandFlags.h"
#include "llvm/Object/Decompressor.h"
+#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/MemoryBuffer.h"
using namespace llvm;
@@ -273,12 +274,16 @@ static Error createError(StringRef Name, Error E) {
static Error
handleCompressedSection(std::deque<SmallString<32>> &UncompressedSections,
- StringRef &Name, StringRef &Contents) {
- if (!Decompressor::isGnuStyle(Name))
+ SectionRef Sec, StringRef Name, StringRef &Contents) {
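+ // Only ELF sections carrying SHF_COMPRESSED are decompressed here; the
+ // legacy GNU ".zdebug"-style sections are no longer special-cased.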
+ auto *Obj = dyn_cast<ELFObjectFileBase>(Sec.getObject());
+ if (!Obj ||
+ !(static_cast<ELFSectionRef>(Sec).getFlags() & ELF::SHF_COMPRESSED))
return Error::success();
-
- Expected<Decompressor> Dec =
- Decompressor::create(Name, Contents, false /*IsLE*/, false /*Is64Bit*/);
+ bool IsLE = isa<object::ELF32LEObjectFile>(Obj) ||
+ isa<object::ELF64LEObjectFile>(Obj);
+ bool Is64 = isa<object::ELF64LEObjectFile>(Obj) ||
+ isa<object::ELF64BEObjectFile>(Obj);
+ Expected<Decompressor> Dec = Decompressor::create(Name, Contents, IsLE, Is64);
if (!Dec)
return createError(Name, Dec.takeError());
@@ -286,7 +291,6 @@ handleCompressedSection(std::deque<SmallString<32>> &UncompressedSections,
if (Error E = Dec->resizeAndDecompress(UncompressedSections.back()))
return createError(Name, std::move(E));
- Name = Name.substr(2); // Drop ".z"
Contents = UncompressedSections.back();
return Error::success();
}
@@ -494,7 +498,8 @@ Error handleSection(
return ContentsOrErr.takeError();
StringRef Contents = *ContentsOrErr;
- if (auto Err = handleCompressedSection(UncompressedSections, Name, Contents))
+ if (auto Err = handleCompressedSection(UncompressedSections, Section, Name,
+ Contents))
return Err;
Name = Name.substr(Name.find_first_not_of("._"));
diff --git a/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp b/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
index 27f63b9edcd0..7f4511258c64 100644
--- a/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
+++ b/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
@@ -36,8 +36,10 @@ using namespace llvm::codeview;
namespace {
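+// error(X) is wrapped in do { } while (false) so that "error(X);" expands to
+// a single statement and composes safely with unbraced if/else.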
#define error(X) \
- if (auto EC = X) \
- return EC;
+ do { \
+ if (auto EC = X) \
+ return EC; \
+ } while (false)
static const EnumEntry<TypeLeafKind> LeafTypeNames[] = {
#define CV_TYPE(enum, val) {#enum, enum},
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
index 2e567d8bc7ee..19d7d659a86a 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -1645,7 +1645,7 @@ class DWARFObjInMemory final : public DWARFObject {
/// provided by Data. Otherwise leaves it unchanged.
Error maybeDecompress(const object::SectionRef &Sec, StringRef Name,
StringRef &Data) {
- if (!Decompressor::isCompressed(Sec))
+ if (!Sec.isCompressed())
return Error::success();
Expected<Decompressor> Decompressor =
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index 33856c12b3c9..d2ed4fe018b5 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -29,10 +29,6 @@
using namespace llvm;
using namespace dwarf;
-namespace llvm {
-class DwarfContext;
-}
-
using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind;
namespace {
diff --git a/llvm/lib/DebugInfo/Symbolize/Markup.cpp b/llvm/lib/DebugInfo/Symbolize/Markup.cpp
index 9bc65e763287..aa8a89812227 100644
--- a/llvm/lib/DebugInfo/Symbolize/Markup.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/Markup.cpp
@@ -100,6 +100,9 @@ Optional<MarkupNode> MarkupParser::nextNode() {
}
void MarkupParser::flush() {
+ Buffer.clear();
+ NextIdx = 0;
+ Line = {};
if (InProgressMultiline.empty())
return;
FinishedMultiline.swap(InProgressMultiline);
diff --git a/llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp b/llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp
index 3363fe5e531f..91a51485026e 100644
--- a/llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/MarkupFilter.cpp
@@ -10,14 +10,22 @@
/// This file defines the implementation of a filter that replaces symbolizer
/// markup with human-readable expressions.
///
+/// See https://llvm.org/docs/SymbolizerMarkupFormat.html
+///
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/Symbolize/MarkupFilter.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/DebugInfo/Symbolize/Markup.h"
+#include "llvm/Debuginfod/Debuginfod.h"
#include "llvm/Demangle/Demangle.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
@@ -28,30 +36,195 @@ MarkupFilter::MarkupFilter(raw_ostream &OS, Optional<bool> ColorsEnabled)
: OS(OS), ColorsEnabled(ColorsEnabled.value_or(
WithColor::defaultAutoDetectFunction()(OS))) {}
-void MarkupFilter::beginLine(StringRef Line) {
+void MarkupFilter::filter(StringRef Line) {
this->Line = Line;
resetColor();
+
+ Parser.parseLine(Line);
+ SmallVector<MarkupNode> DeferredNodes;
+ // See if the line is contextual (i.e. it contains a contextual element).
+ // In this case, anything after the contextual element is elided, or the whole
+ // line may be elided.
+ while (Optional<MarkupNode> Node = Parser.nextNode()) {
+ // If this was a contextual line, then summarily stop processing.
+ if (tryContextualElement(*Node, DeferredNodes))
+ return;
+ // This node may yet be part of an elided contextual line.
+ DeferredNodes.push_back(*Node);
+ }
+
+ // This was not a contextual line, so nothing in it should be elided.
+ endAnyModuleInfoLine();
+ for (const MarkupNode &Node : DeferredNodes)
+ filterNode(Node);
}
-void MarkupFilter::filter(const MarkupNode &Node) {
- if (!checkTag(Node))
- return;
+void MarkupFilter::finish() {
+ Parser.flush();
+ while (Optional<MarkupNode> Node = Parser.nextNode())
+ filterNode(*Node);
+ endAnyModuleInfoLine();
+ resetColor();
+ Modules.clear();
+ MMaps.clear();
+}
- if (trySGR(Node))
- return;
+// See if the given node is a contextual element and handle it if so. This may
+// either output or defer the element; in the former case, it will first emit
+// any DeferredNodes.
+//
+// Returns true if the given element was a contextual element. In this case,
+// DeferredNodes should be considered handled and should not be emitted. The
+// rest of the containing line must also be ignored in case the element was
+// deferred to a following line.
+bool MarkupFilter::tryContextualElement(
+ const MarkupNode &Node, const SmallVector<MarkupNode> &DeferredNodes) {
+ if (tryMMap(Node, DeferredNodes))
+ return true;
+ if (tryReset(Node, DeferredNodes))
+ return true;
+ return tryModule(Node, DeferredNodes);
+}
- if (Node.Tag == "symbol") {
- if (!checkNumFields(Node, 1))
- return;
+bool MarkupFilter::tryMMap(const MarkupNode &Node,
+ const SmallVector<MarkupNode> &DeferredNodes) {
+ if (Node.Tag != "mmap")
+ return false;
+ Optional<MMap> ParsedMMap = parseMMap(Node);
+ if (!ParsedMMap)
+ return true;
+
+ if (const MMap *M = overlappingMMap(*ParsedMMap)) {
+ WithColor::error(errs())
+ << formatv("overlapping mmap: #{0:x} [{1:x},{2:x})\n", M->Mod->ID,
+ M->Addr, M->Addr + M->Size);
+ reportLocation(Node.Fields[0].begin());
+ return true;
+ }
+
+ auto Res = MMaps.emplace(ParsedMMap->Addr, std::move(*ParsedMMap));
+ assert(Res.second && "Overlap check should ensure emplace succeeds.");
+ MMap &MMap = Res.first->second;
+
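+ // Consecutive mmap elements for the same module are folded into one module
+ // info line; an mmap for a different module ends the current line first.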
+ if (!MIL || MIL->Mod != MMap.Mod) {
+ endAnyModuleInfoLine();
+ for (const MarkupNode &Node : DeferredNodes)
+ filterNode(Node);
+ beginModuleInfoLine(MMap.Mod);
+ OS << "; adds";
+ }
+ MIL->MMaps.push_back(&MMap);
+ return true;
+}
+
+bool MarkupFilter::tryReset(const MarkupNode &Node,
+ const SmallVector<MarkupNode> &DeferredNodes) {
+ if (Node.Tag != "reset")
+ return false;
+ if (!checkNumFields(Node, 0))
+ return true;
+
+ if (!Modules.empty() || !MMaps.empty()) {
+ endAnyModuleInfoLine();
+ for (const MarkupNode &Node : DeferredNodes)
+ filterNode(Node);
highlight();
- OS << llvm::demangle(Node.Fields.front().str());
+ OS << "[[[reset]]]" << lineEnding();
restoreColor();
+
+ Modules.clear();
+ MMaps.clear();
+ }
+ return true;
+}
+
+bool MarkupFilter::tryModule(const MarkupNode &Node,
+ const SmallVector<MarkupNode> &DeferredNodes) {
+ if (Node.Tag != "module")
+ return false;
+ Optional<Module> ParsedModule = parseModule(Node);
+ if (!ParsedModule)
+ return true;
+
+ auto Res = Modules.try_emplace(
+ ParsedModule->ID, std::make_unique<Module>(std::move(*ParsedModule)));
+ if (!Res.second) {
+ WithColor::error(errs()) << "duplicate module ID\n";
+ reportLocation(Node.Fields[0].begin());
+ return true;
+ }
+ Module &Module = *Res.first->second;
+
+ endAnyModuleInfoLine();
+ for (const MarkupNode &Node : DeferredNodes)
+ filterNode(Node);
+ beginModuleInfoLine(&Module);
+ OS << "; BuildID=";
+ highlightValue();
+ OS << toHex(Module.BuildID, /*LowerCase=*/true);
+ highlight();
+ return true;
+}
+
+void MarkupFilter::beginModuleInfoLine(const Module *M) {
+ highlight();
+ OS << "[[[ELF module";
+ highlightValue();
+ OS << formatv(" #{0:x} \"{1}\"", M->ID, M->Name);
+ highlight();
+ MIL = ModuleInfoLine{M};
+}
+
+void MarkupFilter::endAnyModuleInfoLine() {
+ if (!MIL)
return;
+ llvm::stable_sort(MIL->MMaps, [](const MMap *A, const MMap *B) {
+ return A->Addr < B->Addr;
+ });
+ for (const MMap *M : MIL->MMaps) {
+ OS << (M == MIL->MMaps.front() ? ' ' : '-');
+ highlightValue();
+ OS << formatv("{0:x}", M->Addr);
+ highlight();
+ OS << '(';
+ highlightValue();
+ OS << M->Mode;
+ highlight();
+ OS << ')';
}
+ OS << "]]]" << lineEnding();
+ restoreColor();
+ MIL.reset();
+}
+
+// Handle a node that is known not to be a contextual element.
+void MarkupFilter::filterNode(const MarkupNode &Node) {
+ if (!checkTag(Node))
+ return;
+ if (tryPresentation(Node))
+ return;
+ if (trySGR(Node))
+ return;
OS << Node.Text;
}
+bool MarkupFilter::tryPresentation(const MarkupNode &Node) {
+ return trySymbol(Node);
+}
+
+bool MarkupFilter::trySymbol(const MarkupNode &Node) {
+ if (Node.Tag != "symbol")
+ return false;
+ if (!checkNumFields(Node, 1))
+ return true;
+
+ highlight();
+ OS << llvm::demangle(Node.Fields.front().str());
+ restoreColor();
+ return true;
+}
+
bool MarkupFilter::trySGR(const MarkupNode &Node) {
if (Node.Text == "\033[0m") {
resetColor();
@@ -93,6 +266,13 @@ void MarkupFilter::highlight() {
Bold);
}
+// Begin highlighting a field within a highlighted markup string.
+void MarkupFilter::highlightValue() {
+ if (!ColorsEnabled)
+ return;
+ OS.changeColor(raw_ostream::Colors::GREEN, Bold);
+}
+
// Set the output stream's color to the current color and bold state of the SGR
// abstract machine.
void MarkupFilter::restoreColor() {
@@ -117,6 +297,139 @@ void MarkupFilter::resetColor() {
OS.resetColor();
}
+// This macro reduces the Optional boilerplate below: when a parse helper
+// returns None, the caller almost always just returns None itself.
+#define ASSIGN_OR_RETURN_NONE(TYPE, NAME, EXPR) \
+ auto NAME##Opt = (EXPR); \
+ if (!NAME##Opt) \
+ return None; \
+ TYPE NAME = std::move(*NAME##Opt)
+
+Optional<MarkupFilter::Module>
+MarkupFilter::parseModule(const MarkupNode &Element) const {
+ if (!checkNumFieldsAtLeast(Element, 3))
+ return None;
+ ASSIGN_OR_RETURN_NONE(uint64_t, ID, parseModuleID(Element.Fields[0]));
+ StringRef Name = Element.Fields[1];
+ StringRef Type = Element.Fields[2];
+ if (Type != "elf") {
+ WithColor::error() << "unknown module type\n";
+ reportLocation(Type.begin());
+ return None;
+ }
+ if (!checkNumFields(Element, 4))
+ return None;
+ ASSIGN_OR_RETURN_NONE(SmallVector<uint8_t>, BuildID,
+ parseBuildID(Element.Fields[3]));
+ return Module{ID, Name.str(), std::move(BuildID)};
+}
+
+Optional<MarkupFilter::MMap>
+MarkupFilter::parseMMap(const MarkupNode &Element) const {
+ if (!checkNumFieldsAtLeast(Element, 3))
+ return None;
+ ASSIGN_OR_RETURN_NONE(uint64_t, Addr, parseAddr(Element.Fields[0]));
+ ASSIGN_OR_RETURN_NONE(uint64_t, Size, parseSize(Element.Fields[1]));
+ StringRef Type = Element.Fields[2];
+ if (Type != "load") {
+ WithColor::error() << "unknown mmap type\n";
+ reportLocation(Type.begin());
+ return None;
+ }
+ if (!checkNumFields(Element, 6))
+ return None;
+ ASSIGN_OR_RETURN_NONE(uint64_t, ID, parseModuleID(Element.Fields[3]));
+ ASSIGN_OR_RETURN_NONE(std::string, Mode, parseMode(Element.Fields[4]));
+ auto It = Modules.find(ID);
+ if (It == Modules.end()) {
+ WithColor::error() << "unknown module ID\n";
+ reportLocation(Element.Fields[3].begin());
+ return None;
+ }
+ ASSIGN_OR_RETURN_NONE(uint64_t, ModuleRelativeAddr,
+ parseAddr(Element.Fields[5]));
+ return MMap{Addr, Size, It->second.get(), std::move(Mode),
+ ModuleRelativeAddr};
+}
+
+// Parse an address (%p in the spec).
+Optional<uint64_t> MarkupFilter::parseAddr(StringRef Str) const {
+ if (Str.empty()) {
+ reportTypeError(Str, "address");
+ return None;
+ }
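+ // A bare run of zeros (e.g. "0" or "0000") is accepted as address 0 without
+ // requiring the 0x prefix.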
+ if (all_of(Str, [](char C) { return C == '0'; }))
+ return 0;
+ if (!Str.startswith("0x")) {
+ reportTypeError(Str, "address");
+ return None;
+ }
+ uint64_t Addr;
+ if (Str.drop_front(2).getAsInteger(16, Addr)) {
+ reportTypeError(Str, "address");
+ return None;
+ }
+ return Addr;
+}
+
+// Parse a module ID (%i in the spec).
+Optional<uint64_t> MarkupFilter::parseModuleID(StringRef Str) const {
+ uint64_t ID;
+ if (Str.getAsInteger(0, ID)) {
+ reportTypeError(Str, "module ID");
+ return None;
+ }
+ return ID;
+}
+
+// Parse a size (%i in the spec).
+Optional<uint64_t> MarkupFilter::parseSize(StringRef Str) const {
+ uint64_t ID;
+ if (Str.getAsInteger(0, ID)) {
+ reportTypeError(Str, "size");
+ return None;
+ }
+ return ID;
+}
+
+// Parse a build ID (%x in the spec).
+Optional<SmallVector<uint8_t>> MarkupFilter::parseBuildID(StringRef Str) const {
+ std::string Bytes;
+ if (Str.empty() || Str.size() % 2 || !tryGetFromHex(Str, Bytes)) {
+ reportTypeError(Str, "build ID");
+ return None;
+ }
+ ArrayRef<uint8_t> BuildID(reinterpret_cast<const uint8_t *>(Bytes.data()),
+ Bytes.size());
+ return SmallVector<uint8_t>(BuildID.begin(), BuildID.end());
+}
+
+// Parses the mode string for an mmap element.
+Optional<std::string> MarkupFilter::parseMode(StringRef Str) const {
+ if (Str.empty()) {
+ reportTypeError(Str, "mode");
+ return None;
+ }
+
+ // Pop off each of r/R, w/W, and x/X from the front, in that order.
+ StringRef Remainder = Str;
+ if (!Remainder.empty() && tolower(Remainder.front()) == 'r')
+ Remainder = Remainder.drop_front();
+ if (!Remainder.empty() && tolower(Remainder.front()) == 'w')
+ Remainder = Remainder.drop_front();
+ if (!Remainder.empty() && tolower(Remainder.front()) == 'x')
+ Remainder = Remainder.drop_front();
+
+ // If anything remains, then the string wasn't a mode.
+ if (!Remainder.empty()) {
+ reportTypeError(Str, "mode");
+ return None;
+ }
+
+ // Normalize the mode.
+ return Str.lower();
+}
+
bool MarkupFilter::checkTag(const MarkupNode &Node) const {
if (any_of(Node.Tag, [](char C) { return C < 'a' || C > 'z'; })) {
WithColor::error(errs()) << "tags must be all lowercase characters\n";
@@ -126,18 +439,66 @@ bool MarkupFilter::checkTag(const MarkupNode &Node) const {
return true;
}
-bool MarkupFilter::checkNumFields(const MarkupNode &Node, size_t Size) const {
- if (Node.Fields.size() != Size) {
+bool MarkupFilter::checkNumFields(const MarkupNode &Element,
+ size_t Size) const {
+ if (Element.Fields.size() != Size) {
WithColor::error(errs()) << "expected " << Size << " fields; found "
- << Node.Fields.size() << "\n";
- reportLocation(Node.Tag.end());
+ << Element.Fields.size() << "\n";
+ reportLocation(Element.Tag.end());
return false;
}
return true;
}
+bool MarkupFilter::checkNumFieldsAtLeast(const MarkupNode &Element,
+ size_t Size) const {
+ if (Element.Fields.size() < Size) {
+ WithColor::error(errs())
+ << "expected at least " << Size << " fields; found "
+ << Element.Fields.size() << "\n";
+ reportLocation(Element.Tag.end());
+ return false;
+ }
+ return true;
+}
+
+void MarkupFilter::reportTypeError(StringRef Str, StringRef TypeName) const {
+ WithColor::error(errs()) << "expected " << TypeName << "; found '" << Str
+ << "'\n";
+ reportLocation(Str.begin());
+}
+
+// Prints two lines that point out the given location in the current Line using
+// a caret. The iterator must be within the bounds of the most recent line
+// passed to filter().
void MarkupFilter::reportLocation(StringRef::iterator Loc) const {
errs() << Line;
WithColor(errs().indent(Loc - Line.begin()), HighlightColor::String) << '^';
errs() << '\n';
}
+
+// Checks for an existing mmap that overlaps the given one and returns a
+// pointer to one of them.
+const MarkupFilter::MMap *MarkupFilter::overlappingMMap(const MMap &Map) const {
+ // If the given map contains the start of another mmap, they overlap.
+ auto I = MMaps.upper_bound(Map.Addr);
+ if (I != MMaps.end() && Map.contains(I->second.Addr))
+ return &I->second;
+
+ // If no element starts inside the given mmap, the only possible overlap would
+ // be if the preceding mmap contains the start point of the given mmap.
+ if (I != MMaps.begin()) {
+ --I;
+ if (I->second.contains(Map.Addr))
+ return &I->second;
+ }
+ return nullptr;
+}
+
+StringRef MarkupFilter::lineEnding() const {
+ return Line.endswith("\r\n") ? "\r\n" : "\n";
+}
+
+bool MarkupFilter::MMap::contains(uint64_t Addr) const {
+ return this->Addr <= Addr && Addr < this->Addr + Size;
+}
diff --git a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
index 43b9c2ba400b..dc07eaeaf615 100644
--- a/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/COFFLinkGraphBuilder.cpp
@@ -524,4 +524,4 @@ COFFLinkGraphBuilder::exportCOMDATSymbol(COFFSymbolIndex SymIndex,
}
} // namespace jitlink
-} // namespace llvm
\ No newline at end of file
+} // namespace llvm
diff --git a/llvm/lib/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.cpp b/llvm/lib/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.cpp
index 0fc366bf505f..2a60d8206f63 100644
--- a/llvm/lib/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/DWARFRecordSectionSplitter.cpp
@@ -107,7 +107,7 @@ Error DWARFRecordSectionSplitter::processBlock(
}
uint64_t BlockSize = BlockReader.getOffset() - RecordStartOffset;
- auto &NewBlock = G.splitBlock(B, BlockSize);
+ auto &NewBlock = G.splitBlock(B, BlockSize, &Cache);
(void)NewBlock;
LLVM_DEBUG(dbgs() << " Extracted " << NewBlock << "\n");
}
diff --git a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
index c60f4b3b263c..70a3c404d836 100644
--- a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
@@ -48,6 +48,10 @@ JITTargetMachineBuilder::createTargetMachine() {
if (!TheTarget)
return make_error<StringError>(std::move(ErrMsg), inconvertibleErrorCode());
+ if (!TheTarget->hasJIT())
+ return make_error<StringError>("Target has no JIT support",
+ inconvertibleErrorCode());
+
auto *TM =
TheTarget->createTargetMachine(TT.getTriple(), CPU, Features.getString(),
Options, RM, CM, OptLevel, /*JIT*/ true);
diff --git a/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
new file mode 100644
index 000000000000..c2e7baabb994
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/MapperJITLinkMemoryManager.cpp
@@ -0,0 +1,135 @@
+//=== MapperJITLinkMemoryManager.cpp - Memory management with MemoryMapper ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/MapperJITLinkMemoryManager.h"
+
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/Support/Process.h"
+
+#include <limits>
+
+using namespace llvm::jitlink;
+
+namespace llvm {
+namespace orc {
+
+class MapperJITLinkMemoryManager::InFlightAlloc
+ : public JITLinkMemoryManager::InFlightAlloc {
+public:
+ InFlightAlloc(MapperJITLinkMemoryManager &Parent, LinkGraph &G,
+ ExecutorAddr AllocAddr,
+ std::vector<MemoryMapper::AllocInfo::SegInfo> Segs)
+ : Parent(Parent), G(G), AllocAddr(AllocAddr), Segs(std::move(Segs)) {}
+
+ void finalize(OnFinalizedFunction OnFinalize) override {
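+ // Hand the prepared segments and the graph's allocation actions to the
+ // mapper; the finalized allocation is identified by its base address.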
+ MemoryMapper::AllocInfo AI;
+ AI.MappingBase = AllocAddr;
+
+ std::swap(AI.Segments, Segs);
+ std::swap(AI.Actions, G.allocActions());
+
+ Parent.Mapper->initialize(AI, [&](Expected<ExecutorAddr> Result) {
+ if (!Result) {
+ OnFinalize(Result.takeError());
+ return;
+ }
+
+ OnFinalize(FinalizedAlloc(*Result));
+ });
+ }
+
+ void abandon(OnAbandonedFunction OnFinalize) override {
+ Parent.Mapper->release({AllocAddr}, std::move(OnFinalize));
+ }
+
+private:
+ MapperJITLinkMemoryManager &Parent;
+ LinkGraph &G;
+ ExecutorAddr AllocAddr;
+ std::vector<MemoryMapper::AllocInfo::SegInfo> Segs;
+};
+
+MapperJITLinkMemoryManager::MapperJITLinkMemoryManager(
+ std::unique_ptr<MemoryMapper> Mapper)
+ : Mapper(std::move(Mapper)) {}
+
+void MapperJITLinkMemoryManager::allocate(const JITLinkDylib *JD, LinkGraph &G,
+ OnAllocatedFunction OnAllocated) {
+ BasicLayout BL(G);
+
+  // Find the required address space.
+ auto SegsSizes = BL.getContiguousPageBasedLayoutSizes(Mapper->getPageSize());
+ if (!SegsSizes) {
+ OnAllocated(SegsSizes.takeError());
+ return;
+ }
+
+ // Check if total size fits in address space
+ if (SegsSizes->total() > std::numeric_limits<size_t>::max()) {
+ OnAllocated(make_error<JITLinkError>(
+ formatv("Total requested size {:x} for graph {} exceeds address space",
+ SegsSizes->total(), G.getName())));
+ return;
+ }
+
+ Mapper->reserve(
+ SegsSizes->total(),
+ [this, &G, BL = std::move(BL), OnAllocated = std::move(OnAllocated)](
+ Expected<ExecutorAddrRange> Result) mutable {
+ if (!Result) {
+ return OnAllocated(Result.takeError());
+ }
+
+ auto NextSegAddr = Result->Start;
+
+ std::vector<MemoryMapper::AllocInfo::SegInfo> SegInfos;
+
+ for (auto &KV : BL.segments()) {
+ auto &AG = KV.first;
+ auto &Seg = KV.second;
+
+ auto TotalSize = Seg.ContentSize + Seg.ZeroFillSize;
+
+ Seg.Addr = NextSegAddr;
+ Seg.WorkingMem = Mapper->prepare(NextSegAddr, TotalSize);
+
+ NextSegAddr += alignTo(TotalSize, Mapper->getPageSize());
+
+ MemoryMapper::AllocInfo::SegInfo SI;
+ SI.Offset = Seg.Addr - Result->Start;
+ SI.ContentSize = Seg.ContentSize;
+ SI.ZeroFillSize = Seg.ZeroFillSize;
+        SI.Prot = toSysMemoryProtectionFlags(AG.getMemProt());
+ SI.WorkingMem = Seg.WorkingMem;
+
+ SegInfos.push_back(SI);
+ }
+
+ if (auto Err = BL.apply()) {
+ OnAllocated(std::move(Err));
+ return;
+ }
+
+ OnAllocated(std::make_unique<InFlightAlloc>(*this, G, Result->Start,
+ std::move(SegInfos)));
+ });
+}
+
+void MapperJITLinkMemoryManager::deallocate(
+ std::vector<FinalizedAlloc> Allocs, OnDeallocatedFunction OnDeallocated) {
+ std::vector<ExecutorAddr> Bases;
+ Bases.reserve(Allocs.size());
+ for (auto &FA : Allocs) {
+ Bases.push_back(FA.getAddress());
+ FA.release();
+ }
+ Mapper->release(Bases, std::move(OnDeallocated));
+}
+
+} // end namespace orc
+} // end namespace llvm
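Usage sketch (not part of the change above), assuming the headers shown and the InProcessMemoryMapper::Create() factory added in MemoryMapper.cpp below; error handling is left to the caller, and any other MemoryMapper, such as the SharedMemoryMapper below, can be substituted:

    #include "llvm/ExecutionEngine/Orc/MapperJITLinkMemoryManager.h"
    #include "llvm/ExecutionEngine/Orc/MemoryMapper.h"

    #include <memory>

    using namespace llvm;

    // Build a JITLink memory manager backed by a MemoryMapper. The result can
    // be handed to an ObjectLinkingLayer in place of InProcessMemoryManager.
    static Expected<std::unique_ptr<orc::MapperJITLinkMemoryManager>>
    createMapperBackedMemoryManager() {
      auto Mapper = orc::InProcessMemoryMapper::Create();
      if (!Mapper)
        return Mapper.takeError();
      return std::make_unique<orc::MapperJITLinkMemoryManager>(
          std::move(*Mapper));
    }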
diff --git a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
index 8b3fbd7117e2..ca3f64b8a409 100644
--- a/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MemoryMapper.cpp
@@ -8,11 +8,33 @@
#include "llvm/ExecutionEngine/Orc/MemoryMapper.h"
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/Support/WindowsError.h"
+
+#if defined(LLVM_ON_UNIX)
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#elif defined(_WIN32)
+#include <windows.h>
+#endif
+
namespace llvm {
namespace orc {
MemoryMapper::~MemoryMapper() {}
+InProcessMemoryMapper::InProcessMemoryMapper(size_t PageSize)
+ : PageSize(PageSize) {}
+
+Expected<std::unique_ptr<InProcessMemoryMapper>>
+InProcessMemoryMapper::Create() {
+ auto PageSize = sys::Process::getPageSize();
+ if (!PageSize)
+ return PageSize.takeError();
+ return std::make_unique<InProcessMemoryMapper>(*PageSize);
+}
+
void InProcessMemoryMapper::reserve(size_t NumBytes,
OnReservedFunction OnReserved) {
std::error_code EC;
@@ -147,6 +169,238 @@ InProcessMemoryMapper::~InProcessMemoryMapper() {
cantFail(F.get());
}
+// SharedMemoryMapper
+
+SharedMemoryMapper::SharedMemoryMapper(ExecutorProcessControl &EPC,
+ SymbolAddrs SAs, size_t PageSize)
+ : EPC(EPC), SAs(SAs), PageSize(PageSize) {}
+
+Expected<std::unique_ptr<SharedMemoryMapper>>
+SharedMemoryMapper::Create(ExecutorProcessControl &EPC, SymbolAddrs SAs) {
+ auto PageSize = sys::Process::getPageSize();
+ if (!PageSize)
+ return PageSize.takeError();
+
+ return std::make_unique<SharedMemoryMapper>(EPC, SAs, *PageSize);
+}
+
+void SharedMemoryMapper::reserve(size_t NumBytes,
+ OnReservedFunction OnReserved) {
+#if defined(LLVM_ON_UNIX) || defined(_WIN32)
+
+ EPC.callSPSWrapperAsync<
+ rt::SPSExecutorSharedMemoryMapperServiceReserveSignature>(
+ SAs.Reserve,
+ [this, NumBytes, OnReserved = std::move(OnReserved)](
+ Error SerializationErr,
+ Expected<std::pair<ExecutorAddr, std::string>> Result) mutable {
+ if (SerializationErr) {
+ cantFail(Result.takeError());
+ return OnReserved(std::move(SerializationErr));
+ }
+
+ if (!Result)
+ return OnReserved(Result.takeError());
+
+ ExecutorAddr RemoteAddr;
+ std::string SharedMemoryName;
+ std::tie(RemoteAddr, SharedMemoryName) = std::move(*Result);
+
+ void *LocalAddr = nullptr;
+
+#if defined(LLVM_ON_UNIX)
+
+ int SharedMemoryFile = shm_open(SharedMemoryName.c_str(), O_RDWR, 0700);
+ if (SharedMemoryFile < 0) {
+ return OnReserved(errorCodeToError(
+ std::error_code(errno, std::generic_category())));
+ }
+
+          // This prevents other processes from accessing it by name.
+ shm_unlink(SharedMemoryName.c_str());
+
+ LocalAddr = mmap(nullptr, NumBytes, PROT_READ | PROT_WRITE, MAP_SHARED,
+ SharedMemoryFile, 0);
+ if (LocalAddr == MAP_FAILED) {
+ return OnReserved(errorCodeToError(
+ std::error_code(errno, std::generic_category())));
+ }
+
+ close(SharedMemoryFile);
+
+#elif defined(_WIN32)
+
+ std::wstring WideSharedMemoryName(SharedMemoryName.begin(),
+ SharedMemoryName.end());
+ HANDLE SharedMemoryFile = OpenFileMappingW(
+ FILE_MAP_ALL_ACCESS, FALSE, WideSharedMemoryName.c_str());
+ if (!SharedMemoryFile)
+ return OnReserved(errorCodeToError(mapWindowsError(GetLastError())));
+
+ LocalAddr =
+ MapViewOfFile(SharedMemoryFile, FILE_MAP_ALL_ACCESS, 0, 0, 0);
+ if (!LocalAddr) {
+ CloseHandle(SharedMemoryFile);
+ return OnReserved(errorCodeToError(mapWindowsError(GetLastError())));
+ }
+
+ CloseHandle(SharedMemoryFile);
+
+#endif
+ {
+ std::lock_guard<std::mutex> Lock(Mutex);
+ Reservations.insert({RemoteAddr, {LocalAddr, NumBytes}});
+ }
+
+ OnReserved(ExecutorAddrRange(RemoteAddr, NumBytes));
+ },
+ SAs.Instance, static_cast<uint64_t>(NumBytes));
+
+#else
+ OnReserved(make_error<StringError>(
+ "SharedMemoryMapper is not supported on this platform yet",
+ inconvertibleErrorCode()));
+#endif
+}
+
+char *SharedMemoryMapper::prepare(ExecutorAddr Addr, size_t ContentSize) {
+ auto R = Reservations.upper_bound(Addr);
+ assert(R != Reservations.begin() && "Attempt to prepare unknown range");
+ R--;
+
+ ExecutorAddrDiff Offset = Addr - R->first;
+
+ return static_cast<char *>(R->second.LocalAddr) + Offset;
+}
+
+void SharedMemoryMapper::initialize(MemoryMapper::AllocInfo &AI,
+ OnInitializedFunction OnInitialized) {
+ auto Reservation = Reservations.find(AI.MappingBase);
+ assert(Reservation != Reservations.end() &&
+ "Attempt to initialize unreserved range");
+
+ tpctypes::SharedMemoryFinalizeRequest FR;
+
+ AI.Actions.swap(FR.Actions);
+
+ FR.Segments.reserve(AI.Segments.size());
+
+ for (auto Segment : AI.Segments) {
+ char *Base =
+ static_cast<char *>(Reservation->second.LocalAddr) + Segment.Offset;
+ std::memset(Base + Segment.ContentSize, 0, Segment.ZeroFillSize);
+
+ tpctypes::SharedMemorySegFinalizeRequest SegReq;
+ SegReq.Prot = tpctypes::toWireProtectionFlags(
+ static_cast<sys::Memory::ProtectionFlags>(Segment.Prot));
+ SegReq.Addr = AI.MappingBase + Segment.Offset;
+ SegReq.Size = Segment.ContentSize + Segment.ZeroFillSize;
+
+ FR.Segments.push_back(SegReq);
+ }
+
+ EPC.callSPSWrapperAsync<
+ rt::SPSExecutorSharedMemoryMapperServiceInitializeSignature>(
+ SAs.Initialize,
+ [OnInitialized = std::move(OnInitialized)](
+ Error SerializationErr, Expected<ExecutorAddr> Result) mutable {
+ if (SerializationErr) {
+ cantFail(Result.takeError());
+ return OnInitialized(std::move(SerializationErr));
+ }
+
+ OnInitialized(std::move(Result));
+ },
+ SAs.Instance, AI.MappingBase, std::move(FR));
+}
+
+void SharedMemoryMapper::deinitialize(
+ ArrayRef<ExecutorAddr> Allocations,
+ MemoryMapper::OnDeinitializedFunction OnDeinitialized) {
+ EPC.callSPSWrapperAsync<
+ rt::SPSExecutorSharedMemoryMapperServiceDeinitializeSignature>(
+ SAs.Deinitialize,
+ [OnDeinitialized = std::move(OnDeinitialized)](Error SerializationErr,
+ Error Result) mutable {
+ if (SerializationErr) {
+ cantFail(std::move(Result));
+ return OnDeinitialized(std::move(SerializationErr));
+ }
+
+ OnDeinitialized(std::move(Result));
+ },
+ SAs.Instance, Allocations);
+}
+
+void SharedMemoryMapper::release(ArrayRef<ExecutorAddr> Bases,
+ OnReleasedFunction OnReleased) {
+#if defined(LLVM_ON_UNIX) || defined(_WIN32)
+ Error Err = Error::success();
+
+ {
+ std::lock_guard<std::mutex> Lock(Mutex);
+
+ for (auto Base : Bases) {
+
+#if defined(LLVM_ON_UNIX)
+
+ if (munmap(Reservations[Base].LocalAddr, Reservations[Base].Size) != 0)
+ Err = joinErrors(std::move(Err), errorCodeToError(std::error_code(
+ errno, std::generic_category())));
+
+#elif defined(_WIN32)
+
+ if (!UnmapViewOfFile(Reservations[Base].LocalAddr))
+        Err = joinErrors(std::move(Err),
+                         errorCodeToError(mapWindowsError(GetLastError())));
+
+#endif
+
+ Reservations.erase(Base);
+ }
+ }
+
+ EPC.callSPSWrapperAsync<
+ rt::SPSExecutorSharedMemoryMapperServiceReleaseSignature>(
+ SAs.Release,
+ [OnReleased = std::move(OnReleased),
+ Err = std::move(Err)](Error SerializationErr, Error Result) mutable {
+ if (SerializationErr) {
+ cantFail(std::move(Result));
+ return OnReleased(
+ joinErrors(std::move(Err), std::move(SerializationErr)));
+ }
+
+ return OnReleased(joinErrors(std::move(Err), std::move(Result)));
+ },
+ SAs.Instance, Bases);
+#else
+ OnReleased(make_error<StringError>(
+ "SharedMemoryMapper is not supported on this platform yet",
+ inconvertibleErrorCode()));
+#endif
+}
+
+SharedMemoryMapper::~SharedMemoryMapper() {
+ std::vector<ExecutorAddr> ReservationAddrs;
+ if (!Reservations.empty()) {
+ std::lock_guard<std::mutex> Lock(Mutex);
+ {
+ ReservationAddrs.reserve(Reservations.size());
+ for (const auto &R : Reservations) {
+ ReservationAddrs.push_back(R.first);
+ }
+ }
+ }
+
+ std::promise<MSVCPError> P;
+ auto F = P.get_future();
+ release(ReservationAddrs, [&](Error Err) { P.set_value(std::move(Err)); });
+ // FIXME: Release can actually fail. The error should be propagated.
+ // Meanwhile, a better option is to explicitly call release().
+ cantFail(F.get());
+}
+
} // namespace orc
} // namespace llvm
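Controller-side construction sketch (not part of the change above). The SymbolAddrs fields and the rt:: wrapper names come from this patch; ExecutorProcessControl::getBootstrapSymbols is assumed to be the existing lookup helper used by the other EPCGeneric* clients:

    #include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h"
    #include "llvm/ExecutionEngine/Orc/MemoryMapper.h"
    #include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"

    using namespace llvm;
    using namespace llvm::orc;

    // Resolve the executor-side service instance and wrapper functions, then
    // create the mapper that calls them through the EPC.
    static Expected<std::unique_ptr<SharedMemoryMapper>>
    createSharedMemoryMapper(ExecutorProcessControl &EPC) {
      SharedMemoryMapper::SymbolAddrs SAs;
      if (auto Err = EPC.getBootstrapSymbols(
              {{SAs.Instance, rt::ExecutorSharedMemoryMapperServiceInstanceName},
               {SAs.Reserve,
                rt::ExecutorSharedMemoryMapperServiceReserveWrapperName},
               {SAs.Initialize,
                rt::ExecutorSharedMemoryMapperServiceInitializeWrapperName},
               {SAs.Deinitialize,
                rt::ExecutorSharedMemoryMapperServiceDeinitializeWrapperName},
               {SAs.Release,
                rt::ExecutorSharedMemoryMapperServiceReleaseWrapperName}}))
        return std::move(Err);
      return SharedMemoryMapper::Create(EPC, SAs);
    }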
diff --git a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
index 5eae33121eb9..dfdd846c46a7 100644
--- a/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Shared/OrcRTBridge.cpp
@@ -18,6 +18,7 @@ const char *SimpleExecutorDylibManagerOpenWrapperName =
"__llvm_orc_SimpleExecutorDylibManager_open_wrapper";
const char *SimpleExecutorDylibManagerLookupWrapperName =
"__llvm_orc_SimpleExecutorDylibManager_lookup_wrapper";
+
const char *SimpleExecutorMemoryManagerInstanceName =
"__llvm_orc_SimpleExecutorMemoryManager_Instance";
const char *SimpleExecutorMemoryManagerReserveWrapperName =
@@ -26,6 +27,18 @@ const char *SimpleExecutorMemoryManagerFinalizeWrapperName =
"__llvm_orc_SimpleExecutorMemoryManager_finalize_wrapper";
const char *SimpleExecutorMemoryManagerDeallocateWrapperName =
"__llvm_orc_SimpleExecutorMemoryManager_deallocate_wrapper";
+
+const char *ExecutorSharedMemoryMapperServiceInstanceName =
+ "__llvm_orc_ExecutorSharedMemoryMapperService_Instance";
+const char *ExecutorSharedMemoryMapperServiceReserveWrapperName =
+ "__llvm_orc_ExecutorSharedMemoryMapperService_Reserve";
+const char *ExecutorSharedMemoryMapperServiceInitializeWrapperName =
+ "__llvm_orc_ExecutorSharedMemoryMapperService_Initialize";
+const char *ExecutorSharedMemoryMapperServiceDeinitializeWrapperName =
+ "__llvm_orc_ExecutorSharedMemoryMapperService_Deinitialize";
+const char *ExecutorSharedMemoryMapperServiceReleaseWrapperName =
+ "__llvm_orc_ExecutorSharedMemoryMapperService_Release";
+
const char *MemoryWriteUInt8sWrapperName =
"__llvm_orc_bootstrap_mem_write_uint8s_wrapper";
const char *MemoryWriteUInt16sWrapperName =
@@ -36,10 +49,12 @@ const char *MemoryWriteUInt64sWrapperName =
"__llvm_orc_bootstrap_mem_write_uint64s_wrapper";
const char *MemoryWriteBuffersWrapperName =
"__llvm_orc_bootstrap_mem_write_buffers_wrapper";
+
const char *RegisterEHFrameSectionWrapperName =
"__llvm_orc_bootstrap_register_ehframe_section_wrapper";
const char *DeregisterEHFrameSectionWrapperName =
"__llvm_orc_bootstrap_deregister_ehframe_section_wrapper";
+
const char *RunAsMainWrapperName = "__llvm_orc_bootstrap_run_as_main_wrapper";
} // end namespace rt
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp
new file mode 100644
index 000000000000..6c9f099061ae
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.cpp
@@ -0,0 +1,341 @@
+//===---------- ExecutorSharedMemoryMapperService.cpp -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h"
+
+#include "llvm/ExecutionEngine/Orc/Shared/OrcRTBridge.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/WindowsError.h"
+
+#include <sstream>
+
+#if defined(LLVM_ON_UNIX)
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#endif
+
+#if defined(_WIN32)
+static DWORD getWindowsProtectionFlags(unsigned Flags) {
+ switch (Flags & llvm::sys::Memory::MF_RWE_MASK) {
+ case llvm::sys::Memory::MF_READ:
+ return PAGE_READONLY;
+ case llvm::sys::Memory::MF_WRITE:
+ // Note: PAGE_WRITE is not supported by VirtualProtect
+ return PAGE_READWRITE;
+ case llvm::sys::Memory::MF_READ | llvm::sys::Memory::MF_WRITE:
+ return PAGE_READWRITE;
+ case llvm::sys::Memory::MF_READ | llvm::sys::Memory::MF_EXEC:
+ return PAGE_EXECUTE_READ;
+ case llvm::sys::Memory::MF_READ | llvm::sys::Memory::MF_WRITE |
+ llvm::sys::Memory::MF_EXEC:
+ return PAGE_EXECUTE_READWRITE;
+ case llvm::sys::Memory::MF_EXEC:
+ return PAGE_EXECUTE;
+ default:
+ llvm_unreachable("Illegal memory protection flag specified!");
+ }
+ // Provide a default return value as required by some compilers.
+ return PAGE_NOACCESS;
+}
+#endif
+
+namespace llvm {
+namespace orc {
+namespace rt_bootstrap {
+
+Expected<std::pair<ExecutorAddr, std::string>>
+ExecutorSharedMemoryMapperService::reserve(uint64_t Size) {
+#if defined(LLVM_ON_UNIX) || defined(_WIN32)
+
+#if defined(LLVM_ON_UNIX)
+
+ std::string SharedMemoryName;
+ {
+ std::stringstream SharedMemoryNameStream;
+ SharedMemoryNameStream << "/jitlink_" << sys::Process::getProcessId() << '_'
+ << (++SharedMemoryCount);
+ SharedMemoryName = SharedMemoryNameStream.str();
+ }
+
+ int SharedMemoryFile =
+ shm_open(SharedMemoryName.c_str(), O_RDWR | O_CREAT | O_EXCL, 0700);
+ if (SharedMemoryFile < 0)
+ return errorCodeToError(std::error_code(errno, std::generic_category()));
+
+  // The shared memory object is created with size 0; grow it to Size.
+ if (ftruncate(SharedMemoryFile, Size) < 0)
+ return errorCodeToError(std::error_code(errno, std::generic_category()));
+
+ void *Addr = mmap(nullptr, Size, PROT_NONE, MAP_SHARED, SharedMemoryFile, 0);
+ if (Addr == MAP_FAILED)
+ return errorCodeToError(std::error_code(errno, std::generic_category()));
+
+ close(SharedMemoryFile);
+
+#elif defined(_WIN32)
+
+ std::string SharedMemoryName;
+ {
+ std::stringstream SharedMemoryNameStream;
+ SharedMemoryNameStream << "jitlink_" << sys::Process::getProcessId() << '_'
+ << (++SharedMemoryCount);
+ SharedMemoryName = SharedMemoryNameStream.str();
+ }
+
+ std::wstring WideSharedMemoryName(SharedMemoryName.begin(),
+ SharedMemoryName.end());
+ HANDLE SharedMemoryFile = CreateFileMappingW(
+ INVALID_HANDLE_VALUE, NULL, PAGE_EXECUTE_READWRITE, Size >> 32,
+ Size & 0xffffffff, WideSharedMemoryName.c_str());
+ if (!SharedMemoryFile)
+ return errorCodeToError(mapWindowsError(GetLastError()));
+
+ void *Addr = MapViewOfFile(SharedMemoryFile,
+ FILE_MAP_ALL_ACCESS | FILE_MAP_EXECUTE, 0, 0, 0);
+ if (!Addr) {
+ CloseHandle(SharedMemoryFile);
+ return errorCodeToError(mapWindowsError(GetLastError()));
+ }
+
+#endif
+
+ {
+ std::lock_guard<std::mutex> Lock(Mutex);
+ Reservations[Addr].Size = Size;
+#if defined(_WIN32)
+ Reservations[Addr].SharedMemoryFile = SharedMemoryFile;
+#endif
+ }
+
+ return std::make_pair(ExecutorAddr::fromPtr(Addr),
+ std::move(SharedMemoryName));
+#else
+ return make_error<StringError>(
+ "SharedMemoryMapper is not supported on this platform yet",
+ inconvertibleErrorCode());
+#endif
+}
+
+Expected<ExecutorAddr> ExecutorSharedMemoryMapperService::initialize(
+ ExecutorAddr Reservation, tpctypes::SharedMemoryFinalizeRequest &FR) {
+#if defined(LLVM_ON_UNIX) || defined(_WIN32)
+
+ ExecutorAddr MinAddr(~0ULL);
+
+ // Contents are already in place
+ for (auto &Segment : FR.Segments) {
+ if (Segment.Addr < MinAddr)
+ MinAddr = Segment.Addr;
+
+#if defined(LLVM_ON_UNIX)
+
+ int NativeProt = 0;
+ if (Segment.Prot & tpctypes::WPF_Read)
+ NativeProt |= PROT_READ;
+ if (Segment.Prot & tpctypes::WPF_Write)
+ NativeProt |= PROT_WRITE;
+ if (Segment.Prot & tpctypes::WPF_Exec)
+ NativeProt |= PROT_EXEC;
+
+ if (mprotect(Segment.Addr.toPtr<void *>(), Segment.Size, NativeProt))
+ return errorCodeToError(std::error_code(errno, std::generic_category()));
+
+#elif defined(_WIN32)
+
+ DWORD NativeProt =
+ getWindowsProtectionFlags(fromWireProtectionFlags(Segment.Prot));
+
+ if (!VirtualProtect(Segment.Addr.toPtr<void *>(), Segment.Size, NativeProt,
+ &NativeProt))
+ return errorCodeToError(mapWindowsError(GetLastError()));
+
+#endif
+
+ if (Segment.Prot & tpctypes::WPF_Exec)
+ sys::Memory::InvalidateInstructionCache(Segment.Addr.toPtr<void *>(),
+ Segment.Size);
+ }
+
+  // Run finalization actions and get the deinitialization action list.
+ auto DeinitializeActions = shared::runFinalizeActions(FR.Actions);
+ if (!DeinitializeActions) {
+ return DeinitializeActions.takeError();
+ }
+
+ {
+ std::lock_guard<std::mutex> Lock(Mutex);
+ Allocations[MinAddr].DeinitializationActions =
+ std::move(*DeinitializeActions);
+ Reservations[Reservation.toPtr<void *>()].Allocations.push_back(MinAddr);
+ }
+
+ return MinAddr;
+
+#else
+ return make_error<StringError>(
+ "SharedMemoryMapper is not supported on this platform yet",
+ inconvertibleErrorCode());
+#endif
+}
+
+Error ExecutorSharedMemoryMapperService::deinitialize(
+ const std::vector<ExecutorAddr> &Bases) {
+ Error AllErr = Error::success();
+
+ {
+ std::lock_guard<std::mutex> Lock(Mutex);
+
+ for (auto Base : Bases) {
+ if (Error Err = shared::runDeallocActions(
+ Allocations[Base].DeinitializationActions)) {
+ AllErr = joinErrors(std::move(AllErr), std::move(Err));
+ }
+
+ Allocations.erase(Base);
+ }
+ }
+
+ return AllErr;
+}
+
+Error ExecutorSharedMemoryMapperService::release(
+ const std::vector<ExecutorAddr> &Bases) {
+#if defined(LLVM_ON_UNIX) || defined(_WIN32)
+ Error Err = Error::success();
+
+ for (auto Base : Bases) {
+ std::vector<ExecutorAddr> AllocAddrs;
+ size_t Size;
+
+#if defined(_WIN32)
+ HANDLE SharedMemoryFile;
+#endif
+
+ {
+ std::lock_guard<std::mutex> Lock(Mutex);
+ auto &R = Reservations[Base.toPtr<void *>()];
+ Size = R.Size;
+
+#if defined(_WIN32)
+ SharedMemoryFile = R.SharedMemoryFile;
+#endif
+
+ AllocAddrs.swap(R.Allocations);
+ }
+
+    // Deinitialize sub-allocations.
+ if (Error E = deinitialize(AllocAddrs))
+ Err = joinErrors(std::move(Err), std::move(E));
+
+#if defined(LLVM_ON_UNIX)
+
+ if (munmap(Base.toPtr<void *>(), Size) != 0)
+ Err = joinErrors(std::move(Err), errorCodeToError(std::error_code(
+ errno, std::generic_category())));
+
+#elif defined(_WIN32)
+
+ if (!UnmapViewOfFile(Base.toPtr<void *>()))
+ Err = joinErrors(std::move(Err),
+ errorCodeToError(mapWindowsError(GetLastError())));
+
+ CloseHandle(SharedMemoryFile);
+
+#endif
+
+ std::lock_guard<std::mutex> Lock(Mutex);
+ Reservations.erase(Base.toPtr<void *>());
+ }
+
+ return Err;
+#else
+ return make_error<StringError>(
+ "SharedMemoryMapper is not supported on this platform yet",
+ inconvertibleErrorCode());
+#endif
+}
+
+Error ExecutorSharedMemoryMapperService::shutdown() {
+ std::vector<ExecutorAddr> ReservationAddrs;
+ if (!Reservations.empty()) {
+ std::lock_guard<std::mutex> Lock(Mutex);
+ {
+ ReservationAddrs.reserve(Reservations.size());
+ for (const auto &R : Reservations) {
+ ReservationAddrs.push_back(ExecutorAddr::fromPtr(R.getFirst()));
+ }
+ }
+ }
+  return release(ReservationAddrs);
+}
+
+void ExecutorSharedMemoryMapperService::addBootstrapSymbols(
+ StringMap<ExecutorAddr> &M) {
+ M[rt::ExecutorSharedMemoryMapperServiceInstanceName] =
+ ExecutorAddr::fromPtr(this);
+ M[rt::ExecutorSharedMemoryMapperServiceReserveWrapperName] =
+ ExecutorAddr::fromPtr(&reserveWrapper);
+ M[rt::ExecutorSharedMemoryMapperServiceInitializeWrapperName] =
+ ExecutorAddr::fromPtr(&initializeWrapper);
+ M[rt::ExecutorSharedMemoryMapperServiceDeinitializeWrapperName] =
+ ExecutorAddr::fromPtr(&deinitializeWrapper);
+ M[rt::ExecutorSharedMemoryMapperServiceReleaseWrapperName] =
+ ExecutorAddr::fromPtr(&releaseWrapper);
+}
+
+llvm::orc::shared::CWrapperFunctionResult
+ExecutorSharedMemoryMapperService::reserveWrapper(const char *ArgData,
+ size_t ArgSize) {
+ return shared::WrapperFunction<
+ rt::SPSExecutorSharedMemoryMapperServiceReserveSignature>::
+ handle(ArgData, ArgSize,
+ shared::makeMethodWrapperHandler(
+ &ExecutorSharedMemoryMapperService::reserve))
+ .release();
+}
+
+llvm::orc::shared::CWrapperFunctionResult
+ExecutorSharedMemoryMapperService::initializeWrapper(const char *ArgData,
+ size_t ArgSize) {
+ return shared::WrapperFunction<
+ rt::SPSExecutorSharedMemoryMapperServiceInitializeSignature>::
+ handle(ArgData, ArgSize,
+ shared::makeMethodWrapperHandler(
+ &ExecutorSharedMemoryMapperService::initialize))
+ .release();
+}
+
+llvm::orc::shared::CWrapperFunctionResult
+ExecutorSharedMemoryMapperService::deinitializeWrapper(const char *ArgData,
+ size_t ArgSize) {
+ return shared::WrapperFunction<
+ rt::SPSExecutorSharedMemoryMapperServiceDeinitializeSignature>::
+ handle(ArgData, ArgSize,
+ shared::makeMethodWrapperHandler(
+ &ExecutorSharedMemoryMapperService::deinitialize))
+ .release();
+}
+
+llvm::orc::shared::CWrapperFunctionResult
+ExecutorSharedMemoryMapperService::releaseWrapper(const char *ArgData,
+ size_t ArgSize) {
+ return shared::WrapperFunction<
+ rt::SPSExecutorSharedMemoryMapperServiceReleaseSignature>::
+ handle(ArgData, ArgSize,
+ shared::makeMethodWrapperHandler(
+ &ExecutorSharedMemoryMapperService::release))
+ .release();
+}
+
+} // namespace rt_bootstrap
+} // end namespace orc
+} // end namespace llvm
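Executor-side sketch (not part of the change above): an in-process round trip through the new service, as a unit test might do it; in a real session these entry points are reached through the SPS wrappers registered by addBootstrapSymbols():

    #include "llvm/ExecutionEngine/Orc/TargetProcess/ExecutorSharedMemoryMapperService.h"

    using namespace llvm;
    using namespace llvm::orc;

    // Reserve a block of shared memory and release it again. The returned pair
    // carries the executor address of the mapping and the shared memory name
    // that a controller-side SharedMemoryMapper would open and map locally.
    static Error reserveAndRelease(
        rt_bootstrap::ExecutorSharedMemoryMapperService &Service,
        uint64_t Size) {
      auto Reservation = Service.reserve(Size);
      if (!Reservation)
        return Reservation.takeError();
      return Service.release({Reservation->first});
    }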
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
index 7cadf3bb51a7..c848dd65fa7e 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/SimpleExecutorMemoryManager.cpp
@@ -126,7 +126,8 @@ Error SimpleExecutorMemoryManager::finalize(tpctypes::FinalizeRequest &FR) {
inconvertibleErrorCode()));
char *Mem = Seg.Addr.toPtr<char *>();
- memcpy(Mem, Seg.Content.data(), Seg.Content.size());
+ if (!Seg.Content.empty())
+ memcpy(Mem, Seg.Content.data(), Seg.Content.size());
memset(Mem + Seg.Content.size(), 0, Seg.Size - Seg.Content.size());
assert(Seg.Size <= std::numeric_limits<size_t>::max());
if (auto EC = sys::Memory::protectMappedMemory(
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 574d9174bebf..cee4cddab5e8 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -1453,7 +1453,36 @@ OpenMPIRBuilder::createTask(const LocationDescription &Loc,
InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
BodyGenCB(TaskAllocaIP, TaskBodyIP);
- Builder.SetInsertPoint(TaskExitBB);
+ Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
+
+ return Builder.saveIP();
+}
+
+OpenMPIRBuilder::InsertPointTy
+OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
+ InsertPointTy AllocaIP,
+ BodyGenCallbackTy BodyGenCB) {
+ if (!updateToLocation(Loc))
+ return InsertPointTy();
+
+ uint32_t SrcLocStrSize;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
+ Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
+ Value *ThreadID = getOrCreateThreadID(Ident);
+
+ // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
+ Function *TaskgroupFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
+ Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
+
+ BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
+ BodyGenCB(AllocaIP, Builder.saveIP());
+
+ Builder.SetInsertPoint(TaskgroupExitBB);
+ // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
+ Function *EndTaskgroupFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
+ Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
return Builder.saveIP();
}
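Hypothetical front-end use of the new createTaskgroup entry point (not part of the change above); Builder, OMPBuilder, AllocaIP and DL stand in for state the surrounding code generator already has, and only the createTaskgroup call and the body-callback shape are taken from the patch:

    #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    // Emit a taskgroup region: the callback body is generated between the
    // __kmpc_taskgroup and __kmpc_end_taskgroup calls emitted by the builder.
    static void emitTaskgroup(IRBuilder<> &Builder, OpenMPIRBuilder &OMPBuilder,
                              OpenMPIRBuilder::InsertPointTy AllocaIP,
                              DebugLoc DL) {
      OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DL);
      Builder.restoreIP(OMPBuilder.createTaskgroup(
          Loc, AllocaIP,
          [&](OpenMPIRBuilder::InsertPointTy /*AllocaIP*/,
              OpenMPIRBuilder::InsertPointTy CodeGenIP) {
            Builder.restoreIP(CodeGenIP);
            // Emit the taskgroup region body here.
          }));
    }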
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 53df94366760..d4138133721e 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -354,6 +354,8 @@ Function *Function::createWithDefaultAttr(FunctionType *Ty,
B.addAttribute("frame-pointer", "all");
break;
}
+ if (M->getModuleFlag("function_return_thunk_extern"))
+ B.addAttribute(Attribute::FnRetThunkExtern);
F->addFnAttrs(B);
return F;
}
diff --git a/llvm/lib/IR/GCStrategy.cpp b/llvm/lib/IR/GCStrategy.cpp
index f3bc5b74f8fd..5833dc26c57e 100644
--- a/llvm/lib/IR/GCStrategy.cpp
+++ b/llvm/lib/IR/GCStrategy.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/GCStrategy.h"
+#include "llvm/ADT/Twine.h"
using namespace llvm;
@@ -32,7 +33,7 @@ std::unique_ptr<GCStrategy> llvm::getGCStrategy(const StringRef Name) {
const std::string error =
std::string("unsupported GC: ") + Name.str() +
" (did you remember to link and initialize the library?)";
- report_fatal_error(error);
+ report_fatal_error(Twine(error));
} else
- report_fatal_error(std::string("unsupported GC: ") + Name.str());
+ report_fatal_error(Twine(std::string("unsupported GC: ") + Name.str()));
}
diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp
index 3265050261c8..51a22897babd 100644
--- a/llvm/lib/IR/Globals.cpp
+++ b/llvm/lib/IR/Globals.cpp
@@ -316,32 +316,38 @@ bool GlobalObject::canIncreaseAlignment() const {
return true;
}
+template <typename Operation>
static const GlobalObject *
-findBaseObject(const Constant *C, DenseSet<const GlobalAlias *> &Aliases) {
- if (auto *GO = dyn_cast<GlobalObject>(C))
+findBaseObject(const Constant *C, DenseSet<const GlobalAlias *> &Aliases,
+ const Operation &Op) {
+ if (auto *GO = dyn_cast<GlobalObject>(C)) {
+ Op(*GO);
return GO;
- if (auto *GA = dyn_cast<GlobalAlias>(C))
+ }
+ if (auto *GA = dyn_cast<GlobalAlias>(C)) {
+ Op(*GA);
if (Aliases.insert(GA).second)
- return findBaseObject(GA->getOperand(0), Aliases);
+ return findBaseObject(GA->getOperand(0), Aliases, Op);
+ }
if (auto *CE = dyn_cast<ConstantExpr>(C)) {
switch (CE->getOpcode()) {
case Instruction::Add: {
- auto *LHS = findBaseObject(CE->getOperand(0), Aliases);
- auto *RHS = findBaseObject(CE->getOperand(1), Aliases);
+ auto *LHS = findBaseObject(CE->getOperand(0), Aliases, Op);
+ auto *RHS = findBaseObject(CE->getOperand(1), Aliases, Op);
if (LHS && RHS)
return nullptr;
return LHS ? LHS : RHS;
}
case Instruction::Sub: {
- if (findBaseObject(CE->getOperand(1), Aliases))
+ if (findBaseObject(CE->getOperand(1), Aliases, Op))
return nullptr;
- return findBaseObject(CE->getOperand(0), Aliases);
+ return findBaseObject(CE->getOperand(0), Aliases, Op);
}
case Instruction::IntToPtr:
case Instruction::PtrToInt:
case Instruction::BitCast:
case Instruction::GetElementPtr:
- return findBaseObject(CE->getOperand(0), Aliases);
+ return findBaseObject(CE->getOperand(0), Aliases, Op);
default:
break;
}
@@ -351,7 +357,7 @@ findBaseObject(const Constant *C, DenseSet<const GlobalAlias *> &Aliases) {
const GlobalObject *GlobalValue::getAliaseeObject() const {
DenseSet<const GlobalAlias *> Aliases;
- return findBaseObject(this, Aliases);
+ return findBaseObject(this, Aliases, [](const GlobalValue &) {});
}
bool GlobalValue::isAbsoluteSymbolRef() const {
@@ -544,7 +550,7 @@ void GlobalAlias::setAliasee(Constant *Aliasee) {
const GlobalObject *GlobalAlias::getAliaseeObject() const {
DenseSet<const GlobalAlias *> Aliases;
- return findBaseObject(getOperand(0), Aliases);
+ return findBaseObject(getOperand(0), Aliases, [](const GlobalValue &) {});
}
//===----------------------------------------------------------------------===//
@@ -577,5 +583,12 @@ void GlobalIFunc::eraseFromParent() {
const Function *GlobalIFunc::getResolverFunction() const {
DenseSet<const GlobalAlias *> Aliases;
- return dyn_cast<Function>(findBaseObject(getResolver(), Aliases));
+ return dyn_cast<Function>(
+ findBaseObject(getResolver(), Aliases, [](const GlobalValue &) {}));
+}
+
+void GlobalIFunc::applyAlongResolverPath(
+ function_ref<void(const GlobalValue &)> Op) const {
+ DenseSet<const GlobalAlias *> Aliases;
+ findBaseObject(getResolver(), Aliases, Op);
}
diff --git a/llvm/lib/IR/InlineAsm.cpp b/llvm/lib/IR/InlineAsm.cpp
index c75b1aa7c1d6..088fcfdec742 100644
--- a/llvm/lib/IR/InlineAsm.cpp
+++ b/llvm/lib/IR/InlineAsm.cpp
@@ -93,6 +93,9 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str,
} else if (*I == '=') {
++I;
Type = isOutput;
+ } else if (*I == '!') {
+ ++I;
+ Type = isLabel;
}
if (*I == '*') {
@@ -265,14 +268,14 @@ Error InlineAsm::verify(FunctionType *Ty, StringRef ConstStr) {
return makeStringError("failed to parse constraints");
unsigned NumOutputs = 0, NumInputs = 0, NumClobbers = 0;
- unsigned NumIndirect = 0;
+ unsigned NumIndirect = 0, NumLabels = 0;
for (const ConstraintInfo &Constraint : Constraints) {
switch (Constraint.Type) {
case InlineAsm::isOutput:
- if ((NumInputs-NumIndirect) != 0 || NumClobbers != 0)
- return makeStringError("output constraint occurs after input "
- "or clobber constraint");
+ if ((NumInputs-NumIndirect) != 0 || NumClobbers != 0 || NumLabels != 0)
+ return makeStringError("output constraint occurs after input, "
+ "clobber or label constraint");
if (!Constraint.isIndirect) {
++NumOutputs;
@@ -289,6 +292,13 @@ Error InlineAsm::verify(FunctionType *Ty, StringRef ConstStr) {
case InlineAsm::isClobber:
++NumClobbers;
break;
+ case InlineAsm::isLabel:
+ if (NumClobbers)
+ return makeStringError("label constraint occurs after clobber "
+ "constraint");
+
+ ++NumLabels;
+ break;
}
}
@@ -312,5 +322,7 @@ Error InlineAsm::verify(FunctionType *Ty, StringRef ConstStr) {
if (Ty->getNumParams() != NumInputs)
return makeStringError("number of input constraints does not match number "
"of parameters");
+
+  // We don't have access to labels here; NumLabels will be checked separately.
return Error::success();
}
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index b333f40f3ce9..26171f537244 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -960,15 +960,10 @@ void CallBrInst::init(FunctionType *FTy, Value *Fn, BasicBlock *Fallthrough,
setName(NameStr);
}
-void CallBrInst::updateArgBlockAddresses(unsigned i, BasicBlock *B) {
- assert(getNumIndirectDests() > i && "IndirectDest # out of range for callbr");
- if (BasicBlock *OldBB = getIndirectDest(i)) {
- BlockAddress *Old = BlockAddress::get(OldBB);
- BlockAddress *New = BlockAddress::get(B);
- for (unsigned ArgNo = 0, e = arg_size(); ArgNo != e; ++ArgNo)
- if (dyn_cast<BlockAddress>(getArgOperand(ArgNo)) == Old)
- setArgOperand(ArgNo, New);
- }
+BlockAddress *
+CallBrInst::getBlockAddressForIndirectDest(unsigned DestNo) const {
+ return BlockAddress::get(const_cast<Function *>(getFunction()),
+ getIndirectDest(DestNo));
}
CallBrInst::CallBrInst(const CallBrInst &CBI)
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index 65a9a32ad2c5..c50d6901c9da 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -694,8 +694,10 @@ unsigned BinaryOpIntrinsic::getNoWrapKind() const {
return OverflowingBinaryOperator::NoUnsignedWrap;
}
-const GCStatepointInst *GCProjectionInst::getStatepoint() const {
+const Value *GCProjectionInst::getStatepoint() const {
const Value *Token = getArgOperand(0);
+ if (isa<UndefValue>(Token))
+ return Token;
// This takes care both of relocates for call statepoints and relocates
// on normal path of invoke statepoint.
@@ -714,13 +716,23 @@ const GCStatepointInst *GCProjectionInst::getStatepoint() const {
}
Value *GCRelocateInst::getBasePtr() const {
- if (auto Opt = getStatepoint()->getOperandBundle(LLVMContext::OB_gc_live))
+ auto Statepoint = getStatepoint();
+ if (isa<UndefValue>(Statepoint))
+ return UndefValue::get(Statepoint->getType());
+
+ auto *GCInst = cast<GCStatepointInst>(Statepoint);
+ if (auto Opt = GCInst->getOperandBundle(LLVMContext::OB_gc_live))
return *(Opt->Inputs.begin() + getBasePtrIndex());
- return *(getStatepoint()->arg_begin() + getBasePtrIndex());
+ return *(GCInst->arg_begin() + getBasePtrIndex());
}
Value *GCRelocateInst::getDerivedPtr() const {
- if (auto Opt = getStatepoint()->getOperandBundle(LLVMContext::OB_gc_live))
+ auto *Statepoint = getStatepoint();
+ if (isa<UndefValue>(Statepoint))
+ return UndefValue::get(Statepoint->getType());
+
+ auto *GCInst = cast<GCStatepointInst>(Statepoint);
+ if (auto Opt = GCInst->getOperandBundle(LLVMContext::OB_gc_live))
return *(Opt->Inputs.begin() + getDerivedPtrIndex());
- return *(getStatepoint()->arg_begin() + getDerivedPtrIndex());
+ return *(GCInst->arg_begin() + getDerivedPtrIndex());
}
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 75d02f4c8c82..e3ea256af16d 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -469,6 +469,9 @@ private:
void visitRangeMetadata(Instruction &I, MDNode *Range, Type *Ty);
void visitDereferenceableMetadata(Instruction &I, MDNode *MD);
void visitProfMetadata(Instruction &I, MDNode *MD);
+ void visitCallStackMetadata(MDNode *MD);
+ void visitMemProfMetadata(Instruction &I, MDNode *MD);
+ void visitCallsiteMetadata(Instruction &I, MDNode *MD);
void visitAnnotationMetadata(MDNode *Annotation);
void visitAliasScopeMetadata(const MDNode *MD);
void visitAliasScopeListMetadata(const MDNode *MD);
@@ -1624,8 +1627,10 @@ Verifier::visitModuleFlag(const MDNode *Op,
break;
case Module::Min: {
- Check(mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2)),
- "invalid value for 'min' module flag (expected constant integer)",
+ auto *V = mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2));
+ Check(V && V->getValue().isNonNegative(),
+ "invalid value for 'min' module flag (expected constant non-negative "
+ "integer)",
Op->getOperand(2));
break;
}
@@ -2200,7 +2205,13 @@ bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) {
void Verifier::verifyInlineAsmCall(const CallBase &Call) {
const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand());
unsigned ArgNo = 0;
+ unsigned LabelNo = 0;
for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+ if (CI.Type == InlineAsm::isLabel) {
+ ++LabelNo;
+ continue;
+ }
+
// Only deal with constraints that correspond to call arguments.
if (!CI.hasArg())
continue;
@@ -2222,6 +2233,15 @@ void Verifier::verifyInlineAsmCall(const CallBase &Call) {
ArgNo++;
}
+
+ if (auto *CallBr = dyn_cast<CallBrInst>(&Call)) {
+ Check(LabelNo == CallBr->getNumIndirectDests(),
+ "Number of label constraints does not match number of callbr dests",
+ &Call);
+ } else {
+ Check(LabelNo == 0, "Label constraints can only be used with callbr",
+ &Call);
+ }
}
/// Verify that statepoint intrinsic is well formed.
@@ -2839,25 +2859,6 @@ void Verifier::visitCallBrInst(CallBrInst &CBI) {
Check(CBI.isInlineAsm(), "Callbr is currently only used for asm-goto!", &CBI);
const InlineAsm *IA = cast<InlineAsm>(CBI.getCalledOperand());
Check(!IA->canThrow(), "Unwinding from Callbr is not allowed");
- for (unsigned i = 0, e = CBI.getNumSuccessors(); i != e; ++i)
- Check(CBI.getSuccessor(i)->getType()->isLabelTy(),
- "Callbr successors must all have pointer type!", &CBI);
- for (unsigned i = 0, e = CBI.getNumOperands(); i != e; ++i) {
- Check(i >= CBI.arg_size() || !isa<BasicBlock>(CBI.getOperand(i)),
- "Using an unescaped label as a callbr argument!", &CBI);
- if (isa<BasicBlock>(CBI.getOperand(i)))
- for (unsigned j = i + 1; j != e; ++j)
- Check(CBI.getOperand(i) != CBI.getOperand(j),
- "Duplicate callbr destination!", &CBI);
- }
- {
- SmallPtrSet<BasicBlock *, 4> ArgBBs;
- for (Value *V : CBI.args())
- if (auto *BA = dyn_cast<BlockAddress>(V))
- ArgBBs.insert(BA->getBasicBlock());
- for (BasicBlock *BB : CBI.getIndirectDests())
- Check(ArgBBs.count(BB), "Indirect label missing from arglist.", &CBI);
- }
verifyInlineAsmCall(CBI);
visitTerminator(CBI);
@@ -4489,6 +4490,55 @@ void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) {
}
}
+void Verifier::visitCallStackMetadata(MDNode *MD) {
+ // Call stack metadata should consist of a list of at least 1 constant int
+ // (representing a hash of the location).
+ Check(MD->getNumOperands() >= 1,
+ "call stack metadata should have at least 1 operand", MD);
+
+ for (const auto &Op : MD->operands())
+ Check(mdconst::dyn_extract_or_null<ConstantInt>(Op),
+ "call stack metadata operand should be constant integer", Op);
+}
+
+void Verifier::visitMemProfMetadata(Instruction &I, MDNode *MD) {
+ Check(isa<CallBase>(I), "!memprof metadata should only exist on calls", &I);
+ Check(MD->getNumOperands() >= 1,
+ "!memprof annotations should have at least 1 metadata operand "
+ "(MemInfoBlock)",
+ MD);
+
+ // Check each MIB
+ for (auto &MIBOp : MD->operands()) {
+ MDNode *MIB = dyn_cast<MDNode>(MIBOp);
+ // The first operand of an MIB should be the call stack metadata.
+    // The rest of the operands should be MDString tags, and there should be
+ // at least one.
+ Check(MIB->getNumOperands() >= 2,
+ "Each !memprof MemInfoBlock should have at least 2 operands", MIB);
+
+ // Check call stack metadata (first operand).
+ Check(MIB->getOperand(0) != nullptr,
+ "!memprof MemInfoBlock first operand should not be null", MIB);
+ Check(isa<MDNode>(MIB->getOperand(0)),
+ "!memprof MemInfoBlock first operand should be an MDNode", MIB);
+ MDNode *StackMD = dyn_cast<MDNode>(MIB->getOperand(0));
+ visitCallStackMetadata(StackMD);
+
+ // Check that remaining operands are MDString.
+ Check(std::all_of(MIB->op_begin() + 1, MIB->op_end(),
+ [](const MDOperand &Op) { return isa<MDString>(Op); }),
+ "Not all !memprof MemInfoBlock operands 1 to N are MDString", MIB);
+ }
+}
+
+void Verifier::visitCallsiteMetadata(Instruction &I, MDNode *MD) {
+ Check(isa<CallBase>(I), "!callsite metadata should only exist on calls", &I);
+ // Verify the partial callstack annotated from memprof profiles. This callsite
+ // is a part of a profiled allocation callstack.
+ visitCallStackMetadata(MD);
+}
+
void Verifier::visitAnnotationMetadata(MDNode *Annotation) {
Check(isa<MDTuple>(Annotation), "annotation must be a tuple");
Check(Annotation->getNumOperands() >= 1,
@@ -4735,6 +4785,12 @@ void Verifier::visitInstruction(Instruction &I) {
if (MDNode *MD = I.getMetadata(LLVMContext::MD_prof))
visitProfMetadata(I, MD);
+ if (MDNode *MD = I.getMetadata(LLVMContext::MD_memprof))
+ visitMemProfMetadata(I, MD);
+
+ if (MDNode *MD = I.getMetadata(LLVMContext::MD_callsite))
+ visitCallsiteMetadata(I, MD);
+
if (MDNode *Annotation = I.getMetadata(LLVMContext::MD_annotation))
visitAnnotationMetadata(Annotation);
@@ -5160,14 +5216,13 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
// In all other cases relocate should be tied to the statepoint directly.
// This covers relocates on a normal return path of invoke statepoint and
// relocates of a call statepoint.
- auto Token = Call.getArgOperand(0);
- Check(isa<GCStatepointInst>(Token),
+ auto *Token = Call.getArgOperand(0);
+ Check(isa<GCStatepointInst>(Token) || isa<UndefValue>(Token),
"gc relocate is incorrectly tied to the statepoint", Call, Token);
}
// Verify rest of the relocate arguments.
- const CallBase &StatepointCall =
- *cast<GCRelocateInst>(Call).getStatepoint();
+ const Value &StatepointCall = *cast<GCRelocateInst>(Call).getStatepoint();
// Both the base and derived must be piped through the safepoint.
Value *Base = Call.getArgOperand(1);
@@ -5182,7 +5237,10 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
const uint64_t DerivedIndex = cast<ConstantInt>(Derived)->getZExtValue();
// Check the bounds
- if (auto Opt = StatepointCall.getOperandBundle(LLVMContext::OB_gc_live)) {
+ if (isa<UndefValue>(StatepointCall))
+ break;
+ if (auto Opt = cast<GCStatepointInst>(StatepointCall)
+ .getOperandBundle(LLVMContext::OB_gc_live)) {
Check(BaseIndex < Opt->Inputs.size(),
"gc.relocate: statepoint base index out of bounds", Call);
Check(DerivedIndex < Opt->Inputs.size(),
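For reference (not part of the change above), metadata of the shape the new !memprof and !callsite checks accept, built by hand; the frame ids and the "notcold" tag are placeholders, since the real annotations are produced by the memprof profile matcher:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/InstrTypes.h"
    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/Type.h"

    using namespace llvm;

    // Attach a !callsite call stack (constant-int frame ids) and one !memprof
    // MemInfoBlock (the same call stack followed by an MDString tag) to a call.
    static void annotateCall(CallBase &CB, ArrayRef<uint64_t> StackIds) {
      LLVMContext &Ctx = CB.getContext();
      Type *I64 = Type::getInt64Ty(Ctx);
      SmallVector<Metadata *> Frames;
      for (uint64_t Id : StackIds)
        Frames.push_back(ConstantAsMetadata::get(ConstantInt::get(I64, Id)));
      MDNode *Stack = MDNode::get(Ctx, Frames);
      CB.setMetadata(LLVMContext::MD_callsite, Stack);

      Metadata *MIBOps[] = {Stack, MDString::get(Ctx, "notcold")};
      Metadata *MemProfOps[] = {MDNode::get(Ctx, MIBOps)};
      CB.setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MemProfOps));
    }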
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index 9e89cce8312e..e31faf6422ed 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -1273,14 +1273,19 @@ Error IRLinker::linkModuleFlagsMetadata() {
// First build a map of the existing module flags and requirements.
DenseMap<MDString *, std::pair<MDNode *, unsigned>> Flags;
SmallSetVector<MDNode *, 16> Requirements;
+ SmallVector<unsigned, 0> Mins;
+ DenseSet<MDString *> SeenMin;
for (unsigned I = 0, E = DstModFlags->getNumOperands(); I != E; ++I) {
MDNode *Op = DstModFlags->getOperand(I);
- ConstantInt *Behavior = mdconst::extract<ConstantInt>(Op->getOperand(0));
+ uint64_t Behavior =
+ mdconst::extract<ConstantInt>(Op->getOperand(0))->getZExtValue();
MDString *ID = cast<MDString>(Op->getOperand(1));
- if (Behavior->getZExtValue() == Module::Require) {
+ if (Behavior == Module::Require) {
Requirements.insert(cast<MDNode>(Op->getOperand(2)));
} else {
+ if (Behavior == Module::Min)
+ Mins.push_back(I);
Flags[ID] = std::make_pair(Op, I);
}
}
@@ -1296,6 +1301,7 @@ Error IRLinker::linkModuleFlagsMetadata() {
unsigned DstIndex;
std::tie(DstOp, DstIndex) = Flags.lookup(ID);
unsigned SrcBehaviorValue = SrcBehavior->getZExtValue();
+ SeenMin.insert(ID);
// If this is a requirement, add it and continue.
if (SrcBehaviorValue == Module::Require) {
@@ -1309,6 +1315,10 @@ Error IRLinker::linkModuleFlagsMetadata() {
// If there is no existing flag with this ID, just add it.
if (!DstOp) {
+ if (SrcBehaviorValue == Module::Min) {
+ Mins.push_back(DstModFlags->getNumOperands());
+ SeenMin.erase(ID);
+ }
Flags[ID] = std::make_pair(SrcOp, DstModFlags->getNumOperands());
DstModFlags->addOperand(SrcOp);
continue;
@@ -1362,8 +1372,10 @@ Error IRLinker::linkModuleFlagsMetadata() {
"Expected MDTuple when appending module flags");
if (DstValue->isDistinct())
return dyn_cast<MDTuple>(DstValue);
+ ArrayRef<MDOperand> DstOperands = DstValue->operands();
MDTuple *New = MDTuple::getDistinct(
- DstM.getContext(), SmallVector<Metadata *, 4>(DstValue->operands()));
+ DstM.getContext(),
+ SmallVector<Metadata *, 4>(DstOperands.begin(), DstOperands.end()));
Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New};
MDNode *Flag = MDTuple::getDistinct(DstM.getContext(), FlagOps);
DstModFlags->setOperand(DstIndex, Flag);
@@ -1465,6 +1477,20 @@ Error IRLinker::linkModuleFlagsMetadata() {
}
+ // For the Min behavior, set the value to 0 if either module does not have the
+ // flag.
+ for (auto Idx : Mins) {
+ MDNode *Op = DstModFlags->getOperand(Idx);
+ MDString *ID = cast<MDString>(Op->getOperand(1));
+ if (!SeenMin.count(ID)) {
+ ConstantInt *V = mdconst::extract<ConstantInt>(Op->getOperand(2));
+ Metadata *FlagOps[] = {
+ Op->getOperand(0), ID,
+ ConstantAsMetadata::get(ConstantInt::get(V->getType(), 0))};
+ DstModFlags->setOperand(Idx, MDNode::get(DstM.getContext(), FlagOps));
+ }
+ }
+
// Check all of the requirements.
for (unsigned I = 0, E = Requirements.size(); I != E; ++I) {
MDNode *Requirement = Requirements[I];
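For reference (not part of the change above), how a producer sets a flag with the Min behavior merged here; "example-min-flag" is a made-up name. When two modules are linked the smaller value wins, and a module that lacks the flag now contributes zero:

    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Record a non-negative integer flag with 'min' merge semantics; the
    // Verifier change in this patch rejects negative values for such flags.
    static void addMinFlag(Module &M, uint32_t Value) {
      M.addModuleFlag(Module::Min, "example-min-flag", Value);
    }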
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index 78204ffe4c3b..0b4e9866d50a 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -144,9 +144,9 @@ struct ELFWriter {
uint64_t align(unsigned Alignment);
- bool maybeWriteCompression(uint64_t Size,
+ bool maybeWriteCompression(uint32_t ChType, uint64_t Size,
SmallVectorImpl<uint8_t> &CompressedContents,
- bool ZLibStyle, unsigned Alignment);
+ unsigned Alignment);
public:
ELFWriter(ELFObjectWriter &OWriter, raw_pwrite_stream &OS,
@@ -819,36 +819,25 @@ MCSectionELF *ELFWriter::createRelocationSection(MCContext &Ctx,
// Include the debug info compression header.
bool ELFWriter::maybeWriteCompression(
- uint64_t Size, SmallVectorImpl<uint8_t> &CompressedContents, bool ZLibStyle,
- unsigned Alignment) {
- if (ZLibStyle) {
- uint64_t HdrSize =
- is64Bit() ? sizeof(ELF::Elf32_Chdr) : sizeof(ELF::Elf64_Chdr);
- if (Size <= HdrSize + CompressedContents.size())
- return false;
- // Platform specific header is followed by compressed data.
- if (is64Bit()) {
- // Write Elf64_Chdr header.
- write(static_cast<ELF::Elf64_Word>(ELF::ELFCOMPRESS_ZLIB));
- write(static_cast<ELF::Elf64_Word>(0)); // ch_reserved field.
- write(static_cast<ELF::Elf64_Xword>(Size));
- write(static_cast<ELF::Elf64_Xword>(Alignment));
- } else {
- // Write Elf32_Chdr header otherwise.
- write(static_cast<ELF::Elf32_Word>(ELF::ELFCOMPRESS_ZLIB));
- write(static_cast<ELF::Elf32_Word>(Size));
- write(static_cast<ELF::Elf32_Word>(Alignment));
- }
- return true;
- }
-
- // "ZLIB" followed by 8 bytes representing the uncompressed size of the section,
- // useful for consumers to preallocate a buffer to decompress into.
- const StringRef Magic = "ZLIB";
- if (Size <= Magic.size() + sizeof(Size) + CompressedContents.size())
+ uint32_t ChType, uint64_t Size,
+ SmallVectorImpl<uint8_t> &CompressedContents, unsigned Alignment) {
+ uint64_t HdrSize =
+ is64Bit() ? sizeof(ELF::Elf32_Chdr) : sizeof(ELF::Elf64_Chdr);
+ if (Size <= HdrSize + CompressedContents.size())
return false;
- W.OS << Magic;
- support::endian::write(W.OS, Size, support::big);
+ // Platform specific header is followed by compressed data.
+ if (is64Bit()) {
+ // Write Elf64_Chdr header.
+ write(static_cast<ELF::Elf64_Word>(ChType));
+ write(static_cast<ELF::Elf64_Word>(0)); // ch_reserved field.
+ write(static_cast<ELF::Elf64_Xword>(Size));
+ write(static_cast<ELF::Elf64_Xword>(Alignment));
+ } else {
+ // Write Elf32_Chdr header otherwise.
+ write(static_cast<ELF::Elf32_Word>(ChType));
+ write(static_cast<ELF::Elf32_Word>(Size));
+ write(static_cast<ELF::Elf32_Word>(Alignment));
+ }
return true;
}
@@ -867,38 +856,31 @@ void ELFWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
return;
}
- assert((MAI->compressDebugSections() == DebugCompressionType::Z ||
- MAI->compressDebugSections() == DebugCompressionType::GNU) &&
- "expected zlib or zlib-gnu style compression");
+ assert(MAI->compressDebugSections() == DebugCompressionType::Z &&
+ "expected zlib style compression");
SmallVector<char, 128> UncompressedData;
raw_svector_ostream VecOS(UncompressedData);
Asm.writeSectionData(VecOS, &Section, Layout);
- SmallVector<uint8_t, 128> CompressedContents;
+ SmallVector<uint8_t, 128> Compressed;
+ const uint32_t ChType = ELF::ELFCOMPRESS_ZLIB;
compression::zlib::compress(
makeArrayRef(reinterpret_cast<uint8_t *>(UncompressedData.data()),
UncompressedData.size()),
- CompressedContents);
+ Compressed);
- bool ZlibStyle = MAI->compressDebugSections() == DebugCompressionType::Z;
- if (!maybeWriteCompression(UncompressedData.size(), CompressedContents,
- ZlibStyle, Sec.getAlignment())) {
+ if (!maybeWriteCompression(ChType, UncompressedData.size(), Compressed,
+ Sec.getAlignment())) {
W.OS << UncompressedData;
return;
}
- if (ZlibStyle) {
- // Set the compressed flag. That is zlib style.
- Section.setFlags(Section.getFlags() | ELF::SHF_COMPRESSED);
- // Alignment field should reflect the requirements of
- // the compressed section header.
- Section.setAlignment(is64Bit() ? Align(8) : Align(4));
- } else {
- // Add "z" prefix to section name. This is zlib-gnu style.
- MC.renameELFSection(&Section, (".z" + SectionName.drop_front(1)).str());
- }
- W.OS << toStringRef(CompressedContents);
+ Section.setFlags(Section.getFlags() | ELF::SHF_COMPRESSED);
+ // Alignment field should reflect the requirements of
+ // the compressed section header.
+ Section.setAlignment(is64Bit() ? Align(8) : Align(4));
+ W.OS << toStringRef(Compressed);
}
void ELFWriter::WriteSecHdrEntry(uint32_t Name, uint32_t Type, uint64_t Flags,
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index d312e3521c9e..322ed8e23eb6 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -468,24 +468,6 @@ MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section,
return R.first->second;
}
-void MCContext::renameELFSection(MCSectionELF *Section, StringRef Name) {
- StringRef GroupName;
- if (const MCSymbol *Group = Section->getGroup())
- GroupName = Group->getName();
-
- // This function is only used by .debug*, which should not have the
- // SHF_LINK_ORDER flag.
- unsigned UniqueID = Section->getUniqueID();
- ELFUniquingMap.erase(
- ELFSectionKey{Section->getName(), GroupName, "", UniqueID});
- auto I = ELFUniquingMap
- .insert(std::make_pair(
- ELFSectionKey{Name, GroupName, "", UniqueID}, Section))
- .first;
- StringRef CachedName = I->first.SectionName;
- const_cast<MCSectionELF *>(Section)->setSectionName(CachedName);
-}
-
MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type,
unsigned Flags, SectionKind K,
unsigned EntrySize,
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index 9f22b9b0a866..f358f593ff39 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -583,15 +583,27 @@ MCStreamer *llvm::createMachOStreamer(MCContext &Context,
return S;
}
-// Create the AddrSig section and first data fragment here as its layout needs
-// to be computed immediately after in order for it to be exported correctly.
+// The AddrSig section uses a series of relocations to refer to the symbols that
+// should be considered address-significant. The only interesting content of
+// these relocations is their symbol; the type, length etc will be ignored by
+// the linker. The reason we are not referring to the symbol indices directly is
+// that those indices will be invalidated by tools that update the symbol table.
+// Symbol relocations, in contrast, are updated by tools such as llvm-strip.
void MCMachOStreamer::createAddrSigSection() {
MCAssembler &Asm = getAssembler();
MCObjectWriter &writer = Asm.getWriter();
if (!writer.getEmitAddrsigSection())
return;
+ // Create the AddrSig section and first data fragment here as its layout needs
+ // to be computed immediately after in order for it to be exported correctly.
MCSection *AddrSigSection =
Asm.getContext().getObjectFileInfo()->getAddrSigSection();
Asm.registerSection(*AddrSigSection);
- new MCDataFragment(AddrSigSection);
+ auto *Frag = new MCDataFragment(AddrSigSection);
+ // We will generate a series of pointer-sized symbol relocations at offset
+ // 0x0. Set the section size to be large enough to contain a single pointer
+ // (instead of emitting a zero-sized section) so these relocations are
+ // technically valid, even though we don't expect these relocations to
+ // actually be applied by the linker.
+ Frag->getContents().resize(8);
}
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
index 694ea395fdec..af80d8327210 100644
--- a/llvm/lib/MC/MCParser/MasmParser.cpp
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -4238,11 +4238,8 @@ bool MasmParser::parseStructInitializer(const StructInfo &Structure,
}
}
// Default-initialize all remaining fields.
- for (auto It = Structure.Fields.begin() + FieldIndex;
- It != Structure.Fields.end(); ++It) {
- const FieldInfo &Field = *It;
+ for (const FieldInfo &Field : llvm::drop_begin(Structure.Fields, FieldIndex))
FieldInitializers.push_back(Field.Contents);
- }
if (EndToken) {
if (EndToken.value() == AsmToken::Greater)
@@ -4350,9 +4347,8 @@ bool MasmParser::emitFieldInitializer(const FieldInfo &Field,
return true;
}
// Default-initialize all remaining values.
- for (auto it = Contents.Values.begin() + Initializer.Values.size();
- it != Contents.Values.end(); ++it) {
- const auto &Value = *it;
+ for (const auto &Value :
+ llvm::drop_begin(Contents.Values, Initializer.Values.size())) {
if (emitIntValue(Value, Field.Type))
return true;
}
@@ -4367,9 +4363,8 @@ bool MasmParser::emitFieldInitializer(const FieldInfo &Field,
AsInt.getBitWidth() / 8);
}
// Default-initialize all remaining values.
- for (auto It = Contents.AsIntValues.begin() + Initializer.AsIntValues.size();
- It != Contents.AsIntValues.end(); ++It) {
- const auto &AsInt = *It;
+ for (const auto &AsInt :
+ llvm::drop_begin(Contents.AsIntValues, Initializer.AsIntValues.size())) {
getStreamer().emitIntValue(AsInt.getLimitedValue(),
AsInt.getBitWidth() / 8);
}
@@ -4384,10 +4379,8 @@ bool MasmParser::emitFieldInitializer(const FieldInfo &Field,
return true;
}
// Default-initialize all remaining values.
- for (auto It =
- Contents.Initializers.begin() + Initializer.Initializers.size();
- It != Contents.Initializers.end(); ++It) {
- const auto &Init = *It;
+ for (const auto &Init : llvm::drop_begin(Contents.Initializers,
+ Initializer.Initializers.size())) {
if (emitStructInitializer(Contents.Structure, Init))
return true;
}
@@ -4425,10 +4418,8 @@ bool MasmParser::emitStructInitializer(const StructInfo &Structure,
return true;
}
// Default-initialize all remaining fields.
- for (auto It =
- Structure.Fields.begin() + Initializer.FieldInitializers.size();
- It != Structure.Fields.end(); ++It) {
- const auto &Field = *It;
+ for (const auto &Field : llvm::drop_begin(
+ Structure.Fields, Initializer.FieldInitializers.size())) {
getStreamer().emitZeros(Field.Offset - Offset);
Offset = Field.Offset + Field.SizeOf;
if (emitFieldValue(Field))
@@ -4649,10 +4640,8 @@ bool MasmParser::parseDirectiveNestedEnds() {
if (ParentStruct.IsUnion) {
ParentStruct.Size = std::max(ParentStruct.Size, Structure.Size);
} else {
- for (auto FieldIter = ParentStruct.Fields.begin() + OldFields;
- FieldIter != ParentStruct.Fields.end(); ++FieldIter) {
- FieldIter->Offset += FirstFieldOffset;
- }
+ for (auto &Field : llvm::drop_begin(ParentStruct.Fields, OldFields))
+ Field.Offset += FirstFieldOffset;
const unsigned StructureEnd = FirstFieldOffset + Structure.Size;
if (!ParentStruct.IsUnion) {
diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp
index 5277ce87bee0..fdf8bbbe0a4d 100644
--- a/llvm/lib/MC/MCPseudoProbe.cpp
+++ b/llvm/lib/MC/MCPseudoProbe.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCPseudoProbe.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -519,7 +520,7 @@ void MCPseudoProbeDecoder::printProbesForAllAddresses(raw_ostream &OS) {
std::vector<uint64_t> Addresses;
for (auto Entry : Address2ProbesMap)
Addresses.push_back(Entry.first);
- std::sort(Addresses.begin(), Addresses.end());
+ llvm::sort(Addresses);
for (auto K : Addresses) {
OS << "Address:\t";
OS << K;
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 78d0d9cec556..038433cb24fa 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -753,32 +753,27 @@ static MachO::LoadCommandType getLCFromMCVM(MCVersionMinType Type) {
llvm_unreachable("Invalid mc version min type");
}
-// Encode addrsig data as symbol indexes in variable length encoding.
-void MachObjectWriter::writeAddrsigSection(MCAssembler &Asm) {
+void MachObjectWriter::populateAddrSigSection(MCAssembler &Asm) {
MCSection *AddrSigSection =
Asm.getContext().getObjectFileInfo()->getAddrSigSection();
- MCSection::FragmentListType &fragmentList = AddrSigSection->getFragmentList();
- if (!fragmentList.size())
- return;
-
- assert(fragmentList.size() == 1);
- MCFragment *pFragment = &*fragmentList.begin();
- MCDataFragment *pDataFragment = dyn_cast_or_null<MCDataFragment>(pFragment);
- assert(pDataFragment);
-
- raw_svector_ostream OS(pDataFragment->getContents());
- for (const MCSymbol *sym : this->getAddrsigSyms())
- encodeULEB128(sym->getIndex(), OS);
+ unsigned Log2Size = is64Bit() ? 3 : 2;
+ for (const MCSymbol *S : getAddrsigSyms()) {
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = 0;
+ MRE.r_word1 = (Log2Size << 25) | (MachO::GENERIC_RELOC_VANILLA << 28);
+ addRelocation(S, AddrSigSection, MRE);
+ }
}
uint64_t MachObjectWriter::writeObject(MCAssembler &Asm,
const MCAsmLayout &Layout) {
uint64_t StartOffset = W.OS.tell();
+ populateAddrSigSection(Asm);
+
// Compute symbol table information and bind symbol indices.
computeSymbolTable(Asm, LocalSymbolData, ExternalSymbolData,
UndefinedSymbolData);
- writeAddrsigSection(Asm);
if (!Asm.CGProfile.empty()) {
MCSection *CGProfileSection = Asm.getContext().getMachOSection(
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index 33e496b7a864..809ac37c3442 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -169,6 +169,7 @@ public:
Strings.clear();
SectionMap.clear();
SymbolMap.clear();
+ WeakDefaults.clear();
MCObjectWriter::reset();
}
diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp
index 977e77bf67fd..d46ae2247535 100644
--- a/llvm/lib/MC/XCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/XCOFFObjectWriter.cpp
@@ -206,6 +206,7 @@ class XCOFFObjectWriter : public MCObjectWriter {
uint16_t SectionCount = 0;
uint64_t RelocationEntryOffset = 0;
std::vector<std::pair<std::string, size_t>> FileNames;
+ bool HasVisibility = false;
support::endian::Writer W;
std::unique_ptr<MCXCOFFObjectTargetWriter> TargetObjectWriter;
@@ -275,6 +276,7 @@ class XCOFFObjectWriter : public MCObjectWriter {
void writeSymbolEntryForDwarfSection(const XCOFFSection &DwarfSectionRef,
int16_t SectionIndex);
void writeFileHeader();
+ void writeAuxFileHeader();
void writeSectionHeaderTable();
void writeSections(const MCAssembler &Asm, const MCAsmLayout &Layout);
void writeSectionForControlSectionEntry(const MCAssembler &Asm,
@@ -308,14 +310,9 @@ class XCOFFObjectWriter : public MCObjectWriter {
void assignAddressesAndIndices(const MCAsmLayout &);
void finalizeSectionInfo();
- // TODO aux header support not implemented.
- bool needsAuxiliaryHeader() const { return false; }
-
- // Returns the size of the auxiliary header to be written to the object file.
size_t auxiliaryHeaderSize() const {
- assert(!needsAuxiliaryHeader() &&
- "Auxiliary header support not implemented.");
- return 0;
+ // 64-bit object files have no auxiliary header.
+ return HasVisibility && !is64Bit() ? XCOFF::AuxFileHeaderSizeShort : 0;
}
public:
@@ -468,6 +465,9 @@ void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
const MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(&S);
const MCSectionXCOFF *ContainingCsect = getContainingCsect(XSym);
+ if (XSym->getVisibilityType() != XCOFF::SYM_V_UNSPECIFIED)
+ HasVisibility = true;
+
if (ContainingCsect->getCSectType() == XCOFF::XTY_ER) {
// Handle undefined symbol.
UndefinedCsects.emplace_back(ContainingCsect);
@@ -648,6 +648,7 @@ uint64_t XCOFFObjectWriter::writeObject(MCAssembler &Asm,
uint64_t StartOffset = W.OS.tell();
writeFileHeader();
+ writeAuxFileHeader();
writeSectionHeaderTable();
writeSections(Asm, Layout);
writeRelocations();
@@ -688,12 +689,6 @@ void XCOFFObjectWriter::writeSymbolEntry(StringRef SymbolName, uint64_t Value,
W.write<uint32_t>(Value);
}
W.write<int16_t>(SectionNumber);
- // Basic/Derived type. See the description of the n_type field for symbol
- // table entries for a detailed description. Since we don't yet support
- // visibility, and all other bits are either optionally set or reserved, this
- // is always zero.
- if (SymbolType != 0)
- report_fatal_error("Emitting non-zero visibilities is not supported yet.");
// TODO Set the function indicator (bit 10, 0x0020) for functions
// when debugging is enabled.
W.write<uint16_t>(SymbolType);
@@ -773,18 +768,32 @@ void XCOFFObjectWriter::writeFileHeader() {
W.write<int32_t>(0); // TimeStamp
writeWord(SymbolTableOffset);
if (is64Bit()) {
- W.write<uint16_t>(0); // AuxHeaderSize. No optional header for an object
- // file that is not to be loaded.
+ W.write<uint16_t>(auxiliaryHeaderSize());
W.write<uint16_t>(0); // Flags
W.write<int32_t>(SymbolTableEntryCount);
} else {
W.write<int32_t>(SymbolTableEntryCount);
- W.write<uint16_t>(0); // AuxHeaderSize. No optional header for an object
- // file that is not to be loaded.
+ W.write<uint16_t>(auxiliaryHeaderSize());
W.write<uint16_t>(0); // Flags
}
}
+void XCOFFObjectWriter::writeAuxFileHeader() {
+ if (!auxiliaryHeaderSize())
+ return;
+ W.write<uint16_t>(0); // Magic
+ W.write<uint16_t>(
+ XCOFF::NEW_XCOFF_INTERPRET); // Version. The new interpretation of the
+ // n_type field in the symbol table entry is
+ // used in XCOFF32.
+ W.write<uint32_t>(Sections[0]->Size); // TextSize
+ W.write<uint32_t>(Sections[1]->Size); // InitDataSize
+ W.write<uint32_t>(Sections[2]->Size); // BssDataSize
+ W.write<uint32_t>(0); // EntryPointAddr
+ W.write<uint32_t>(Sections[0]->Address); // TextStartAddr
+ W.write<uint32_t>(Sections[1]->Address); // DataStartAddr
+}
+
void XCOFFObjectWriter::writeSectionHeaderTable() {
auto writeSectionHeader = [&](const SectionEntry *Sec, bool IsDwarf) {
// Nothing to write for this Section.
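Note: the short auxiliary header emitted above is 2 + 2 + 6 * 4 = 28 bytes (Magic, Version, TextSize, InitDataSize, BssDataSize, EntryPointAddr, TextStartAddr, DataStartAddr), which presumably is the XCOFF::AuxFileHeaderSizeShort value that auxiliaryHeaderSize() reports for 32-bit objects carrying visibility.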
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.cpp b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
index f0e4f91cd347..8b44c09023f1 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.cpp
@@ -523,9 +523,6 @@ Error ELFSectionWriter<ELFT>::visit(const CompressedSection &Sec) {
case DebugCompressionType::None:
std::copy(Sec.OriginalData.begin(), Sec.OriginalData.end(), Buf);
return Error::success();
- case DebugCompressionType::GNU:
- llvm_unreachable("unexpected zlib-gnu");
- break;
case DebugCompressionType::Z:
Chdr.ch_type = ELF::ELFCOMPRESS_ZLIB;
break;
diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp
index ad03f9cae9f8..1dffe007b9a9 100644
--- a/llvm/lib/Object/Archive.cpp
+++ b/llvm/lib/Object/Archive.cpp
@@ -965,14 +965,15 @@ StringRef Archive::Symbol::getName() const {
Expected<Archive::Child> Archive::Symbol::getMember() const {
const char *Buf = Parent->getSymbolTable().begin();
const char *Offsets = Buf;
- if (Parent->kind() == K_GNU64 || Parent->kind() == K_DARWIN64)
+ if (Parent->kind() == K_GNU64 || Parent->kind() == K_DARWIN64 ||
+ Parent->kind() == K_AIXBIG)
Offsets += sizeof(uint64_t);
else
Offsets += sizeof(uint32_t);
uint64_t Offset = 0;
if (Parent->kind() == K_GNU) {
Offset = read32be(Offsets + SymbolIndex * 4);
- } else if (Parent->kind() == K_GNU64) {
+ } else if (Parent->kind() == K_GNU64 || Parent->kind() == K_AIXBIG) {
Offset = read64be(Offsets + SymbolIndex * 8);
} else if (Parent->kind() == K_BSD) {
// The SymbolIndex is an index into the ranlib structs that start at
@@ -1105,6 +1106,8 @@ Archive::symbol_iterator Archive::symbol_begin() const {
// Skip the byte count of the string table.
buf += sizeof(uint64_t);
buf += ran_strx;
+ } else if (kind() == K_AIXBIG) {
+ buf = getStringTable().begin();
} else {
uint32_t member_count = 0;
uint32_t symbol_count = 0;
@@ -1127,7 +1130,7 @@ uint32_t Archive::getNumberOfSymbols() const {
const char *buf = getSymbolTable().begin();
if (kind() == K_GNU)
return read32be(buf);
- if (kind() == K_GNU64)
+ if (kind() == K_GNU64 || kind() == K_AIXBIG)
return read64be(buf);
if (kind() == K_BSD)
return read32le(buf) / 8;
@@ -1180,6 +1183,58 @@ BigArchive::BigArchive(MemoryBufferRef Source, Error &Err)
Err = malformedError("malformed AIX big archive: last member offset \"" +
RawOffset + "\" is not a number");
+ // Parse the global symbol table, if the archive has one.
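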
+ uint64_t GlobSymOffset = 0;
+ RawOffset = getFieldRawString(ArFixLenHdr->GlobSymOffset);
+ if (RawOffset.getAsInteger(10, GlobSymOffset))
+ // TODO: add test case.
+ Err = malformedError(
+ "malformed AIX big archive: global symbol table offset \"" + RawOffset +
+ "\" is not a number");
+
+ if (Err)
+ return;
+
+ if (GlobSymOffset > 0) {
+ uint64_t BufferSize = Data.getBufferSize();
+ uint64_t GlobalSymTblContentOffset =
+ GlobSymOffset + sizeof(BigArMemHdrType);
+ if (GlobalSymTblContentOffset > BufferSize) {
+ Err = malformedError("global symbol table header at offset 0x" +
+ Twine::utohexstr(GlobSymOffset) + " and size 0x" +
+ Twine::utohexstr(sizeof(BigArMemHdrType)) +
+ " goes past the end of file");
+ return;
+ }
+
+ const char *GlobSymTblLoc = Data.getBufferStart() + GlobSymOffset;
+ const BigArMemHdrType *GlobalSymHdr =
+ reinterpret_cast<const BigArMemHdrType *>(GlobSymTblLoc);
+ RawOffset = getFieldRawString(GlobalSymHdr->Size);
+ uint64_t Size;
+ if (RawOffset.getAsInteger(10, Size)) {
+ // TODO: add test case.
+ Err = malformedError(
+ "malformed AIX big archive: global symbol table size \"" + RawOffset +
+ "\" is not a number");
+ return;
+ }
+ if (GlobalSymTblContentOffset + Size > BufferSize) {
+ Err = malformedError("global symbol table content at offset 0x" +
+ Twine::utohexstr(GlobalSymTblContentOffset) +
+ " and size 0x" + Twine::utohexstr(Size) +
+ " goes past the end of file");
+ return;
+ }
+ SymbolTable = StringRef(GlobSymTblLoc + sizeof(BigArMemHdrType), Size);
+ unsigned SymNum = getNumberOfSymbols();
+ unsigned SymOffsetsSize = 8 * (SymNum + 1);
+ uint64_t SymbolTableStringSize = Size - SymOffsetsSize;
+ StringTable =
+ StringRef(GlobSymTblLoc + sizeof(BigArMemHdrType) + SymOffsetsSize,
+ SymbolTableStringSize);
+ }
+
child_iterator I = child_begin(Err, false);
if (Err)
return;
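Note: the layout parsed here is the big-archive global symbol table: the fixed member header (BigArMemHdrType) is followed by an 8-byte symbol count, then one 8-byte member offset per symbol, then the symbol-name string table. That is why SymOffsetsSize is 8 * (SymNum + 1) and why StringTable begins that many bytes into the member contents.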
diff --git a/llvm/lib/Object/Decompressor.cpp b/llvm/lib/Object/Decompressor.cpp
index a6a28a0589ac..3842ec92ccfc 100644
--- a/llvm/lib/Object/Decompressor.cpp
+++ b/llvm/lib/Object/Decompressor.cpp
@@ -23,9 +23,7 @@ Expected<Decompressor> Decompressor::create(StringRef Name, StringRef Data,
return createError("zlib is not available");
Decompressor D(Data);
- Error Err = isGnuStyle(Name) ? D.consumeCompressedGnuHeader()
- : D.consumeCompressedZLibHeader(Is64Bit, IsLE);
- if (Err)
+ if (Error Err = D.consumeCompressedZLibHeader(Is64Bit, IsLE))
return std::move(Err);
return D;
}
@@ -33,21 +31,6 @@ Expected<Decompressor> Decompressor::create(StringRef Name, StringRef Data,
Decompressor::Decompressor(StringRef Data)
: SectionData(Data), DecompressedSize(0) {}
-Error Decompressor::consumeCompressedGnuHeader() {
- if (!SectionData.startswith("ZLIB"))
- return createError("corrupted compressed section header");
-
- SectionData = SectionData.substr(4);
-
- // Consume uncompressed section size (big-endian 8 bytes).
- if (SectionData.size() < 8)
- return createError("corrupted uncompressed section size");
- DecompressedSize = read64be(SectionData.data());
- SectionData = SectionData.substr(8);
-
- return Error::success();
-}
-
Error Decompressor::consumeCompressedZLibHeader(bool Is64Bit,
bool IsLittleEndian) {
using namespace ELF;
@@ -72,26 +55,6 @@ Error Decompressor::consumeCompressedZLibHeader(bool Is64Bit,
return Error::success();
}
-bool Decompressor::isGnuStyle(StringRef Name) {
- return Name.startswith(".zdebug");
-}
-
-bool Decompressor::isCompressed(const object::SectionRef &Section) {
- if (Section.isCompressed())
- return true;
-
- Expected<StringRef> SecNameOrErr = Section.getName();
- if (SecNameOrErr)
- return isGnuStyle(*SecNameOrErr);
-
- consumeError(SecNameOrErr.takeError());
- return false;
-}
-
-bool Decompressor::isCompressedELFSection(uint64_t Flags, StringRef Name) {
- return (Flags & ELF::SHF_COMPRESSED) || isGnuStyle(Name);
-}
-
Error Decompressor::decompress(MutableArrayRef<uint8_t> Buffer) {
size_t Size = Buffer.size();
return compression::zlib::uncompress(arrayRefFromStringRef(SectionData),
diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp
index ce816b097691..d00359c6deef 100644
--- a/llvm/lib/Object/WasmObjectFile.cpp
+++ b/llvm/lib/Object/WasmObjectFile.cpp
@@ -204,7 +204,7 @@ static Error readInitExpr(wasm::WasmInitExpr &Expr,
if (Expr.Extended) {
Ctx.Ptr = Start;
- while (1) {
+ while (true) {
uint8_t Opcode = readOpcode(Ctx);
switch (Opcode) {
case wasm::WASM_OPCODE_I32_CONST:
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 593243144f01..3b3eefcc29ca 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -172,7 +172,7 @@ static cl::opt<bool> EnableEagerlyInvalidateAnalyses(
cl::desc("Eagerly invalidate more analyses in default pipelines"));
static cl::opt<bool> EnableNoRerunSimplificationPipeline(
- "enable-no-rerun-simplification-pipeline", cl::init(false), cl::Hidden,
+ "enable-no-rerun-simplification-pipeline", cl::init(true), cl::Hidden,
cl::desc(
"Prevent running the simplification pipeline on a function more "
"than once in the case that SCC mutations cause a function to be "
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index bad8184dffcf..baea0eb53ef9 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -81,36 +81,35 @@ cl::opt<bool> PreservedCFGCheckerInstrumentation::VerifyPreservedCFG(
// facilities, the error message will be shown in place of the expected output.
//
enum class ChangePrinter {
- NoChangePrinter,
- PrintChangedVerbose,
- PrintChangedQuiet,
- PrintChangedDiffVerbose,
- PrintChangedDiffQuiet,
- PrintChangedColourDiffVerbose,
- PrintChangedColourDiffQuiet,
- PrintChangedDotCfgVerbose,
- PrintChangedDotCfgQuiet
+ None,
+ Verbose,
+ Quiet,
+ DiffVerbose,
+ DiffQuiet,
+ ColourDiffVerbose,
+ ColourDiffQuiet,
+ DotCfgVerbose,
+ DotCfgQuiet,
};
static cl::opt<ChangePrinter> PrintChanged(
"print-changed", cl::desc("Print changed IRs"), cl::Hidden,
- cl::ValueOptional, cl::init(ChangePrinter::NoChangePrinter),
+ cl::ValueOptional, cl::init(ChangePrinter::None),
cl::values(
- clEnumValN(ChangePrinter::PrintChangedQuiet, "quiet",
- "Run in quiet mode"),
- clEnumValN(ChangePrinter::PrintChangedDiffVerbose, "diff",
+ clEnumValN(ChangePrinter::Quiet, "quiet", "Run in quiet mode"),
+ clEnumValN(ChangePrinter::DiffVerbose, "diff",
"Display patch-like changes"),
- clEnumValN(ChangePrinter::PrintChangedDiffQuiet, "diff-quiet",
+ clEnumValN(ChangePrinter::DiffQuiet, "diff-quiet",
"Display patch-like changes in quiet mode"),
- clEnumValN(ChangePrinter::PrintChangedColourDiffVerbose, "cdiff",
+ clEnumValN(ChangePrinter::ColourDiffVerbose, "cdiff",
"Display patch-like changes with color"),
- clEnumValN(ChangePrinter::PrintChangedColourDiffQuiet, "cdiff-quiet",
+ clEnumValN(ChangePrinter::ColourDiffQuiet, "cdiff-quiet",
"Display patch-like changes in quiet mode with color"),
- clEnumValN(ChangePrinter::PrintChangedDotCfgVerbose, "dot-cfg",
+ clEnumValN(ChangePrinter::DotCfgVerbose, "dot-cfg",
"Create a website with graphical changes"),
- clEnumValN(ChangePrinter::PrintChangedDotCfgQuiet, "dot-cfg-quiet",
+ clEnumValN(ChangePrinter::DotCfgQuiet, "dot-cfg-quiet",
"Create a website with graphical changes in quiet mode"),
// Sentinel value for unspecified option.
- clEnumValN(ChangePrinter::PrintChangedVerbose, "", "")));
+ clEnumValN(ChangePrinter::Verbose, "", "")));
// An option that supports the -print-changed option. See
// the description for -print-changed for an explanation of the use
@@ -596,8 +595,8 @@ void TextChangeReporter<T>::handleIgnored(StringRef PassID, std::string &Name) {
IRChangedPrinter::~IRChangedPrinter() = default;
void IRChangedPrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) {
- if (PrintChanged == ChangePrinter::PrintChangedVerbose ||
- PrintChanged == ChangePrinter::PrintChangedQuiet)
+ if (PrintChanged == ChangePrinter::Verbose ||
+ PrintChanged == ChangePrinter::Quiet)
TextChangeReporter<std::string>::registerRequiredCallbacks(PIC);
}
@@ -940,7 +939,22 @@ void PrintPassInstrumentation::registerCallbacks(
if (isSpecialPass(PassID, SpecialPasses))
return;
- print() << "Running pass: " << PassID << " on " << getIRName(IR) << "\n";
+ auto &OS = print();
+ OS << "Running pass: " << PassID << " on " << getIRName(IR);
+ if (any_isa<const Function *>(IR)) {
+ unsigned Count = any_cast<const Function *>(IR)->getInstructionCount();
+ OS << " (" << Count << " instruction";
+ if (Count != 1)
+ OS << 's';
+ OS << ')';
+ } else if (any_isa<const LazyCallGraph::SCC *>(IR)) {
+ int Count = any_cast<const LazyCallGraph::SCC *>(IR)->size();
+ OS << " (" << Count << " node";
+ if (Count != 1)
+ OS << 's';
+ OS << ')';
+ }
+ OS << "\n";
Indent += 2;
});
PIC.registerAfterPassCallback(
@@ -1260,10 +1274,10 @@ void InLineChangePrinter::handleFunctionCompare(
}
void InLineChangePrinter::registerCallbacks(PassInstrumentationCallbacks &PIC) {
- if (PrintChanged == ChangePrinter::PrintChangedDiffVerbose ||
- PrintChanged == ChangePrinter::PrintChangedDiffQuiet ||
- PrintChanged == ChangePrinter::PrintChangedColourDiffVerbose ||
- PrintChanged == ChangePrinter::PrintChangedColourDiffQuiet)
+ if (PrintChanged == ChangePrinter::DiffVerbose ||
+ PrintChanged == ChangePrinter::DiffQuiet ||
+ PrintChanged == ChangePrinter::ColourDiffVerbose ||
+ PrintChanged == ChangePrinter::ColourDiffQuiet)
TextChangeReporter<IRDataT<EmptyData>>::registerRequiredCallbacks(PIC);
}
@@ -2096,8 +2110,8 @@ DotCfgChangeReporter::~DotCfgChangeReporter() {
void DotCfgChangeReporter::registerCallbacks(
PassInstrumentationCallbacks &PIC) {
- if ((PrintChanged == ChangePrinter::PrintChangedDotCfgVerbose ||
- PrintChanged == ChangePrinter::PrintChangedDotCfgQuiet)) {
+ if (PrintChanged == ChangePrinter::DotCfgVerbose ||
+ PrintChanged == ChangePrinter::DotCfgQuiet) {
SmallString<128> OutputDir;
sys::fs::expand_tilde(DotCfgDir, OutputDir);
sys::fs::make_absolute(OutputDir);
@@ -2114,14 +2128,12 @@ void DotCfgChangeReporter::registerCallbacks(
StandardInstrumentations::StandardInstrumentations(
bool DebugLogging, bool VerifyEach, PrintPassOptions PrintPassOpts)
: PrintPass(DebugLogging, PrintPassOpts), OptNone(DebugLogging),
- PrintChangedIR(PrintChanged == ChangePrinter::PrintChangedVerbose),
- PrintChangedDiff(
- PrintChanged == ChangePrinter::PrintChangedDiffVerbose ||
- PrintChanged == ChangePrinter::PrintChangedColourDiffVerbose,
- PrintChanged == ChangePrinter::PrintChangedColourDiffVerbose ||
- PrintChanged == ChangePrinter::PrintChangedColourDiffQuiet),
- WebsiteChangeReporter(PrintChanged ==
- ChangePrinter::PrintChangedDotCfgVerbose),
+ PrintChangedIR(PrintChanged == ChangePrinter::Verbose),
+ PrintChangedDiff(PrintChanged == ChangePrinter::DiffVerbose ||
+ PrintChanged == ChangePrinter::ColourDiffVerbose,
+ PrintChanged == ChangePrinter::ColourDiffVerbose ||
+ PrintChanged == ChangePrinter::ColourDiffQuiet),
+ WebsiteChangeReporter(PrintChanged == ChangePrinter::DotCfgVerbose),
Verify(DebugLogging), VerifyEach(VerifyEach) {}
PrintCrashIRInstrumentation *PrintCrashIRInstrumentation::CrashReporter =
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index f4f13bafb233..8c1eadbe8271 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -349,7 +349,7 @@ CoverageMapping::load(ArrayRef<StringRef> ObjectFilenames,
StringRef CompilationDir) {
auto ProfileReaderOrErr = IndexedInstrProfReader::create(ProfileFilename);
if (Error E = ProfileReaderOrErr.takeError())
- return std::move(E);
+ return createFileError(ProfileFilename, std::move(E));
auto ProfileReader = std::move(ProfileReaderOrErr.get());
auto Coverage = std::unique_ptr<CoverageMapping>(new CoverageMapping());
bool DataFound = false;
@@ -358,7 +358,7 @@ CoverageMapping::load(ArrayRef<StringRef> ObjectFilenames,
auto CovMappingBufOrErr = MemoryBuffer::getFileOrSTDIN(
File.value(), /*IsText=*/false, /*RequiresNullTerminator=*/false);
if (std::error_code EC = CovMappingBufOrErr.getError())
- return errorCodeToError(EC);
+ return createFileError(File.value(), errorCodeToError(EC));
StringRef Arch = Arches.empty() ? StringRef() : Arches[File.index()];
MemoryBufferRef CovMappingBufRef =
CovMappingBufOrErr.get()->getMemBufferRef();
@@ -368,7 +368,7 @@ CoverageMapping::load(ArrayRef<StringRef> ObjectFilenames,
if (Error E = CoverageReadersOrErr.takeError()) {
E = handleMaybeNoDataFoundError(std::move(E));
if (E)
- return std::move(E);
+ return createFileError(File.value(), std::move(E));
// E == success (originally a no_data_found error).
continue;
}
@@ -378,12 +378,14 @@ CoverageMapping::load(ArrayRef<StringRef> ObjectFilenames,
Readers.push_back(std::move(Reader));
DataFound |= !Readers.empty();
if (Error E = loadFromReaders(Readers, *ProfileReader, *Coverage))
- return std::move(E);
+ return createFileError(File.value(), std::move(E));
}
// If no readers were created, either no objects were provided or none of them
// had coverage data. Return an error in the latter case.
if (!DataFound && !ObjectFilenames.empty())
- return make_error<CoverageMapError>(coveragemap_error::no_data_found);
+ return createFileError(
+ join(ObjectFilenames.begin(), ObjectFilenames.end(), ", "),
+ make_error<CoverageMapError>(coveragemap_error::no_data_found));
return std::move(Coverage);
}
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index ee8989979a26..23804ce604c4 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -1026,20 +1026,50 @@ InstrProfSymtab &IndexedInstrProfReader::getSymtab() {
return *Symtab;
}
-Expected<InstrProfRecord>
-IndexedInstrProfReader::getInstrProfRecord(StringRef FuncName,
- uint64_t FuncHash) {
+Expected<InstrProfRecord> IndexedInstrProfReader::getInstrProfRecord(
+ StringRef FuncName, uint64_t FuncHash, uint64_t *MismatchedFuncSum) {
ArrayRef<NamedInstrProfRecord> Data;
+ uint64_t FuncSum = 0;
Error Err = Remapper->getRecords(FuncName, Data);
if (Err)
return std::move(Err);
// Found it. Look for counters with the right hash.
+
+ // A flag to indicate whether any record has the same profile kind
+ // (i.e. CS vs. non-CS) as the queried hash.
+ bool CSBitMatch = false;
+ auto getFuncSum = [](const std::vector<uint64_t> &Counts) {
+ uint64_t ValueSum = 0;
+ for (unsigned I = 0, S = Counts.size(); I < S; I++) {
+ uint64_t CountValue = Counts[I];
+ if (CountValue == (uint64_t)-1)
+ continue;
+ // Handle overflow -- if that happens, return max.
+ if (std::numeric_limits<uint64_t>::max() - CountValue <= ValueSum)
+ return std::numeric_limits<uint64_t>::max();
+ ValueSum += CountValue;
+ }
+ return ValueSum;
+ };
+
for (const NamedInstrProfRecord &I : Data) {
// Check for a match and fill the vector if there is one.
if (I.Hash == FuncHash)
return std::move(I);
+ if (NamedInstrProfRecord::hasCSFlagInHash(I.Hash) ==
+ NamedInstrProfRecord::hasCSFlagInHash(FuncHash)) {
+ CSBitMatch = true;
+ if (MismatchedFuncSum == nullptr)
+ continue;
+ FuncSum = std::max(FuncSum, getFuncSum(I.Counts));
+ }
+ }
+ if (CSBitMatch) {
+ if (MismatchedFuncSum != nullptr)
+ *MismatchedFuncSum = FuncSum;
+ return error(instrprof_error::hash_mismatch);
}
- return error(instrprof_error::hash_mismatch);
+ return error(instrprof_error::unknown_function);
}
Expected<memprof::MemProfRecord>
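Note: after this change, a name that resolves only to records of the same profile kind but a different hash reports hash_mismatch (optionally surfacing the largest counter sum among those records), while a name with no same-kind records reports unknown_function. A hedged sketch of a caller using the new out-parameter; Reader, FuncName and FuncHash are placeholders:

uint64_t MismatchedFuncSum = 0;
Expected<InstrProfRecord> Rec =
    Reader.getInstrProfRecord(FuncName, FuncHash, &MismatchedFuncSum);
if (!Rec) {
  // On hash_mismatch, MismatchedFuncSum holds the maximum counter sum seen
  // among records whose CS flag matched but whose hash did not.
  logAllUnhandledErrors(Rec.takeError(), errs(), "profile lookup: ");
}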
diff --git a/llvm/lib/Support/AddressRanges.cpp b/llvm/lib/Support/AddressRanges.cpp
index 5ba011bac4e9..187d5be00dae 100644
--- a/llvm/lib/Support/AddressRanges.cpp
+++ b/llvm/lib/Support/AddressRanges.cpp
@@ -12,48 +12,59 @@
using namespace llvm;
-void AddressRanges::insert(AddressRange Range) {
+AddressRanges::Collection::const_iterator
+AddressRanges::insert(AddressRange Range) {
if (Range.size() == 0)
- return;
+ return Ranges.end();
auto It = llvm::upper_bound(Ranges, Range);
auto It2 = It;
- while (It2 != Ranges.end() && It2->start() < Range.end())
+ while (It2 != Ranges.end() && It2->start() <= Range.end())
++It2;
if (It != It2) {
- Range = {Range.start(), std::max(Range.end(), It2[-1].end())};
+ Range = {Range.start(), std::max(Range.end(), std::prev(It2)->end())};
It = Ranges.erase(It, It2);
}
- if (It != Ranges.begin() && Range.start() < It[-1].end())
- It[-1] = {It[-1].start(), std::max(It[-1].end(), Range.end())};
- else
- Ranges.insert(It, Range);
+ if (It != Ranges.begin() && Range.start() <= std::prev(It)->end()) {
+ --It;
+ *It = {It->start(), std::max(It->end(), Range.end())};
+ return It;
+ }
+
+ return Ranges.insert(It, Range);
}
-bool AddressRanges::contains(uint64_t Addr) const {
+AddressRanges::Collection::const_iterator
+AddressRanges::find(uint64_t Addr) const {
auto It = std::partition_point(
Ranges.begin(), Ranges.end(),
[=](const AddressRange &R) { return R.start() <= Addr; });
- return It != Ranges.begin() && Addr < It[-1].end();
+
+ if (It == Ranges.begin())
+ return Ranges.end();
+
+ --It;
+ if (Addr >= It->end())
+ return Ranges.end();
+
+ return It;
}
-bool AddressRanges::contains(AddressRange Range) const {
+AddressRanges::Collection::const_iterator
+AddressRanges::find(AddressRange Range) const {
if (Range.size() == 0)
- return false;
+ return Ranges.end();
+
auto It = std::partition_point(
Ranges.begin(), Ranges.end(),
[=](const AddressRange &R) { return R.start() <= Range.start(); });
+
if (It == Ranges.begin())
- return false;
- return Range.end() <= It[-1].end();
-}
+ return Ranges.end();
-Optional<AddressRange>
-AddressRanges::getRangeThatContains(uint64_t Addr) const {
- auto It = std::partition_point(
- Ranges.begin(), Ranges.end(),
- [=](const AddressRange &R) { return R.start() <= Addr; });
- if (It != Ranges.begin() && Addr < It[-1].end())
- return It[-1];
- return llvm::None;
+ --It;
+ if (Range.end() > It->end())
+ return Ranges.end();
+
+ return It;
}
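Note: insert() and find() now return const_iterators into the underlying collection instead of void/bool, and touching ranges coalesce because the comparisons were relaxed from < to <=. A minimal sketch, assuming the class also exposes end() over the same collection (as its use of Ranges.end() as the not-found sentinel implies) and omitting includes:

AddressRanges Ranges;
Ranges.insert({0x1000, 0x2000});
// Touching ranges merge: the returned iterator points at [0x1000, 0x3000).
auto MergedIt = Ranges.insert({0x2000, 0x3000});
// find() yields an iterator to the containing range, or the end iterator.
auto HitIt = Ranges.find(0x2fff);  // inside the merged range
auto MissIt = Ranges.find(0x3000); // == end(), range ends are exclusive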
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index e3df172ef113..5e7d63165130 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -2382,7 +2382,7 @@ protected:
for (size_t I = 0, E = Opts.size(); I != E; ++I) {
Option *Opt = Opts[I].second;
for (auto &Cat : Opt->Categories) {
- assert(find(SortedCategories, Cat) != SortedCategories.end() &&
+ assert(llvm::is_contained(SortedCategories, Cat) &&
"Option has an unregistered category");
CategorizedOptions[Cat].push_back(Opt);
}
diff --git a/llvm/lib/Support/Compression.cpp b/llvm/lib/Support/Compression.cpp
index 21191972fb8b..e8fb715aa770 100644
--- a/llvm/lib/Support/Compression.cpp
+++ b/llvm/lib/Support/Compression.cpp
@@ -20,6 +20,9 @@
#if LLVM_ENABLE_ZLIB
#include <zlib.h>
#endif
+#if LLVM_ENABLE_ZSTD
+#include <zstd.h>
+#endif
using namespace llvm;
using namespace llvm::compression;
@@ -100,3 +103,65 @@ Error zlib::uncompress(ArrayRef<uint8_t> Input,
llvm_unreachable("zlib::uncompress is unavailable");
}
#endif
+
+#if LLVM_ENABLE_ZSTD
+
+bool zstd::isAvailable() { return true; }
+
+void zstd::compress(ArrayRef<uint8_t> Input,
+ SmallVectorImpl<uint8_t> &CompressedBuffer, int Level) {
+ unsigned long CompressedBufferSize = ::ZSTD_compressBound(Input.size());
+ CompressedBuffer.resize_for_overwrite(CompressedBufferSize);
+ unsigned long CompressedSize =
+ ::ZSTD_compress((char *)CompressedBuffer.data(), CompressedBufferSize,
+ (const char *)Input.data(), Input.size(), Level);
+ if (ZSTD_isError(CompressedSize))
+ report_bad_alloc_error("Allocation failed");
+ // Tell MemorySanitizer that the zstd output buffer is fully initialized.
+ // This avoids a false report when running LLVM with uninstrumented zstd.
+ __msan_unpoison(CompressedBuffer.data(), CompressedSize);
+ if (CompressedSize < CompressedBuffer.size())
+ CompressedBuffer.truncate(CompressedSize);
+}
+
+Error zstd::uncompress(ArrayRef<uint8_t> Input, uint8_t *UncompressedBuffer,
+ size_t &UncompressedSize) {
+ const size_t Res =
+ ::ZSTD_decompress(UncompressedBuffer, UncompressedSize,
+ (const uint8_t *)Input.data(), Input.size());
+ UncompressedSize = Res;
+ // Tell MemorySanitizer that the zstd output buffer is fully initialized.
+ // This avoids a false report when running LLVM with uninstrumented zstd.
+ __msan_unpoison(UncompressedBuffer, UncompressedSize);
+ return ZSTD_isError(Res) ? make_error<StringError>(ZSTD_getErrorName(Res),
+ inconvertibleErrorCode())
+ : Error::success();
+}
+
+Error zstd::uncompress(ArrayRef<uint8_t> Input,
+ SmallVectorImpl<uint8_t> &UncompressedBuffer,
+ size_t UncompressedSize) {
+ UncompressedBuffer.resize_for_overwrite(UncompressedSize);
+ Error E =
+ zstd::uncompress(Input, UncompressedBuffer.data(), UncompressedSize);
+ if (UncompressedSize < UncompressedBuffer.size())
+ UncompressedBuffer.truncate(UncompressedSize);
+ return E;
+}
+
+#else
+bool zstd::isAvailable() { return false; }
+void zstd::compress(ArrayRef<uint8_t> Input,
+ SmallVectorImpl<uint8_t> &CompressedBuffer, int Level) {
+ llvm_unreachable("zstd::compress is unavailable");
+}
+Error zstd::uncompress(ArrayRef<uint8_t> Input, uint8_t *UncompressedBuffer,
+ size_t &UncompressedSize) {
+ llvm_unreachable("zstd::uncompress is unavailable");
+}
+Error zstd::uncompress(ArrayRef<uint8_t> Input,
+ SmallVectorImpl<uint8_t> &UncompressedBuffer,
+ size_t UncompressedSize) {
+ llvm_unreachable("zstd::uncompress is unavailable");
+}
+#endif
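Note: a hedged round-trip sketch of the new zstd wrappers; it only applies when LLVM is configured with LLVM_ENABLE_ZSTD, includes are omitted, and the explicit level 5 merely stands in for whatever default the header provides:

static llvm::Error roundTripZstd() {
  using namespace llvm;
  using namespace llvm::compression;
  StringRef Payload = "some section bytes";
  SmallVector<uint8_t, 0> Compressed;
  zstd::compress(arrayRefFromStringRef(Payload), Compressed, /*Level=*/5);
  SmallVector<uint8_t, 0> Decompressed;
  // The caller must know (or have stored) the uncompressed size up front.
  return zstd::uncompress(Compressed, Decompressed, Payload.size());
}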
diff --git a/llvm/lib/Support/DivisionByConstantInfo.cpp b/llvm/lib/Support/DivisionByConstantInfo.cpp
index 69f39386798c..35486674e02f 100644
--- a/llvm/lib/Support/DivisionByConstantInfo.cpp
+++ b/llvm/lib/Support/DivisionByConstantInfo.cpp
@@ -1,4 +1,4 @@
-//===----- DivisonByConstantInfo.cpp - division by constant -*- C++ -*-----===//
+//===----- DivisionByConstantInfo.cpp - division by constant -*- C++ -*----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -62,11 +62,11 @@ SignedDivisionByConstantInfo SignedDivisionByConstantInfo::get(const APInt &D) {
/// S. Warren, Jr., chapter 10.
/// LeadingZeros can be used to simplify the calculation if the upper bits
/// of the divided value are known zero.
-UnsignedDivisonByConstantInfo
-UnsignedDivisonByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
+UnsignedDivisionByConstantInfo
+UnsignedDivisionByConstantInfo::get(const APInt &D, unsigned LeadingZeros) {
unsigned P;
APInt NC, Delta, Q1, R1, Q2, R2;
- struct UnsignedDivisonByConstantInfo Retval;
+ struct UnsignedDivisionByConstantInfo Retval;
Retval.IsAdd = false; // initialize "add" indicator
APInt AllOnes = APInt::getAllOnes(D.getBitWidth()).lshr(LeadingZeros);
APInt SignedMin = APInt::getSignedMinValue(D.getBitWidth());
diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp
index 7fe04af4696b..0fe286d239d4 100644
--- a/llvm/lib/Support/RISCVISAInfo.cpp
+++ b/llvm/lib/Support/RISCVISAInfo.cpp
@@ -77,6 +77,8 @@ static const RISCVSupportedExtension SupportedExtensions[] = {
{"zkt", RISCVExtensionVersion{1, 0}},
{"zk", RISCVExtensionVersion{1, 0}},
+ {"zmmul", RISCVExtensionVersion{1, 0}},
+
{"v", RISCVExtensionVersion{1, 0}},
{"zvl32b", RISCVExtensionVersion{1, 0}},
{"zvl64b", RISCVExtensionVersion{1, 0}},
diff --git a/llvm/lib/TableGen/JSONBackend.cpp b/llvm/lib/TableGen/JSONBackend.cpp
index e38903910275..6dc466e29df3 100644
--- a/llvm/lib/TableGen/JSONBackend.cpp
+++ b/llvm/lib/TableGen/JSONBackend.cpp
@@ -129,13 +129,13 @@ void JSONEmitter::run(raw_ostream &OS) {
// construct the array for each one.
std::map<std::string, json::Array> instance_lists;
for (const auto &C : Records.getClasses()) {
- auto &Name = C.second->getNameInitAsString();
+ const auto Name = C.second->getNameInitAsString();
(void)instance_lists[Name];
}
// Main iteration over the defs.
for (const auto &D : Records.getDefs()) {
- auto &Name = D.second->getNameInitAsString();
+ const auto Name = D.second->getNameInitAsString();
auto &Def = *D.second;
json::Object obj;
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 75a99e95541a..6b899a049e6b 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -2424,6 +2424,14 @@ void RecordVal::print(raw_ostream &OS, bool PrintSem) const {
if (PrintSem) OS << ";\n";
}
+void Record::updateClassLoc(SMLoc Loc) {
+ assert(Locs.size() == 1);
+ ForwardDeclarationLocs.push_back(Locs.front());
+
+ Locs.clear();
+ Locs.push_back(Loc);
+}
+
void Record::checkName() {
// Ensure the record name has string type.
const TypedInit *TypedName = cast<const TypedInit>(Name);
diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp
index acf93dc3d792..aab1802c5348 100644
--- a/llvm/lib/TableGen/TGParser.cpp
+++ b/llvm/lib/TableGen/TGParser.cpp
@@ -3391,6 +3391,8 @@ bool TGParser::ParseClass() {
!CurRec->getTemplateArgs().empty())
return TokError("Class '" + CurRec->getNameInitAsString() +
"' already defined");
+
+ CurRec->updateClassLoc(Lex.getLoc());
} else {
// If this is the first reference to this class, create and add it.
auto NewRec =
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index b332e9dcb176..8fb5d49e2121 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -216,7 +216,7 @@ def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128",
"IsPaired128Slow", "true", "Paired 128 bit loads and stores are slow">;
def FeatureAscendStoreAddress : SubtargetFeature<"ascend-store-address",
- "IsStoreAddressAscend", "false",
+ "IsStoreAddressAscend", "true",
"Schedule vector stores by ascending address">;
def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "IsSTRQroSlow",
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 82fe5772c99d..00621b84d2f2 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -69,6 +69,7 @@ public:
bool tryMLAV64LaneV128(SDNode *N);
bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
+ bool SelectArithUXTXRegister(SDValue N, SDValue &Reg, SDValue &Shift);
bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
@@ -893,6 +894,30 @@ bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
return isWorthFolding(N);
}
+/// SelectArithUXTXRegister - Select a "UXTX register" operand. This
+/// operand is referred to by instructions that have an SP operand.
+bool AArch64DAGToDAGISel::SelectArithUXTXRegister(SDValue N, SDValue &Reg,
+ SDValue &Shift) {
+ unsigned ShiftVal = 0;
+ AArch64_AM::ShiftExtendType Ext;
+
+ if (N.getOpcode() != ISD::SHL)
+ return false;
+
+ ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!CSD)
+ return false;
+ ShiftVal = CSD->getZExtValue();
+ if (ShiftVal > 4)
+ return false;
+
+ Ext = AArch64_AM::UXTX;
+ Reg = N.getOperand(0);
+ Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), SDLoc(N),
+ MVT::i32);
+ return isWorthFolding(N);
+}
+
/// If there's a use of this ADDlow that's not itself a load/store then we'll
/// need to create a real ADD instruction from it anyway and there's no point in
/// folding it into the mem op. Theoretically, it shouldn't matter, but there's
@@ -4049,6 +4074,24 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case Intrinsic::swift_async_context_addr: {
+ SDLoc DL(Node);
+ SDValue Chain = Node->getOperand(0);
+ SDValue CopyFP = CurDAG->getCopyFromReg(Chain, DL, AArch64::FP, MVT::i64);
+ SDValue Res = SDValue(
+ CurDAG->getMachineNode(AArch64::SUBXri, DL, MVT::i64, CopyFP,
+ CurDAG->getTargetConstant(8, DL, MVT::i32),
+ CurDAG->getTargetConstant(0, DL, MVT::i32)),
+ 0);
+ ReplaceUses(SDValue(Node, 0), Res);
+ ReplaceUses(SDValue(Node, 1), CopyFP.getValue(1));
+ CurDAG->RemoveDeadNode(Node);
+
+ auto &MF = CurDAG->getMachineFunction();
+ MF.getFrameInfo().setFrameAddressIsTaken(true);
+ MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
+ return;
+ }
}
} break;
case ISD::INTRINSIC_WO_CHAIN: {
@@ -4094,18 +4137,6 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
if (tryMULLV64LaneV128(IntNo, Node))
return;
break;
- case Intrinsic::swift_async_context_addr: {
- SDLoc DL(Node);
- CurDAG->SelectNodeTo(Node, AArch64::SUBXri, MVT::i64,
- CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
- AArch64::FP, MVT::i64),
- CurDAG->getTargetConstant(8, DL, MVT::i32),
- CurDAG->getTargetConstant(0, DL, MVT::i32));
- auto &MF = CurDAG->getMachineFunction();
- MF.getFrameInfo().setFrameAddressIsTaken(true);
- MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
- return;
- }
}
break;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 447ad10ddf22..e070ce2efa6b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -521,6 +521,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
setOperationAction(ISD::CTPOP, MVT::i128, Custom);
+ setOperationAction(ISD::PARITY, MVT::i64, Custom);
+ setOperationAction(ISD::PARITY, MVT::i128, Custom);
+
setOperationAction(ISD::ABS, MVT::i32, Custom);
setOperationAction(ISD::ABS, MVT::i64, Custom);
@@ -5463,7 +5466,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::SRA_PARTS:
return LowerShiftParts(Op, DAG);
case ISD::CTPOP:
- return LowerCTPOP(Op, DAG);
+ case ISD::PARITY:
+ return LowerCTPOP_PARITY(Op, DAG);
case ISD::FCOPYSIGN:
return LowerFCOPYSIGN(Op, DAG);
case ISD::OR:
@@ -7783,7 +7787,8 @@ SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
return BitCast(VT, BSP, DAG);
}
-SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
+ SelectionDAG &DAG) const {
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat))
return SDValue();
@@ -7791,6 +7796,8 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget->hasNEON())
return SDValue();
+ bool IsParity = Op.getOpcode() == ISD::PARITY;
+
// While there is no integer popcount instruction, it can
// be more efficiently lowered to the following sequence that uses
// AdvSIMD registers/instructions as long as the copies to/from
@@ -7813,6 +7820,10 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
+ if (IsParity)
+ UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
+ DAG.getConstant(1, DL, MVT::i32));
+
if (VT == MVT::i64)
UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
return UaddLV;
@@ -7824,9 +7835,15 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
+ if (IsParity)
+ UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
+ DAG.getConstant(1, DL, MVT::i32));
+
return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
}
+ assert(!IsParity && "ISD::PARITY of vector types not supported");
+
if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
@@ -11811,6 +11828,12 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
isConcatMask(M, VT, VT.getSizeInBits() == 128));
}
+bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
+ EVT VT) const {
+ // Just delegate to the generic legality check; clear masks aren't special.
+ return isShuffleMaskLegal(M, VT);
+}
+
/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
@@ -11969,6 +11992,11 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
if (IsZero)
return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
+ case AArch64CC::LE:
+ if (!NoNans)
+ return SDValue();
+ // If we ignore NaNs then we can use the LS implementation.
+ LLVM_FALLTHROUGH;
case AArch64CC::LS:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
@@ -12073,7 +12101,7 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
bool ShouldInvert;
changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
- bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
+ bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
SDValue Cmp =
EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
if (!Cmp.getNode())
@@ -13587,21 +13615,50 @@ AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
bool
AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const {
- N = N->getOperand(0).getNode();
+ assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
+ N->getOpcode() == ISD::SRL) &&
+ "Expected shift op");
+
+ SDValue ShiftLHS = N->getOperand(0);
EVT VT = N->getValueType(0);
- // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
- // it with shift to let it be lowered to UBFX.
- if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
- isa<ConstantSDNode>(N->getOperand(1))) {
- uint64_t TruncMask = N->getConstantOperandVal(1);
+
+ // If ShiftLHS is an unsigned bit extraction ((x >> C) & mask), do not
+ // combine it with shift 'N' so that it can still be lowered to UBFX.
+ if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
+ isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
+ uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
if (isMask_64(TruncMask) &&
- N->getOperand(0).getOpcode() == ISD::SRL &&
- isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
+ ShiftLHS.getOperand(0).getOpcode() == ISD::SRL &&
+ isa<ConstantSDNode>(ShiftLHS.getOperand(0).getOperand(1)))
return false;
}
return true;
}
+bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
+ const SDNode *N) const {
+ assert(N->getOpcode() == ISD::XOR &&
+ (N->getOperand(0).getOpcode() == ISD::SHL ||
+ N->getOperand(0).getOpcode() == ISD::SRL) &&
+ "Expected XOR(SHIFT) pattern");
+
+ // Only commute if the entire NOT mask is a hidden shifted mask.
+ auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
+ if (XorC && ShiftC) {
+ unsigned MaskIdx, MaskLen;
+ if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
+ unsigned ShiftAmt = ShiftC->getZExtValue();
+ unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
+ if (N->getOperand(0).getOpcode() == ISD::SHL)
+ return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
+ return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
+ }
+ }
+
+ return false;
+}
+
bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
assert(((N->getOpcode() == ISD::SHL &&
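Note: a worked instance of the check above for i32 and (xor (shl x, 8), C). For C = 0xffffff00, isShiftedMask gives MaskIdx = 8 and MaskLen = 24; since 8 == ShiftAmt and 24 == 32 - 8, the hook returns true and the xor is commuted. For C = 0x0000ff00 (MaskIdx = 8, MaskLen = 8) the MaskLen test fails, so the bitfield-extraction-friendly form is left alone.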
@@ -19221,6 +19278,41 @@ static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
}
+static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+
+ SDValue Insert = N->getOperand(0);
+ if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
+ return SDValue();
+
+ if (!Insert.getOperand(0).isUndef())
+ return SDValue();
+
+ uint64_t IdxInsert = Insert.getConstantOperandVal(2);
+ uint64_t IdxDupLane = N->getConstantOperandVal(1);
+ if (IdxInsert != IdxDupLane)
+ return SDValue();
+
+ SDValue Bitcast = Insert.getOperand(1);
+ if (Bitcast.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ SDValue Subvec = Bitcast.getOperand(0);
+ EVT SubvecVT = Subvec.getValueType();
+ if (!SubvecVT.is128BitVector())
+ return SDValue();
+ EVT NewSubvecVT =
+ getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
+
+ SDLoc DL(N);
+ SDValue NewInsert =
+ DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
+ DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
+ SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
+ NewInsert, N->getOperand(1));
+ return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -19307,6 +19399,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performCSELCombine(N, DCI, DAG);
case AArch64ISD::DUP:
return performDUPCombine(N, DCI);
+ case AArch64ISD::DUPLANE128:
+ return performDupLane128Combine(N, DAG);
case AArch64ISD::NVCAST:
return performNVCASTCombine(N);
case AArch64ISD::SPLICE:
@@ -19981,7 +20075,8 @@ void AArch64TargetLowering::ReplaceNodeResults(
return;
case ISD::CTPOP:
- if (SDValue Result = LowerCTPOP(SDValue(N, 0), DAG))
+ case ISD::PARITY:
+ if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
Results.push_back(Result);
return;
case AArch64ISD::SADDV:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index e02b5e56fd2e..1ba2e2f315ec 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -549,6 +549,10 @@ public:
/// should be stack expanded.
bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
+ /// Similar to isShuffleMaskLegal. Return true if the given 'select with zero'
+ /// shuffle mask can be codegen'd directly.
+ bool isVectorClearMaskLegal(ArrayRef<int> M, EVT VT) const override;
+
/// Return the ISD::SETCC ValueType.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
@@ -653,6 +657,9 @@ public:
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
+ /// Return true if it is profitable to commute the XOR through the shift,
+ /// i.e. when the XOR constant is a shifted mask aligned with the shift.
+ bool isDesirableToCommuteXorWithShift(const SDNode *N) const override;
+
/// Return true if it is profitable to fold a pair of shifts into a mask.
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
CombineLevel Level) const override;
@@ -995,7 +1002,7 @@ private:
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerCTPOP_PARITY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBitreverse(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 02fa36a1df4b..e70d304f37b9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1168,6 +1168,8 @@ def gi_arith_extended_reg32to64_i64 :
GIComplexOperandMatcher<s64, "selectArithExtendedRegister">,
GIComplexPatternEquiv<arith_extended_reg32to64_i64>;
+def arith_uxtx : ComplexPattern<i64, 2, "SelectArithUXTXRegister", []>;
+
// Floating-point immediate.
def fpimm16XForm : SDNodeXForm<fpimm, [{
@@ -1234,6 +1236,10 @@ def fpimm0 : FPImmLeaf<fAny, [{
return Imm.isExactlyValue(+0.0);
}]>;
+def fpimm_minus0 : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(-0.0);
+}]>;
+
def fpimm_half : FPImmLeaf<fAny, [{
return Imm.isExactlyValue(+0.5);
}]>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d444223e4494..a7b7e5270888 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1691,6 +1691,11 @@ def : InstAlias<"mov $dst, $src",
defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn", "subs", "cmp">;
defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp", "adds", "cmn">;
+def copyFromSP: PatLeaf<(i64 GPR64:$src), [{
+ return N->getOpcode() == ISD::CopyFromReg &&
+ cast<RegisterSDNode>(N->getOperand(1))->getReg() == AArch64::SP;
+}]>;
+
// Use SUBS instead of SUB to enable CSE between SUBS and SUB.
def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm),
(SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>;
@@ -1709,6 +1714,8 @@ def : Pat<(sub GPR32sp:$R2, arith_extended_reg32_i32:$R3),
(SUBSWrx GPR32sp:$R2, arith_extended_reg32_i32:$R3)>;
def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64_i64:$R3),
(SUBSXrx GPR64sp:$R2, arith_extended_reg32to64_i64:$R3)>;
+def : Pat<(sub copyFromSP:$R2, (arith_uxtx GPR64:$R3, arith_extendlsl64:$imm)),
+ (SUBXrx64 GPR64sp:$R2, GPR64:$R3, arith_extendlsl64:$imm)>;
}
// Because of the immediate format for add/sub-imm instructions, the
@@ -5293,6 +5300,9 @@ def : Pat<(int_aarch64_neon_pmull64 (extractelt (v2i64 V128:$Rn), (i64 1)),
// CodeGen patterns for addhn and subhn instructions, which can actually be
// written in LLVM IR without too much difficulty.
+// Prioritize ADDHN and SUBHN over UZP2.
+let AddedComplexity = 10 in {
+
// ADDHN
def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
(ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
@@ -5343,6 +5353,8 @@ def : Pat<(concat_vectors (v2i32 V64:$Rd),
(SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
V128:$Rn, V128:$Rm)>;
+} // AddedComplexity = 10
+
//----------------------------------------------------------------------------
// AdvSIMD bitwise extract from vector instruction.
//----------------------------------------------------------------------------
@@ -5409,6 +5421,19 @@ def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 V128:$Vn))),
(v2i32 (trunc (v2i64 V128:$Vm))))),
(UZP1v4i32 V128:$Vn, V128:$Vm)>;
+def : Pat<(v16i8 (concat_vectors
+ (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))),
+ (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vm), (i32 8)))))),
+ (UZP2v16i8 V128:$Vn, V128:$Vm)>;
+def : Pat<(v8i16 (concat_vectors
+ (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vn), (i32 16)))),
+ (v4i16 (trunc (AArch64vlshr (v4i32 V128:$Vm), (i32 16)))))),
+ (UZP2v8i16 V128:$Vn, V128:$Vm)>;
+def : Pat<(v4i32 (concat_vectors
+ (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vn), (i32 32)))),
+ (v2i32 (trunc (AArch64vlshr (v2i64 V128:$Vm), (i32 32)))))),
+ (UZP2v4i32 V128:$Vn, V128:$Vm)>;
+
//----------------------------------------------------------------------------
// AdvSIMD TBL/TBX instructions
//----------------------------------------------------------------------------
diff --git a/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
index 6c8845ee8598..79866c9b0a05 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
+++ b/llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
@@ -22,8 +22,8 @@ static bool needReorderStoreMI(const MachineInstr *MI) {
return false;
case AArch64::STURQi:
case AArch64::STRQui:
- if (MI->getMF()->getSubtarget<AArch64Subtarget>().isStoreAddressAscend())
- return false;
+ if (!MI->getMF()->getSubtarget<AArch64Subtarget>().isStoreAddressAscend())
+ return false;
LLVM_FALLTHROUGH;
case AArch64::STPQi:
return AArch64InstrInfo::getLdStOffsetOp(*MI).isImm();
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index c66f9cfd9c22..4032c4667bc7 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -278,10 +278,18 @@ def AArch64scvtf_mt : SDNode<"AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU", SDT_AArch
def AArch64fcvtzu_mt : SDNode<"AArch64ISD::FCVTZU_MERGE_PASSTHRU", SDT_AArch64FCVT>;
def AArch64fcvtzs_mt : SDNode<"AArch64ISD::FCVTZS_MERGE_PASSTHRU", SDT_AArch64FCVT>;
-def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>;
-def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>;
-def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>;
-def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>;
+def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3,
+ [SDTCisVec<1>, SDTCVecEltisVT<1,i1>, SDTCisVec<3>, SDTCisSameNumEltsAs<1,3>]>;
+def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>;
+def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>;
+def AArch64fadda_p_node : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>;
+
+def AArch64fadda_p : PatFrags<(ops node:$op1, node:$op2, node:$op3),
+ [(AArch64fadda_p_node node:$op1, node:$op2, node:$op3),
+ (AArch64fadda_p_node (SVEAllActive), node:$op2,
+ (vselect node:$op1, node:$op3, (splat_vector (f32 fpimm_minus0)))),
+ (AArch64fadda_p_node (SVEAllActive), node:$op2,
+ (vselect node:$op1, node:$op3, (splat_vector (f64 fpimm_minus0))))]>;
def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
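Note: -0.0 is the identity of IEEE-754 addition (x + -0.0 == x for every x, including x == +0.0), so substituting a splat of -0.0 into the inactive lanes lets an all-active FADDA compute the same result as the predicated reduction; that is what the extra PatFrags alternatives above are matching.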
@@ -447,6 +455,16 @@ let Predicates = [HasSVEorSME] in {
defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", AArch64fabs_mt>;
defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", AArch64fneg_mt>;
+ // zext(cmpeq(x, splat(0))) -> cnot(x)
+ def : Pat<(nxv16i8 (zext (nxv16i1 (AArch64setcc_z (nxv16i1 (SVEAllActive):$Pg), nxv16i8:$Op2, (SVEDup0), SETEQ)))),
+ (CNOT_ZPmZ_B $Op2, $Pg, $Op2)>;
+ def : Pat<(nxv8i16 (zext (nxv8i1 (AArch64setcc_z (nxv8i1 (SVEAllActive):$Pg), nxv8i16:$Op2, (SVEDup0), SETEQ)))),
+ (CNOT_ZPmZ_H $Op2, $Pg, $Op2)>;
+ def : Pat<(nxv4i32 (zext (nxv4i1 (AArch64setcc_z (nxv4i1 (SVEAllActive):$Pg), nxv4i32:$Op2, (SVEDup0), SETEQ)))),
+ (CNOT_ZPmZ_S $Op2, $Pg, $Op2)>;
+ def : Pat<(nxv2i64 (zext (nxv2i1 (AArch64setcc_z (nxv2i1 (SVEAllActive):$Pg), nxv2i64:$Op2, (SVEDup0), SETEQ)))),
+ (CNOT_ZPmZ_D $Op2, $Pg, $Op2)>;
+
defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", "SMAX_ZPZZ", int_aarch64_sve_smax, DestructiveBinaryComm>;
defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", "UMAX_ZPZZ", int_aarch64_sve_umax, DestructiveBinaryComm>;
defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", "SMIN_ZPZZ", int_aarch64_sve_smin, DestructiveBinaryComm>;
@@ -857,6 +875,16 @@ let Predicates = [HasSVEorSME] in {
defm LD1RQ_W : sve_mem_ldqr_ss<0b10, "ld1rqw", Z_s, ZPR32, GPR64NoXZRshifted32>;
defm LD1RQ_D : sve_mem_ldqr_ss<0b11, "ld1rqd", Z_d, ZPR64, GPR64NoXZRshifted64>;
+ let AddedComplexity = 1 in {
+ class LD1RQPat<ValueType vt1, ValueType vt2, SDPatternOperator op, Instruction load_instr, Instruction ptrue> :
+ Pat<(vt1 (op (vt1 (vector_insert_subvec (vt1 undef), (vt2 (load GPR64sp:$Xn)), (i64 0))), (i64 0))),
+ (load_instr (ptrue 31), GPR64sp:$Xn, 0)>;
+ }
+ def : LD1RQPat<nxv16i8, v16i8, AArch64duplane128, LD1RQ_B_IMM, PTRUE_B>;
+ def : LD1RQPat<nxv8i16, v8i16, AArch64duplane128, LD1RQ_H_IMM, PTRUE_H>;
+ def : LD1RQPat<nxv4i32, v4i32, AArch64duplane128, LD1RQ_W_IMM, PTRUE_S>;
+ def : LD1RQPat<nxv2i64, v2i64, AArch64duplane128, LD1RQ_D_IMM, PTRUE_D>;
+
// continuous load with reg+reg addressing.
defm LD1B : sve_mem_cld_ss<0b0000, "ld1b", Z_b, ZPR8, GPR64NoXZRshifted8>;
defm LD1B_H : sve_mem_cld_ss<0b0001, "ld1b", Z_h, ZPR16, GPR64NoXZRshifted8>;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 3f9795f5198b..47e4c6589c26 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -128,7 +128,7 @@ static cl::opt<bool>
static cl::opt<bool>
EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden,
cl::desc("Enable optimizations on complex GEPs"),
- cl::init(false));
+ cl::init(true));
static cl::opt<bool>
BranchRelaxation("aarch64-enable-branch-relax", cl::Hidden, cl::init(true),
@@ -563,17 +563,6 @@ void AArch64PassConfig::addIRPasses() {
addPass(createFalkorMarkStridedAccessesPass());
}
- TargetPassConfig::addIRPasses();
-
- addPass(createAArch64StackTaggingPass(
- /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
-
- // Match interleaved memory accesses to ldN/stN intrinsics.
- if (TM->getOptLevel() != CodeGenOpt::None) {
- addPass(createInterleavedLoadCombinePass());
- addPass(createInterleavedAccessPass());
- }
-
if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
// Call SeparateConstOffsetFromGEP pass to extract constants within indices
// and lower a GEP with multiple indices to either arithmetic operations or
@@ -587,6 +576,17 @@ void AArch64PassConfig::addIRPasses() {
addPass(createLICMPass());
}
+ TargetPassConfig::addIRPasses();
+
+ addPass(createAArch64StackTaggingPass(
+ /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
+
+ // Match interleaved memory accesses to ldN/stN intrinsics.
+ if (TM->getOptLevel() != CodeGenOpt::None) {
+ addPass(createInterleavedLoadCombinePass());
+ addPass(createInterleavedAccessPass());
+ }
+
// Add Control Flow Guard checks.
if (TM->getTargetTriple().isOSWindows())
addPass(createCFGuardCheckPass());
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 274a025e82a0..66617393c9ae 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -37,6 +38,74 @@ static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
cl::init(10), cl::Hidden);
+class TailFoldingKind {
+private:
+ uint8_t Bits = 0; // Currently defaults to disabled.
+
+public:
+ enum TailFoldingOpts {
+ TFDisabled = 0x0,
+ TFReductions = 0x01,
+ TFRecurrences = 0x02,
+ TFSimple = 0x80,
+ TFAll = TFReductions | TFRecurrences | TFSimple
+ };
+
+ void operator=(const std::string &Val) {
+ if (Val.empty())
+ return;
+ SmallVector<StringRef, 6> TailFoldTypes;
+ StringRef(Val).split(TailFoldTypes, '+', -1, false);
+ for (auto TailFoldType : TailFoldTypes) {
+ if (TailFoldType == "disabled")
+ Bits = 0;
+ else if (TailFoldType == "all")
+ Bits = TFAll;
+ else if (TailFoldType == "default")
+ Bits = 0; // Currently defaults to never tail-folding.
+ else if (TailFoldType == "simple")
+ add(TFSimple);
+ else if (TailFoldType == "reductions")
+ add(TFReductions);
+ else if (TailFoldType == "recurrences")
+ add(TFRecurrences);
+ else if (TailFoldType == "noreductions")
+ remove(TFReductions);
+ else if (TailFoldType == "norecurrences")
+ remove(TFRecurrences);
+ else {
+ errs()
+ << "invalid argument " << TailFoldType.str()
+ << " to -sve-tail-folding=; each element must be one of: disabled, "
+ "all, default, simple, reductions, noreductions, recurrences, "
+ "norecurrences\n";
+ }
+ }
+ }
+
+ operator uint8_t() const { return Bits; }
+
+ void add(uint8_t Flag) { Bits |= Flag; }
+ void remove(uint8_t Flag) { Bits &= ~Flag; }
+};
+
+TailFoldingKind TailFoldingKindLoc;
+
+cl::opt<TailFoldingKind, true, cl::parser<std::string>> SVETailFolding(
+ "sve-tail-folding",
+ cl::desc(
+ "Control the use of vectorisation using tail-folding for SVE:"
+ "\ndisabled No loop types will vectorize using tail-folding"
+ "\ndefault Uses the default tail-folding settings for the target "
+ "CPU"
+ "\nall All legal loop types will vectorize using tail-folding"
+ "\nsimple Use tail-folding for simple loops (not reductions or "
+ "recurrences)"
+ "\nreductions Use tail-folding for loops containing reductions"
+ "\nrecurrences Use tail-folding for loops containing first order "
+ "recurrences"),
+ cl::location(TailFoldingKindLoc));
+
bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -2955,3 +3024,20 @@ InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
return BaseT::getShuffleCost(Kind, Tp, Mask, Index, SubTp);
}
+
+bool AArch64TTIImpl::preferPredicateOverEpilogue(
+ Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+ TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
+ if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
+ return false;
+
+ TailFoldingKind Required; // Defaults to 0.
+ if (LVL->getReductionVars().size())
+ Required.add(TailFoldingKind::TFReductions);
+ if (LVL->getFirstOrderRecurrences().size())
+ Required.add(TailFoldingKind::TFRecurrences);
+ if (!Required)
+ Required.add(TailFoldingKind::TFSimple);
+
+ return (TailFoldingKindLoc & Required) == Required;
+}
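The preferPredicateOverEpilogue hook added above reduces to a subset test on a small bitmask: the bits a loop requires (reductions, recurrences, or "simple" when it has neither) must all be enabled by the -sve-tail-folding setting. A minimal standalone sketch of that decision, assuming only the flag semantics shown in the patch (illustrative code, not part of the LLVM sources):

    // Sketch of the -sve-tail-folding subset check (assumption: standalone
    // illustration, not the LLVM implementation).
    #include <cstdint>
    #include <cstdio>

    enum TailFoldingOpts : uint8_t {
      TFDisabled    = 0x00,
      TFReductions  = 0x01,
      TFRecurrences = 0x02,
      TFSimple      = 0x80,
      TFAll         = TFReductions | TFRecurrences | TFSimple
    };

    // Returns true when every bit the loop requires is enabled by the user.
    static bool preferPredication(uint8_t Enabled, bool HasReductions,
                                  bool HasRecurrences) {
      if (Enabled == TFDisabled)
        return false;
      uint8_t Required = 0;
      if (HasReductions)
        Required |= TFReductions;
      if (HasRecurrences)
        Required |= TFRecurrences;
      if (!Required) // A "simple" loop: neither reductions nor recurrences.
        Required = TFSimple;
      return (Enabled & Required) == Required;
    }

    int main() {
      // With "reductions" enabled: a reduction loop folds, a plain loop does not.
      std::printf("%d %d\n", preferPredication(TFReductions, true, false),
                  preferPredication(TFReductions, false, false));
      return 0;
    }

With -sve-tail-folding=reductions, a reduction loop prefers predication while a plain loop falls back to an epilogue, which matches the subset check performed by the new hook.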
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 59ec91843266..2231f8705998 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -340,6 +340,11 @@ public:
return PredicationStyle::None;
}
+ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+ AssumptionCache &AC, TargetLibraryInfo *TLI,
+ DominatorTree *DT,
+ LoopVectorizationLegality *LVL);
+
bool supportsScalableVectors() const { return ST->hasSVE(); }
bool enableScalableVectorization() const { return ST->hasSVE(); }
@@ -347,6 +352,11 @@ public:
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc,
ElementCount VF) const;
+ bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ return ST->hasSVE();
+ }
+
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
Optional<FastMathFlags> FMF,
TTI::TargetCostKind CostKind);
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index f129bfe11e4d..3fe3b2a69855 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -231,7 +231,70 @@ void AArch64_MC::initLLVMToCVRegMapping(MCRegisterInfo *MRI) {
{codeview::RegisterId::ARM64_Q29, AArch64::Q29},
{codeview::RegisterId::ARM64_Q30, AArch64::Q30},
{codeview::RegisterId::ARM64_Q31, AArch64::Q31},
-
+ {codeview::RegisterId::ARM64_B0, AArch64::B0},
+ {codeview::RegisterId::ARM64_B1, AArch64::B1},
+ {codeview::RegisterId::ARM64_B2, AArch64::B2},
+ {codeview::RegisterId::ARM64_B3, AArch64::B3},
+ {codeview::RegisterId::ARM64_B4, AArch64::B4},
+ {codeview::RegisterId::ARM64_B5, AArch64::B5},
+ {codeview::RegisterId::ARM64_B6, AArch64::B6},
+ {codeview::RegisterId::ARM64_B7, AArch64::B7},
+ {codeview::RegisterId::ARM64_B8, AArch64::B8},
+ {codeview::RegisterId::ARM64_B9, AArch64::B9},
+ {codeview::RegisterId::ARM64_B10, AArch64::B10},
+ {codeview::RegisterId::ARM64_B11, AArch64::B11},
+ {codeview::RegisterId::ARM64_B12, AArch64::B12},
+ {codeview::RegisterId::ARM64_B13, AArch64::B13},
+ {codeview::RegisterId::ARM64_B14, AArch64::B14},
+ {codeview::RegisterId::ARM64_B15, AArch64::B15},
+ {codeview::RegisterId::ARM64_B16, AArch64::B16},
+ {codeview::RegisterId::ARM64_B17, AArch64::B17},
+ {codeview::RegisterId::ARM64_B18, AArch64::B18},
+ {codeview::RegisterId::ARM64_B19, AArch64::B19},
+ {codeview::RegisterId::ARM64_B20, AArch64::B20},
+ {codeview::RegisterId::ARM64_B21, AArch64::B21},
+ {codeview::RegisterId::ARM64_B22, AArch64::B22},
+ {codeview::RegisterId::ARM64_B23, AArch64::B23},
+ {codeview::RegisterId::ARM64_B24, AArch64::B24},
+ {codeview::RegisterId::ARM64_B25, AArch64::B25},
+ {codeview::RegisterId::ARM64_B26, AArch64::B26},
+ {codeview::RegisterId::ARM64_B27, AArch64::B27},
+ {codeview::RegisterId::ARM64_B28, AArch64::B28},
+ {codeview::RegisterId::ARM64_B29, AArch64::B29},
+ {codeview::RegisterId::ARM64_B30, AArch64::B30},
+ {codeview::RegisterId::ARM64_B31, AArch64::B31},
+ {codeview::RegisterId::ARM64_H0, AArch64::H0},
+ {codeview::RegisterId::ARM64_H1, AArch64::H1},
+ {codeview::RegisterId::ARM64_H2, AArch64::H2},
+ {codeview::RegisterId::ARM64_H3, AArch64::H3},
+ {codeview::RegisterId::ARM64_H4, AArch64::H4},
+ {codeview::RegisterId::ARM64_H5, AArch64::H5},
+ {codeview::RegisterId::ARM64_H6, AArch64::H6},
+ {codeview::RegisterId::ARM64_H7, AArch64::H7},
+ {codeview::RegisterId::ARM64_H8, AArch64::H8},
+ {codeview::RegisterId::ARM64_H9, AArch64::H9},
+ {codeview::RegisterId::ARM64_H10, AArch64::H10},
+ {codeview::RegisterId::ARM64_H11, AArch64::H11},
+ {codeview::RegisterId::ARM64_H12, AArch64::H12},
+ {codeview::RegisterId::ARM64_H13, AArch64::H13},
+ {codeview::RegisterId::ARM64_H14, AArch64::H14},
+ {codeview::RegisterId::ARM64_H15, AArch64::H15},
+ {codeview::RegisterId::ARM64_H16, AArch64::H16},
+ {codeview::RegisterId::ARM64_H17, AArch64::H17},
+ {codeview::RegisterId::ARM64_H18, AArch64::H18},
+ {codeview::RegisterId::ARM64_H19, AArch64::H19},
+ {codeview::RegisterId::ARM64_H20, AArch64::H20},
+ {codeview::RegisterId::ARM64_H21, AArch64::H21},
+ {codeview::RegisterId::ARM64_H22, AArch64::H22},
+ {codeview::RegisterId::ARM64_H23, AArch64::H23},
+ {codeview::RegisterId::ARM64_H24, AArch64::H24},
+ {codeview::RegisterId::ARM64_H25, AArch64::H25},
+ {codeview::RegisterId::ARM64_H26, AArch64::H26},
+ {codeview::RegisterId::ARM64_H27, AArch64::H27},
+ {codeview::RegisterId::ARM64_H28, AArch64::H28},
+ {codeview::RegisterId::ARM64_H29, AArch64::H29},
+ {codeview::RegisterId::ARM64_H30, AArch64::H30},
+ {codeview::RegisterId::ARM64_H31, AArch64::H31},
};
for (const auto &I : RegMap)
MRI->mapLLVMRegToCVReg(I.Reg, static_cast<int>(I.CVReg));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 48b5814cd482..2d6f1438e315 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -585,6 +585,12 @@ def FeatureMAIInsts : SubtargetFeature<"mai-insts",
"Has mAI instructions"
>;
+def FeatureFP8Insts : SubtargetFeature<"fp8-insts",
+ "HasFP8Insts",
+ "true",
+ "Has fp8 and bf8 instructions"
+>;
+
def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
"HasPkFmacF16Inst",
"true",
@@ -1124,6 +1130,7 @@ def FeatureISAVersion9_4_0 : FeatureSet<
Feature64BitDPP,
FeaturePackedFP32Ops,
FeatureMAIInsts,
+ FeatureFP8Insts,
FeaturePkFmacF16Inst,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
@@ -1265,11 +1272,14 @@ def FeatureISAVersion11_Common : FeatureSet<
FeaturePackedTID,
FeatureVcmpxPermlaneHazard]>;
-// Features for GFX 11.0.0 and 11.0.1
-def FeatureISAVersion11_0 : FeatureSet<
+def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureUserSGPRInit16Bug])>;
+def FeatureISAVersion11_0_1 : FeatureSet<
+ !listconcat(FeatureISAVersion11_Common.Features,
+ [])>;
+
def FeatureISAVersion11_0_2 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[FeatureUserSGPRInit16Bug])>;
@@ -1704,6 +1714,9 @@ def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
def HasShaderCyclesRegister : Predicate<"Subtarget->hasShaderCyclesRegister()">,
AssemblerPredicate<(all_of FeatureShaderCyclesRegister)>;
+def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
+ AssemblerPredicate<(all_of FeatureFP8Insts)>;
+
def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index d28f38e42430..d361e33995cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -74,6 +74,7 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
<< " WorkGroupIDY: " << FI.second.WorkGroupIDY
<< " WorkGroupIDZ: " << FI.second.WorkGroupIDZ
<< " WorkGroupInfo: " << FI.second.WorkGroupInfo
+ << " LDSKernelId: " << FI.second.LDSKernelId
<< " PrivateSegmentWaveByteOffset: "
<< FI.second.PrivateSegmentWaveByteOffset
<< " ImplicitBufferPtr: " << FI.second.ImplicitBufferPtr
@@ -107,6 +108,9 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
return std::make_tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr,
&AMDGPU::SGPR_32RegClass, LLT::scalar(32));
+ case AMDGPUFunctionArgInfo::LDS_KERNEL_ID:
+ return std::make_tuple(LDSKernelId ? &LDSKernelId : nullptr,
+ &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
return std::make_tuple(
PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr,
@@ -162,6 +166,7 @@ constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
AI.WorkGroupIDX = ArgDescriptor::createRegister(AMDGPU::SGPR12);
AI.WorkGroupIDY = ArgDescriptor::createRegister(AMDGPU::SGPR13);
AI.WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::SGPR14);
+ AI.LDSKernelId = ArgDescriptor::createRegister(AMDGPU::SGPR15);
const unsigned Mask = 0x3ff;
AI.WorkItemIDX = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index e9ed45d8cd14..f595e469f998 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -103,6 +103,7 @@ struct AMDGPUFunctionArgInfo {
KERNARG_SEGMENT_PTR = 3,
DISPATCH_ID = 4,
FLAT_SCRATCH_INIT = 5,
+ LDS_KERNEL_ID = 6, // LLVM internal, not part of the ABI
WORKGROUP_ID_X = 10,
WORKGROUP_ID_Y = 11,
WORKGROUP_ID_Z = 12,
@@ -128,6 +129,7 @@ struct AMDGPUFunctionArgInfo {
ArgDescriptor DispatchID;
ArgDescriptor FlatScratchInit;
ArgDescriptor PrivateSegmentSize;
+ ArgDescriptor LDSKernelId;
// System SGPRs in kernels.
ArgDescriptor WorkGroupIDX;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 57a4660bc1eb..13a65f1ad601 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -27,8 +27,10 @@
#include "SIMachineFunctionInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
@@ -415,6 +417,10 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32;
}
+ if (CurrentProgramInfo.DynamicCallStack) {
+ KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
+ }
+
return KernelCodeProperties;
}
@@ -506,6 +512,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
emitFunctionBody();
+ emitResourceUsageRemarks(MF, CurrentProgramInfo, MFI->isModuleEntryFunction(),
+ STM.hasMAIInsts());
+
if (isVerbose()) {
MCSectionELF *CommentSection =
Context.getELFSection(".AMDGPU.csdata", ELF::SHT_PROGBITS, 0);
@@ -875,6 +884,9 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
LDSAlignShift = 9;
}
+ ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
+ ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
+
ProgInfo.LDSSize = MFI->getLDSSize();
ProgInfo.LDSBlocks =
alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
@@ -1180,3 +1192,58 @@ void AMDGPUAsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<AMDGPUResourceUsageAnalysis>();
AsmPrinter::getAnalysisUsage(AU);
}
+
+void AMDGPUAsmPrinter::emitResourceUsageRemarks(
+ const MachineFunction &MF, const SIProgramInfo &CurrentProgramInfo,
+ bool isModuleEntryFunction, bool hasMAIInsts) {
+ if (!ORE)
+ return;
+
+ const char *Name = "kernel-resource-usage";
+ const char *Indent = " ";
+
+ // If the remark is not specifically enabled, do not output to YAML.
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ if (!Ctx.getDiagHandlerPtr()->isAnalysisRemarkEnabled(Name))
+ return;
+
+ auto EmitResourceUsageRemark = [&](StringRef RemarkName,
+ StringRef RemarkLabel, auto Argument) {
+ // Add an indent for every line besides the line with the kernel name. This
+ // makes it easier to tell which resource usage goes with which kernel since
+ // the kernel name will always be displayed first.
+ std::string LabelStr = RemarkLabel.str() + ": ";
+ if (!RemarkName.equals("FunctionName"))
+ LabelStr = Indent + LabelStr;
+
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(Name, RemarkName,
+ MF.getFunction().getSubprogram(),
+ &MF.front())
+ << LabelStr << ore::NV(RemarkName, Argument);
+ });
+ };
+
+ // FIXME: Formatting here is pretty nasty because clang does not accept
+ // newlines from diagnostics. This forces us to emit multiple diagnostic
+ // remarks to simulate newlines. If and when clang does accept newlines, this
+ // formatting should be aggregated into one remark with newlines to avoid
+ // printing multiple diagnostic locations and diag opts.
+ EmitResourceUsageRemark("FunctionName", "Function Name",
+ MF.getFunction().getName());
+ EmitResourceUsageRemark("NumSGPR", "SGPRs", CurrentProgramInfo.NumSGPR);
+ EmitResourceUsageRemark("NumVGPR", "VGPRs", CurrentProgramInfo.NumArchVGPR);
+ if (hasMAIInsts)
+ EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
+ EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
+ CurrentProgramInfo.ScratchSize);
+ EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
+ CurrentProgramInfo.Occupancy);
+ EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
+ CurrentProgramInfo.SGPRSpill);
+ EmitResourceUsageRemark("VGPRSpill", "VGPRs Spill",
+ CurrentProgramInfo.VGPRSpill);
+ if (isModuleEntryFunction)
+ EmitResourceUsageRemark("BytesLDS", "LDS Size [bytes/block]",
+ CurrentProgramInfo.LDSSize);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index ddda2cf107b1..2881b8d7bcca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -69,6 +69,9 @@ private:
uint64_t ScratchSize,
uint64_t CodeSize,
const AMDGPUMachineFunction* MFI);
+ void emitResourceUsageRemarks(const MachineFunction &MF,
+ const SIProgramInfo &CurrentProgramInfo,
+ bool isModuleEntryFunction, bool hasMAIInsts);
uint16_t getAmdhsaKernelCodeProperties(
const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
index 0a2cf3874245..c7a060c5db5b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
@@ -27,5 +27,6 @@ AMDGPU_ATTRIBUTE(WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z")
AMDGPU_ATTRIBUTE(WORKITEM_ID_X, "amdgpu-no-workitem-id-x")
AMDGPU_ATTRIBUTE(WORKITEM_ID_Y, "amdgpu-no-workitem-id-y")
AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z")
+AMDGPU_ATTRIBUTE(LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id")
#undef AMDGPU_ATTRIBUTE
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 8de0d7e6bff1..a3634d2440c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -72,6 +72,8 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
case Intrinsic::amdgcn_workgroup_id_z:
case Intrinsic::r600_read_tgid_z:
return WORKGROUP_ID_Z;
+ case Intrinsic::amdgcn_lds_kernel_id:
+ return LDS_KERNEL_ID;
case Intrinsic::amdgcn_dispatch_ptr:
return DISPATCH_PTR;
case Intrinsic::amdgcn_dispatch_id:
@@ -457,6 +459,10 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
removeAssumedBits(QUEUE_PTR);
}
+ if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
+ removeAssumedBits(LDS_KERNEL_ID);
+ }
+
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
: ChangeStatus::UNCHANGED;
}
@@ -591,6 +597,16 @@ private:
return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
UsedAssumedInformation);
}
+
+ bool funcRetrievesLDSKernelId(Attributor &A) {
+ auto DoesNotRetrieve = [&](Instruction &I) {
+ auto &Call = cast<CallBase>(I);
+ return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
+ };
+ bool UsedAssumedInformation = false;
+ return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
+ UsedAssumedInformation);
+ }
};
AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
@@ -743,7 +759,8 @@ public:
AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
DenseSet<const char *> Allowed(
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
- &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID, &AAPointerInfo::ID});
+ &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID,
+ &AAPointerInfo::ID});
AttributorConfig AC(CGUpdater);
AC.Allowed = &Allowed;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index fd812eb676ef..4550cfdcf883 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -764,7 +764,8 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
AMDGPUFunctionArgInfo::DISPATCH_ID,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
+ AMDGPUFunctionArgInfo::LDS_KERNEL_ID,
};
static constexpr StringLiteral ImplicitAttrNames[] = {
@@ -774,7 +775,8 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
"amdgpu-no-dispatch-id",
"amdgpu-no-workgroup-id-x",
"amdgpu-no-workgroup-id-y",
- "amdgpu-no-workgroup-id-z"
+ "amdgpu-no-workgroup-id-z",
+ "amdgpu-no-lds-kernel-id",
};
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -810,6 +812,14 @@ bool AMDGPUCallLowering::passSpecialInputs(MachineIRBuilder &MIRBuilder,
LI->loadInputValue(InputReg, MIRBuilder, IncomingArg, ArgRC, ArgTy);
} else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
LI->getImplicitArgPtr(InputReg, MRI, MIRBuilder);
+ } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
+ Optional<uint32_t> Id =
+ AMDGPUMachineFunction::getLDSKernelIdMetadata(MF.getFunction());
+ if (Id.has_value()) {
+ MIRBuilder.buildConstant(InputReg, Id.value());
+ } else {
+ MIRBuilder.buildUndef(InputReg);
+ }
} else {
// We may have proven the input wasn't needed, although the ABI is
// requiring it. We just need to allocate the register appropriately.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 5747fc0ca8e6..229dfb62ef6e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -88,6 +88,10 @@ def gi_smrd_sgpr :
GIComplexOperandMatcher<s64, "selectSmrdSgpr">,
GIComplexPatternEquiv<SMRDSgpr>;
+def gi_smrd_sgpr_imm :
+ GIComplexOperandMatcher<s64, "selectSmrdSgprImm">,
+ GIComplexPatternEquiv<SMRDSgprImm>;
+
def gi_flat_offset :
GIComplexOperandMatcher<s64, "selectFlatOffset">,
GIComplexPatternEquiv<FlatOffset>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 6fa44ffcbfaa..632a76b32009 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -875,6 +875,8 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
Kern.getDocument()->getNode(ProgramInfo.LDSSize);
Kern[".private_segment_fixed_size"] =
Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
+ Kern[".uses_dynamic_stack"] =
+ Kern.getDocument()->getNode(ProgramInfo.DynamicCallStack);
// FIXME: The metadata treats the minimum as 16?
Kern[".kernarg_segment_align"] =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 589992c7a7ec..147c8850587e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -33,7 +33,7 @@
#include "llvm/IR/Dominators.h"
#endif
-#define DEBUG_TYPE "isel"
+#define DEBUG_TYPE "amdgpu-isel"
using namespace llvm;
@@ -1886,21 +1886,21 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
// Match an immediate (if Imm is true) or an SGPR (if Imm is false)
// offset. If Imm32Only is true, match only 32-bit immediate offsets
// available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
- SDValue &Offset, bool Imm,
+bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue Addr, SDValue ByteOffsetNode,
+ SDValue *SOffset, SDValue *Offset,
bool Imm32Only) const {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
if (!C) {
- if (Imm)
+ if (!SOffset)
return false;
if (ByteOffsetNode.getValueType().isScalarInteger() &&
ByteOffsetNode.getValueType().getSizeInBits() == 32) {
- Offset = ByteOffsetNode;
+ *SOffset = ByteOffsetNode;
return true;
}
if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
- Offset = ByteOffsetNode.getOperand(0);
+ *SOffset = ByteOffsetNode.getOperand(0);
return true;
}
}
@@ -1912,8 +1912,8 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
int64_t ByteOffset = C->getSExtValue();
Optional<int64_t> EncodedOffset =
AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
- if (EncodedOffset && Imm && !Imm32Only) {
- Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
+ if (EncodedOffset && Offset && !Imm32Only) {
+ *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
return true;
}
@@ -1922,17 +1922,17 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
return false;
EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
- if (EncodedOffset && Imm32Only) {
- Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
+ if (EncodedOffset && Offset && Imm32Only) {
+ *Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
return true;
}
if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
return false;
- if (!Imm) {
+ if (SOffset) {
SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
- Offset = SDValue(
+ *SOffset = SDValue(
CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
return true;
}
@@ -1968,11 +1968,18 @@ SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
// Match a base and an immediate (if Imm is true) or an SGPR
// (if Imm is false) offset. If Imm32Only is true, match only 32-bit
// immediate offsets available on CI.
-bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
- SDValue &Offset, bool Imm,
- bool Imm32Only) const {
+bool AMDGPUDAGToDAGISel::SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase,
+ SDValue *SOffset, SDValue *Offset,
+ bool Imm32Only) const {
SDLoc SL(Addr);
+ if (SOffset && Offset) {
+ assert(!Imm32Only);
+ SDValue B;
+ return SelectSMRDBaseOffset(Addr, B, nullptr, Offset) &&
+ SelectSMRDBaseOffset(B, SBase, SOffset, nullptr);
+ }
+
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
// wraparound, because s_load instructions perform the addition in 64 bits.
if ((Addr.getValueType() != MVT::i32 ||
@@ -1987,34 +1994,55 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
assert(N0 && N1 && isa<ConstantSDNode>(N1));
}
if (N0 && N1) {
- if (SelectSMRDOffset(N1, Offset, Imm, Imm32Only)) {
- SBase = Expand32BitAddress(N0);
+ if (SelectSMRDOffset(N0, N1, SOffset, Offset, Imm32Only)) {
+ SBase = N0;
+ return true;
+ }
+ if (SelectSMRDOffset(N1, N0, SOffset, Offset, Imm32Only)) {
+ SBase = N1;
return true;
}
}
return false;
}
- if (!Imm)
+ if (Offset && !SOffset) {
+ SBase = Addr;
+ *Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
+ SDValue *SOffset, SDValue *Offset,
+ bool Imm32Only) const {
+ if (!SelectSMRDBaseOffset(Addr, SBase, SOffset, Offset, Imm32Only))
return false;
- SBase = Expand32BitAddress(Addr);
- Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
+ SBase = Expand32BitAddress(SBase);
return true;
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- return SelectSMRD(Addr, SBase, Offset, /* Imm */ true);
+ return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset);
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- return SelectSMRD(Addr, SBase, Offset, /* Imm */ true, /* Imm32Only */ true);
+ return SelectSMRD(Addr, SBase, /* SOffset */ nullptr, &Offset,
+ /* Imm32Only */ true);
}
bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
- SDValue &Offset) const {
- return SelectSMRD(Addr, SBase, Offset, /* Imm */ false);
+ SDValue &SOffset) const {
+ return SelectSMRD(Addr, SBase, &SOffset, /* Offset */ nullptr);
+}
+
+bool AMDGPUDAGToDAGISel::SelectSMRDSgprImm(SDValue Addr, SDValue &SBase,
+ SDValue &SOffset,
+ SDValue &Offset) const {
+ return SelectSMRD(Addr, SBase, &SOffset, &Offset);
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 7894b8eb5b67..fda2bfac71fc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -193,14 +193,18 @@ private:
bool SelectScratchSVAddr(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &SAddr, SDValue &Offset) const;
- bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool Imm,
- bool Imm32Only) const;
+ bool SelectSMRDOffset(SDValue Base, SDValue ByteOffsetNode, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only = false) const;
SDValue Expand32BitAddress(SDValue Addr) const;
- bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool Imm,
- bool Imm32Only = false) const;
+ bool SelectSMRDBaseOffset(SDValue Addr, SDValue &SBase, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only = false) const;
+ bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue *SOffset,
+ SDValue *Offset, bool Imm32Only = false) const;
bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
- bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
+ bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &SOffset) const;
+ bool SelectSMRDSgprImm(SDValue Addr, SDValue &SBase, SDValue &SOffset,
+ SDValue &Offset) const;
bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 70fae9d784a2..f2e5c2fe00e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1006,6 +1006,14 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
return selectSMFMACIntrin(I);
default:
return selectImpl(I, *CoverageInfo);
@@ -2361,7 +2369,7 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
return;
- GEPInfo GEPInfo(*PtrMI);
+ GEPInfo GEPInfo;
for (unsigned i = 1; i != 3; ++i) {
const MachineOperand &GEPOp = PtrMI->getOperand(i);
@@ -3237,6 +3245,8 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
return Register();
+ assert(Def->getNumOperands() == 3 &&
+ MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
return Def->getOperand(1).getReg();
}
@@ -3354,6 +3364,30 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
break;
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+ Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+ Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+ Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+ Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
+ break;
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
+ Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
+ break;
default:
llvm_unreachable("unhandled smfmac intrinsic");
}
@@ -3800,25 +3834,82 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
}};
}
-InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
+bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
+ Register &Base,
+ Register *SOffset,
+ int64_t *Offset) const {
+ MachineInstr *MI = Root.getParent();
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
+ // then we can select all ptr + 32-bit offsets.
SmallVector<GEPInfo, 4> AddrInfo;
- getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
+ getAddrModeInfo(*MI, *MRI, AddrInfo);
- if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
- return None;
+ if (AddrInfo.empty())
+ return false;
- const GEPInfo &GEPInfo = AddrInfo[0];
+ const GEPInfo &GEPI = AddrInfo[0];
Optional<int64_t> EncodedImm =
- AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
- if (!EncodedImm)
+ AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
+
+ if (SOffset && Offset) {
+ if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
+ AddrInfo.size() > 1) {
+ const GEPInfo &GEPI2 = AddrInfo[1];
+ if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
+ if (Register OffsetReg =
+ matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
+ Base = GEPI2.SgprParts[0];
+ *SOffset = OffsetReg;
+ *Offset = *EncodedImm;
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
+ Base = GEPI.SgprParts[0];
+ *Offset = *EncodedImm;
+ return true;
+ }
+
+ // SGPR offset is unsigned.
+ if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
+ GEPI.Imm != 0) {
+ // If we make it this far we have a load with a 32-bit immediate offset.
+ // It is OK to select this using a sgpr offset, because we have already
+ // failed trying to select this load into one of the _IMM variants since
+ // the _IMM Patterns are considered before the _SGPR patterns.
+ Base = GEPI.SgprParts[0];
+ *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
+ .addImm(GEPI.Imm);
+ return true;
+ }
+
+ if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
+ if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
+ Base = GEPI.SgprParts[0];
+ *SOffset = OffsetReg;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
+ Register Base;
+ int64_t Offset;
+ if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
return None;
- unsigned PtrReg = GEPInfo.SgprParts[0];
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
- }};
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
}
InstructionSelector::ComplexRendererFns
@@ -3844,43 +3935,24 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
- MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
-
- SmallVector<GEPInfo, 4> AddrInfo;
- getAddrModeInfo(*MI, *MRI, AddrInfo);
-
- // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
- // then we can select all ptr + 32-bit offsets.
- if (AddrInfo.empty())
+ Register Base, SOffset;
+ if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
return None;
- const GEPInfo &GEPInfo = AddrInfo[0];
- Register PtrReg = GEPInfo.SgprParts[0];
-
- // SGPR offset is unsigned.
- if (AddrInfo[0].SgprParts.size() == 1 && isUInt<32>(GEPInfo.Imm) &&
- GEPInfo.Imm != 0) {
- // If we make it this far we have a load with an 32-bit immediate offset.
- // It is OK to select this using a sgpr offset, because we have already
- // failed trying to select this load into one of the _IMM variants since
- // the _IMM Patterns are considered before the _SGPR patterns.
- Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
- .addImm(GEPInfo.Imm);
- return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}};
- }
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
+}
- if (AddrInfo[0].SgprParts.size() == 2 && GEPInfo.Imm == 0) {
- if (Register OffsetReg =
- matchZeroExtendFromS32(*MRI, GEPInfo.SgprParts[1])) {
- return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }}};
- }
- }
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
+ Register Base, SOffset;
+ int64_t Offset;
+ if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
+ return None;
- return None;
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
}
std::pair<Register, int>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 22672ba59e76..5baf55d23480 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -63,11 +63,9 @@ public:
private:
struct GEPInfo {
- const MachineInstr &GEP;
SmallVector<unsigned, 2> SgprParts;
SmallVector<unsigned, 2> VgprParts;
- int64_t Imm;
- GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
+ int64_t Imm = 0;
};
bool isSGPR(Register Reg) const;
@@ -200,12 +198,16 @@ private:
InstructionSelector::ComplexRendererFns
selectVINTERPModsHi(MachineOperand &Root) const;
+ bool selectSmrdOffset(MachineOperand &Root, Register &Base, Register *SOffset,
+ int64_t *Offset) const;
InstructionSelector::ComplexRendererFns
selectSmrdImm(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectSmrdImm32(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectSmrdSgpr(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectSmrdSgprImm(MachineOperand &Root) const;
std::pair<Register, int> selectFlatOffsetImpl(MachineOperand &Root,
uint64_t FlatVariant) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 01a3e78ea48c..0979debe9777 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4197,6 +4197,35 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::getLDSKernelId(Register DstReg,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Function &F = B.getMF().getFunction();
+ Optional<uint32_t> KnownSize =
+ AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
+ if (KnownSize.has_value())
+ B.buildConstant(DstReg, KnownSize.value());
+ return false;
+}
+
+bool AMDGPULegalizerInfo::legalizeLDSKernelId(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ if (!MFI->isEntryFunction()) {
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!getLDSKernelId(DstReg, MRI, B))
+ return false;
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
@@ -5636,6 +5665,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_workgroup_id_z:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_lds_kernel_id:
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
case Intrinsic::amdgcn_dispatch_ptr:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::DISPATCH_PTR);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index cee533aa34ec..5e8111e22aad 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -155,6 +155,13 @@ public:
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+
+ bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
+ bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, unsigned AddrSpace) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 78e092b2e872..7e49a6117ebd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -376,15 +376,7 @@ static bool HasNative(AMDGPULibFunc::EFuncId id) {
return false;
}
-struct TableRef {
- size_t size;
- const TableEntry *table; // variable size: from 0 to (size - 1)
-
- TableRef() : size(0), table(nullptr) {}
-
- template <size_t N>
- TableRef(const TableEntry (&tbl)[N]) : size(N), table(&tbl[0]) {}
-};
+using TableRef = ArrayRef<TableEntry>;
static TableRef getOptTable(AMDGPULibFunc::EFuncId id) {
switch(id) {
@@ -698,11 +690,10 @@ bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
// Table-Driven optimization
const TableRef tr = getOptTable(FInfo.getId());
- if (tr.size==0)
+ if (tr.empty())
return false;
- int const sz = (int)tr.size;
- const TableEntry * const ftbl = tr.table;
+ int const sz = (int)tr.size();
Value *opr0 = CI->getArgOperand(0);
if (getVecSize(FInfo) > 1) {
@@ -714,8 +705,8 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
assert(eltval && "Non-FP arguments in math function!");
bool found = false;
for (int i=0; i < sz; ++i) {
- if (eltval->isExactlyValue(ftbl[i].input)) {
- DVal.push_back(ftbl[i].result);
+ if (eltval->isExactlyValue(tr[i].input)) {
+ DVal.push_back(tr[i].result);
found = true;
break;
}
@@ -746,8 +737,8 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
// Scalar version
if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
for (int i = 0; i < sz; ++i) {
- if (CF->isExactlyValue(ftbl[i].input)) {
- Value *nval = ConstantFP::get(CF->getType(), ftbl[i].result);
+ if (CF->isExactlyValue(tr[i].input)) {
+ Value *nval = ConstantFP::get(CF->getType(), tr[i].result);
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
replaceCall(nval);
return true;
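The TableRef change above swaps a hand-written {size, pointer} pair for a non-owning view, so callers can use empty(), size() and indexing directly. A minimal sketch of the same pattern, using std::span as a stand-in for llvm::ArrayRef (illustrative only; the table contents here are made up):

    // Sketch of replacing a {size, pointer} struct with a non-owning view
    // (assumption: std::span stands in for llvm::ArrayRef).
    #include <cstdio>
    #include <span>

    struct TableEntry {
      double input;
      double result;
    };

    static constexpr TableEntry PowersOfTwo[] = {{1.0, 2.0}, {2.0, 4.0}, {3.0, 8.0}};

    using TableRef = std::span<const TableEntry>;

    // Returns the table for a given function id, or an empty view if none exists.
    static TableRef getOptTable(int id) {
      if (id == 0)
        return PowersOfTwo;
      return {};
    }

    int main() {
      TableRef tr = getOptTable(0);
      if (tr.empty())
        return 1;
      for (const TableEntry &e : tr)
        std::printf("%g -> %g\n", e.input, e.result);
      return 0;
    }

The view carries its own length, so the separate size field and raw pointer arithmetic in the old struct become unnecessary.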
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 35922341de26..b4a8766d682e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -55,21 +55,6 @@ static cl::opt<bool> SuperAlignLDSGlobals(
cl::init(true), cl::Hidden);
namespace {
-
-SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
- SmallPtrSet<GlobalValue *, 32> UsedList;
-
- SmallVector<GlobalValue *, 32> TmpVec;
- collectUsedGlobalVariables(M, TmpVec, true);
- UsedList.insert(TmpVec.begin(), TmpVec.end());
-
- TmpVec.clear();
- collectUsedGlobalVariables(M, TmpVec, false);
- UsedList.insert(TmpVec.begin(), TmpVec.end());
-
- return UsedList;
-}
-
class AMDGPULowerModuleLDS : public ModulePass {
static void removeFromUsedList(Module &M, StringRef Name,
@@ -153,9 +138,6 @@ class AMDGPULowerModuleLDS : public ModulePass {
"");
}
-private:
- SmallPtrSet<GlobalValue *, 32> UsedList;
-
public:
static char ID;
@@ -165,9 +147,10 @@ public:
bool runOnModule(Module &M) override {
CallGraph CG = CallGraph(M);
- UsedList = getUsedList(M);
bool Changed = superAlignLDSGlobals(M);
- Changed |= processUsedLDS(CG, M);
+ std::vector<GlobalVariable *> ModuleScopeVariables =
+ AMDGPU::findVariablesToLower(M, nullptr);
+ Changed |= processUsedLDS(CG, M, ModuleScopeVariables);
for (Function &F : M.functions()) {
if (F.isDeclaration())
@@ -176,10 +159,11 @@ public:
// Only lower compute kernels' LDS.
if (!AMDGPU::isKernel(F.getCallingConv()))
continue;
- Changed |= processUsedLDS(CG, M, &F);
+ std::vector<GlobalVariable *> KernelUsedVariables =
+ AMDGPU::findVariablesToLower(M, &F);
+ Changed |= processUsedLDS(CG, M, KernelUsedVariables, &F);
}
- UsedList.clear();
return Changed;
}
@@ -228,22 +212,20 @@ private:
return Changed;
}
- bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) {
+ bool processUsedLDS(CallGraph const &CG, Module &M,
+ std::vector<GlobalVariable *> const &LDSVarsToTransform,
+ Function *F = nullptr) {
LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();
- // Find variables to move into new struct instance
- std::vector<GlobalVariable *> FoundLocalVars =
- AMDGPU::findVariablesToLower(M, F);
-
- if (FoundLocalVars.empty()) {
+ if (LDSVarsToTransform.empty()) {
// No variables to rewrite, no changes made.
return false;
}
SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
- LayoutFields.reserve(FoundLocalVars.size());
- for (GlobalVariable *GV : FoundLocalVars) {
+ LayoutFields.reserve(LDSVarsToTransform.size());
+ for (GlobalVariable *GV : LDSVarsToTransform) {
OptimizedStructLayoutField F(GV, DL.getTypeAllocSize(GV->getValueType()),
AMDGPU::getAlign(DL, GV));
LayoutFields.emplace_back(F);
@@ -252,7 +234,7 @@ private:
performOptimizedStructLayout(LayoutFields);
std::vector<GlobalVariable *> LocalVars;
- LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
+ LocalVars.reserve(LDSVarsToTransform.size()); // will be at least this large
{
// This usually won't need to insert any padding, perhaps avoid the alloc
uint64_t CurrentOffset = 0;
@@ -352,7 +334,6 @@ private:
GV->replaceAllUsesWith(GEP);
}
if (GV->use_empty()) {
- UsedList.erase(GV);
GV->eraseFromParent();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index b461c3c4bfdc..f5e12fd960d0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -11,6 +11,7 @@
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUSubtarget.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -101,6 +102,21 @@ void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Function &F) {
}
}
+Optional<uint32_t>
+AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
+ auto MD = F.getMetadata("llvm.amdgcn.lds.kernel.id");
+ if (MD && MD->getNumOperands() == 1) {
+ ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(0));
+ if (KnownSize) {
+ uint64_t V = KnownSize->getZExtValue();
+ if (V <= UINT32_MAX) {
+ return V;
+ }
+ }
+ }
+ return {};
+}
+
void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
const GlobalVariable &GV) {
assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index df62c2314617..97db8b7eb8d6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -11,11 +11,12 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Function.h"
namespace llvm {
@@ -104,6 +105,8 @@ public:
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
void allocateModuleLDSGlobal(const Function &F);
+ static Optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
+
Align getDynLDSAlign() const { return DynLDSAlign; }
void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 09dbd2150db6..a9f1e9bd0996 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -74,10 +74,10 @@ public:
private:
struct MemAccessInfo {
- const Value *V;
- const Value *Base;
- int64_t Offset;
- MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
+ const Value *V = nullptr;
+ const Value *Base = nullptr;
+ int64_t Offset = 0;
+ MemAccessInfo() = default;
bool isLargeStride(MemAccessInfo &Reference) const;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
Printable print() const {
@@ -116,6 +116,7 @@ private:
bool isGlobalAddr(const Value *V) const;
bool isLocalAddr(const Value *V) const;
+ bool isGlobalLoadUsedInBB(const Instruction &) const;
};
static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
@@ -196,6 +197,24 @@ bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
return false;
}
+// Returns true if the global load `I` is used in its own basic block.
+bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
+ const auto *Ld = dyn_cast<LoadInst>(&I);
+ if (!Ld)
+ return false;
+ if (!isGlobalAddr(Ld->getPointerOperand()))
+ return false;
+
+ for (const User *Usr : Ld->users()) {
+ if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
+ if (UsrInst->getParent() == I.getParent())
+ return true;
+ }
+ }
+
+ return false;
+}
+
AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];
@@ -203,9 +222,14 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
for (auto &B : F) {
LastAccess = MemAccessInfo();
+ unsigned UsedGlobalLoadsInBB = 0;
for (auto &I : B) {
if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
+ // TODO: Check if the global load and its user are close to each other
+ // instead (Or do this analysis in GCNSchedStrategy?).
+ if (isGlobalLoadUsedInBB(I))
+ UsedGlobalLoadsInBB += Size;
if (isIndirectAccess(&I))
FI.IAMInstCost += Size;
if (isLargeStride(&I))
@@ -245,6 +269,16 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
++FI.InstCost;
}
}
+
+ if (!FI.HasDenseGlobalMemAcc) {
+ unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
+ if (GlobalMemAccPercentage > 50) {
+ LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
+ << B.getName() << " has " << GlobalMemAccPercentage
+ << "% global memory access\n");
+ FI.HasDenseGlobalMemAcc = true;
+ }
+ }
}
return &FI;
@@ -286,6 +320,11 @@ bool AMDGPUPerfHint::runOnFunction(Function &F) {
}
bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+ // Reverting optimal scheduling in favour of occupancy with basic block(s)
+ // having dense global memory access can potentially hurt performance.
+ if (FI.HasDenseGlobalMemAcc)
+ return true;
+
return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
}
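The new HasDenseGlobalMemAcc path above marks a function as memory bound when, in some basic block, size-weighted global loads whose results are consumed in the same block account for more than half of the block's instructions. A minimal standalone sketch of that heuristic (illustrative only, not the LLVM implementation; the 50% threshold is taken from the patch):

    // Sketch of the "dense global memory access" heuristic (assumption:
    // standalone illustration with made-up block statistics).
    #include <cstdio>

    struct BlockStats {
      unsigned NumInsts = 0;            // Instructions in the block.
      unsigned UsedGlobalLoadUnits = 0; // Size-weighted global loads used in-block.
    };

    static bool hasDenseGlobalMemAcc(const BlockStats &B) {
      if (B.NumInsts == 0)
        return false;
      unsigned Pct = B.UsedGlobalLoadUnits * 100 / B.NumInsts;
      return Pct > 50; // Same threshold as the patch.
    }

    int main() {
      BlockStats Dense{10, 6}, Sparse{10, 2};
      std::printf("%d %d\n", hasDenseGlobalMemAcc(Dense), hasDenseGlobalMemAcc(Sparse));
      return 0;
    }

Once a single block trips the threshold, isMemBound returns true regardless of the existing MemInstCost ratio, which is exactly the early return added above.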
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
index 31ff80f5f431..2db8db6957ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -41,7 +41,11 @@ public:
unsigned InstCost;
unsigned IAMInstCost; // Indirect access memory instruction count
unsigned LSMInstCost; // Large stride memory instruction count
- FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
+ bool HasDenseGlobalMemAcc; // Set if at least 1 basic block has relatively
+ // high global memory access
+ FuncInfo()
+ : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0),
+ HasDenseGlobalMemAcc(false) {}
};
typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 0df6f4d45b06..bd8e568213b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -153,7 +153,10 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3(
if (!isVgprRegBank(Dst))
return false;
- if (MRI.getType(Dst).isVector())
+ // med3 for i16 is only available on gfx9+, and not available for v2i16.
+ LLT Ty = MRI.getType(Dst);
+ if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) &&
+ Ty != LLT::scalar(32))
return false;
MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 0830cbd919a0..887341e67454 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4426,7 +4426,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
- case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: {
+ case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
+ case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
// Default for MAI intrinsics.
// srcC can also be an immediate which can be folded later.
// FIXME: Should we eventually add an alternative mapping with AGPR src
@@ -4451,7 +4459,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
- case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: {
+ case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
+ case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
// vdst, srcA, srcB, srcC, idx
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
index 4d7a3f4028e8..aa51c5d20bdc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
@@ -141,7 +141,7 @@ class ReplaceLDSUseImpl {
std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() {
// Collect LDS which requires module lowering.
std::vector<GlobalVariable *> LDSGlobals =
- llvm::AMDGPU::findVariablesToLower(M);
+ llvm::AMDGPU::findVariablesToLower(M, nullptr);
// Remove LDS which don't qualify for replacement.
llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 8297635d7bb2..5d7bade00a3e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -340,12 +340,28 @@ def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>;
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_f16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_f16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>;
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>;
def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>;
def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
+def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 6bd906439ee8..cf4826d81b4b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -736,13 +736,18 @@ static unsigned getMaxNumPreloadedSGPRs() {
2 + // dispatch ID
2 + // flat scratch init
2; // Implicit buffer ptr
+
// Max number of system SGPRs
unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
1 + // WorkGroupIDY
1 + // WorkGroupIDZ
1 + // WorkGroupInfo
1; // private segment wave byte offset
- return MaxUserSGPRs + MaxSystemSGPRs;
+
+ // Max number of synthetic SGPRs
+ unsigned SyntheticSGPRs = 1; // LDSKernelId
+
+ return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}
unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
@@ -852,34 +857,6 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
return MI && TII->isVALU(*MI);
}
- bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
- if (Pred->NodeNum < Succ->NodeNum)
- return true;
-
- SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});
-
- for (unsigned I = 0; I < Succs.size(); ++I) {
- for (const SDep &SI : Succs[I]->Succs) {
- const SUnit *SU = SI.getSUnit();
- if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
- Succs.push_back(SU);
- }
- }
-
- SmallPtrSet<const SUnit*, 32> Visited;
- while (!Preds.empty()) {
- const SUnit *SU = Preds.pop_back_val();
- if (llvm::is_contained(Succs, SU))
- return false;
- Visited.insert(SU);
- for (const SDep &SI : SU->Preds)
- if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
- Preds.push_back(SI.getSUnit());
- }
-
- return true;
- }
-
// Link as many SALU instructions in chain as possible. Return the size
// of the chain. Links up to MaxChain instructions.
unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
@@ -895,18 +872,20 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
- if (SU->addPred(SDep(From, SDep::Artificial), false))
- ++Linked;
+ if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
+ if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
+ ++Linked;
for (SDep &SI : From->Succs) {
SUnit *SUv = SI.getSUnit();
- if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
- SUv->addPred(SDep(SU, SDep::Artificial), false);
+ if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
+ DAG->canAddEdge(SUv, SU))
+ DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
}
for (SDep &SI : SU->Succs) {
SUnit *Succ = SI.getSUnit();
- if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
+ if (Succ != SU && isSALU(Succ))
Worklist.push_back(Succ);
}
}
@@ -949,7 +928,8 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
if (Visited.count(&*LastSALU))
continue;
- if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
+ if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
+ !DAG->canAddEdge(&*LastSALU, &SU))
continue;
Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 971e44723758..dca926867300 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1584,6 +1584,9 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentSize,
AMDGPU::SGPR_32RegClass,
MFI->ArgInfo.PrivateSegmentSize, 0, 0) ||
+ parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId,
+ AMDGPU::SGPR_32RegClass,
+ MFI->ArgInfo.LDSKernelId, 0, 1) ||
parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX,
AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX,
0, 1) ||
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index e12d0ffef35c..2a9393fc1595 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1739,6 +1739,8 @@ public:
void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
void cvtVOPD(MCInst &Inst, const OperandVector &Operands);
+ void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptionalIdx);
void cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx);
@@ -1767,21 +1769,11 @@ public:
void cvtDPP8(MCInst &Inst, const OperandVector &Operands) {
cvtDPP(Inst, Operands, true);
}
- void cvtVOPCNoDstDPP(MCInst &Inst, const OperandVector &Operands,
- bool IsDPP8 = false);
- void cvtVOPCNoDstDPP8(MCInst &Inst, const OperandVector &Operands) {
- cvtVOPCNoDstDPP(Inst, Operands, true);
- }
void cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
bool IsDPP8 = false);
void cvtVOP3DPP8(MCInst &Inst, const OperandVector &Operands) {
cvtVOP3DPP(Inst, Operands, true);
}
- void cvtVOPC64NoDstDPP(MCInst &Inst, const OperandVector &Operands,
- bool IsDPP8 = false);
- void cvtVOPC64NoDstDPP8(MCInst &Inst, const OperandVector &Operands) {
- cvtVOPC64NoDstDPP(Inst, Operands, true);
- }
OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix,
AMDGPUOperand::ImmTy Type);
@@ -4177,7 +4169,9 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
return false;
}
- if (isGFX940() && (MII.get(Opc).TSFlags & SIInstrFlags::IsDOT)) {
+ uint64_t TSFlags = MII.get(Opc).TSFlags;
+
+ if (isGFX940() && (TSFlags & SIInstrFlags::IsDOT)) {
int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
if (OpSelIdx != -1) {
if (Inst.getOperand(OpSelIdx).getImm() != 0)
@@ -4190,6 +4184,15 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
}
}
+ // op_sel[0:1] must be 0 for v_dot2_bf16_bf16 and v_dot2_f16_f16 (VOP3 Dot).
+ if ((TSFlags & SIInstrFlags::IsDOT) && (TSFlags & SIInstrFlags::VOP3) &&
+ !(TSFlags & SIInstrFlags::VOP3P)) {
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+ if (OpSel & 3)
+ return false;
+ }
+
return true;
}
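The added clause rejects any non-zero value in op_sel[0:1] for VOP3 (non-VOP3P) dot instructions. A rough standalone restatement of just that predicate, with the TSFlags tests reduced to plain booleans (the helper below is illustrative, not part of the parser):

#include <cstdint>

// For a VOP3 (non-VOP3P) dot instruction such as v_dot2_bf16_bf16,
// op_sel bits 0 and 1 must both be clear.
static bool isValidVOP3DotOpSel(uint64_t OpSelImm, bool IsDot, bool IsVOP3,
                                bool IsVOP3P) {
  if (IsDot && IsVOP3 && !IsVOP3P)
    return (OpSelImm & 3) == 0;
  return true; // the constraint only applies to VOP3 dot encodings
}

// e.g. isValidVOP3DotOpSel(/*OpSelImm=*/1, true, true, false) == false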
@@ -4636,9 +4639,6 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
Error(IDLoc, "ABS not allowed in VOP3B instructions");
return false;
}
- if (!validateCoherencyBits(Inst, Operands, IDLoc)) {
- return false;
- }
if (!validateExeczVcczOperands(Operands)) {
return false;
}
@@ -5004,6 +5004,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
Val, ValRange);
+ } else if (ID == ".amdhsa_uses_dynamic_stack") {
+ PARSE_BITS_ENTRY(KD.kernel_code_properties,
+ KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK, Val, ValRange);
} else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
if (hasArchitectedFlatScratch())
return Error(IDRange.Start,
@@ -8024,10 +8027,13 @@ OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands)
return MatchOperand_NoMatch;
}
-void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands) {
- cvtVOP3P(Inst, Operands);
-
+// Determines which bit DST_OP_SEL occupies in the op_sel operand according to
+// the number of src operands present, then copies that bit into src0_modifiers.
+void cvtVOP3DstOpSelOnly(MCInst &Inst) {
int Opc = Inst.getOpcode();
+ int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+ if (OpSelIdx == -1)
+ return;
int SrcNum;
const int Ops[] = { AMDGPU::OpName::src0,
@@ -8038,7 +8044,6 @@ void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands)
++SrcNum);
assert(SrcNum > 0);
- int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
if ((OpSel & (1 << SrcNum)) != 0) {
@@ -8048,6 +8053,18 @@ void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands)
}
}
+void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst,
+ const OperandVector &Operands) {
+ cvtVOP3P(Inst, Operands);
+ cvtVOP3DstOpSelOnly(Inst);
+}
+
+void AMDGPUAsmParser::cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands,
+ OptionalImmIndexMap &OptionalIdx) {
+ cvtVOP3P(Inst, Operands, OptionalIdx);
+ cvtVOP3DstOpSelOnly(Inst);
+}
+
static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
// 1. This operand is input modifiers
return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS
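cvtVOP3DstOpSelOnly above locates the DST_OP_SEL bit by counting the source operands that are present; a minimal sketch of that indexing, detached from the MC layer (the helper below is hypothetical):

// For VOP3 op_sel, source operand J uses bit J of the op_sel immediate and
// the destination uses the bit just past the last present source, so with
// three sources the destination half-selection sits in bit 3.
static bool dstOpSelBitSet(unsigned OpSelImm, unsigned NumSrcOperands) {
  return (OpSelImm & (1u << NumSrcOperands)) != 0;
}

// Example: op_sel = 0b1000 on a three-source VOP3 instruction selects only
// the destination half; the parser then mirrors that bit into src0_modifiers.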
@@ -8241,6 +8258,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
+ if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
+ Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) {
+ Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
+ Inst.addOperand(Inst.getOperand(0));
+ }
+
if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1) {
assert(!IsPacked);
Inst.addOperand(Inst.getOperand(0));
@@ -8747,14 +8770,6 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi);
}
-// Add dummy $old operand
-void AMDGPUAsmParser::cvtVOPC64NoDstDPP(MCInst &Inst,
- const OperandVector &Operands,
- bool IsDPP8) {
- Inst.addOperand(MCOperand::createReg(0));
- cvtVOP3DPP(Inst, Operands, IsDPP8);
-}
-
void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) {
OptionalImmIndexMap OptionalIdx;
unsigned Opc = Inst.getOpcode();
@@ -8802,6 +8817,8 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bo
}
if (Desc.TSFlags & SIInstrFlags::VOP3P)
cvtVOP3P(Inst, Operands, OptionalIdx);
+ else if (Desc.TSFlags & SIInstrFlags::VOP3)
+ cvtVOP3OpSel(Inst, Operands, OptionalIdx);
else if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) {
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOpSel);
}
@@ -8821,14 +8838,6 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands, bo
}
}
-// Add dummy $old operand
-void AMDGPUAsmParser::cvtVOPCNoDstDPP(MCInst &Inst,
- const OperandVector &Operands,
- bool IsDPP8) {
- Inst.addOperand(MCOperand::createReg(0));
- cvtDPP(Inst, Operands, IsDPP8);
-}
-
void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8) {
OptionalImmIndexMap OptionalIdx;
@@ -9043,12 +9052,27 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
// v_nop_sdwa_sdwa_vi/gfx9 has no optional sdwa arguments
switch (BasicInstType) {
case SIInstrFlags::VOP1:
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
- if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) {
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::clamp) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyClampSI, 0);
+ }
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::omod) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyOModSI, 0);
+ }
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::dst_sel) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
+ }
+ if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+ AMDGPU::OpName::dst_unused) != -1) {
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTySdwaDstUnused,
+ DstUnused::UNUSED_PRESERVE);
}
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
break;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ccaf646008b1..98ee720200b4 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -451,7 +451,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
convertVOP3PDPPInst(MI);
else if (AMDGPU::isVOPC64DPP(MI.getOpcode()))
- convertVOPCDPPInst(MI);
+ convertVOPCDPPInst(MI); // Special VOP3 case
+ else {
+ assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3);
+ convertVOP3DPPInst(MI); // Regular VOP3 case
+ }
break;
}
Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address);
@@ -745,6 +749,43 @@ DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
return MCDisassembler::Success;
}
+struct VOPModifiers {
+ unsigned OpSel = 0;
+ unsigned OpSelHi = 0;
+ unsigned NegLo = 0;
+ unsigned NegHi = 0;
+};
+
+// Reconstruct values of VOP3/VOP3P operands such as op_sel.
+// Note that these values do not affect disassembler output,
+// so this is only necessary for consistency with src_modifiers.
+static VOPModifiers collectVOPModifiers(const MCInst &MI,
+ bool IsVOP3P = false) {
+ VOPModifiers Modifiers;
+ unsigned Opc = MI.getOpcode();
+ const int ModOps[] = {AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers};
+ for (int J = 0; J < 3; ++J) {
+ int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+ if (OpIdx == -1)
+ continue;
+
+ unsigned Val = MI.getOperand(OpIdx).getImm();
+
+ Modifiers.OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J;
+ if (IsVOP3P) {
+ Modifiers.OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J;
+ Modifiers.NegLo |= !!(Val & SISrcMods::NEG) << J;
+ Modifiers.NegHi |= !!(Val & SISrcMods::NEG_HI) << J;
+ } else if (J == 0) {
+ Modifiers.OpSel |= !!(Val & SISrcMods::DST_OP_SEL) << 3;
+ }
+ }
+
+ return Modifiers;
+}
+
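collectVOPModifiers packs per-source modifier bits back into the op_sel/op_sel_hi/neg_lo/neg_hi immediates. A small sketch of that packing for three sources; the SrcModBits values below are illustrative placeholders, not the real SISrcMods constants:

#include <array>

// Illustrative modifier bits; the real layout lives in SISrcMods and may differ.
enum SrcModBits : unsigned {
  MOD_NEG = 1u << 0,
  MOD_NEG_HI = 1u << 1,
  MOD_OP_SEL_0 = 1u << 2,
  MOD_OP_SEL_1 = 1u << 3,
  MOD_DST_OP_SEL = 1u << 4,
};

struct PackedMods {
  unsigned OpSel = 0, OpSelHi = 0, NegLo = 0, NegHi = 0;
};

// Mirrors the loop above: bit J of each packed field comes from source J's
// modifier word; for plain VOP3 only op_sel is rebuilt, and the destination
// bit (bit 3) is taken from src0's DST_OP_SEL flag.
static PackedMods packFromSrcMods(const std::array<unsigned, 3> &SrcMods,
                                  bool IsVOP3P) {
  PackedMods M;
  for (unsigned J = 0; J < SrcMods.size(); ++J) {
    unsigned Val = SrcMods[J];
    M.OpSel |= ((Val & MOD_OP_SEL_0) != 0) << J;
    if (IsVOP3P) {
      M.OpSelHi |= ((Val & MOD_OP_SEL_1) != 0) << J;
      M.NegLo |= ((Val & MOD_NEG) != 0) << J;
      M.NegHi |= ((Val & MOD_NEG_HI) != 0) << J;
    } else if (J == 0) {
      M.OpSel |= ((Val & MOD_DST_OP_SEL) != 0) << 3;
    }
  }
  return M;
}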
// We must check FI == literal to reject insts that are not genuine dpp8, and we
// must first add optional MI operands to check FI
DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
@@ -755,6 +796,11 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
} else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
AMDGPU::isVOPC64DPP(Opc)) {
convertVOPCDPPInst(MI);
+ } else if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) {
+ auto Mods = collectVOPModifiers(MI);
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
+ AMDGPU::OpName::op_sel);
} else {
// Insert dummy unused src modifiers.
if (MI.getNumOperands() < DescNumOps &&
@@ -770,6 +816,18 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
return isValidDPP8(MI) ? MCDisassembler::Success : MCDisassembler::SoftFail;
}
+DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
+ unsigned Opc = MI.getOpcode();
+ unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+ if (MI.getNumOperands() < DescNumOps &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1) {
+ auto Mods = collectVOPModifiers(MI);
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
+ AMDGPU::OpName::op_sel);
+ }
+ return MCDisassembler::Success;
+}
+
// Note that before gfx10, the MIMG encoding provided no information about
// VADDR size. Consequently, decoded instructions always show address as if it
// has 1 dword, which could be not really so.
@@ -914,45 +972,27 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
DecodeStatus AMDGPUDisassembler::convertVOP3PDPPInst(MCInst &MI) const {
unsigned Opc = MI.getOpcode();
unsigned DescNumOps = MCII->get(Opc).getNumOperands();
+ auto Mods = collectVOPModifiers(MI, true);
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in) != -1)
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vdst_in);
- const int ModOps[] = {AMDGPU::OpName::src0_modifiers,
- AMDGPU::OpName::src1_modifiers,
- AMDGPU::OpName::src2_modifiers};
- unsigned OpSel = 0;
- unsigned OpSelHi = 0;
- unsigned NegLo = 0;
- unsigned NegHi = 0;
- for (int J = 0; J < 3; ++J) {
- int OpIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
- if (OpIdx == -1)
- break;
- unsigned Val = MI.getOperand(OpIdx).getImm();
-
- OpSel |= !!(Val & SISrcMods::OP_SEL_0) << J;
- OpSelHi |= !!(Val & SISrcMods::OP_SEL_1) << J;
- NegLo |= !!(Val & SISrcMods::NEG) << J;
- NegHi |= !!(Val & SISrcMods::NEG_HI) << J;
- }
-
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1)
- insertNamedMCOperand(MI, MCOperand::createImm(OpSel),
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSel),
AMDGPU::OpName::op_sel);
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi) != -1)
- insertNamedMCOperand(MI, MCOperand::createImm(OpSelHi),
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.OpSelHi),
AMDGPU::OpName::op_sel_hi);
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo) != -1)
- insertNamedMCOperand(MI, MCOperand::createImm(NegLo),
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegLo),
AMDGPU::OpName::neg_lo);
if (MI.getNumOperands() < DescNumOps &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi) != -1)
- insertNamedMCOperand(MI, MCOperand::createImm(NegHi),
+ insertNamedMCOperand(MI, MCOperand::createImm(Mods.NegHi),
AMDGPU::OpName::neg_hi);
return MCDisassembler::Success;
@@ -2000,6 +2040,9 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
}
+ PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack",
+ KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);
+
if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1)
return MCDisassembler::Fail;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 31869f0917ae..d17e2d8d5082 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -162,6 +162,7 @@ public:
DecodeStatus convertSDWAInst(MCInst &MI) const;
DecodeStatus convertDPP8Inst(MCInst &MI) const;
DecodeStatus convertMIMGInst(MCInst &MI) const;
+ DecodeStatus convertVOP3DPPInst(MCInst &MI) const;
DecodeStatus convertVOP3PDPPInst(MCInst &MI) const;
DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 5d254518c67a..4558ddf6dbfe 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -202,6 +202,19 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
return nullptr;
}
+ int OrigOpE32 = AMDGPU::getVOPe32(OrigOp);
+ // Prior checks cover Mask with VOPC condition, but not on purpose
+ auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
+ assert(RowMaskOpnd && RowMaskOpnd->isImm());
+ auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
+ assert(BankMaskOpnd && BankMaskOpnd->isImm());
+ const bool MaskAllLanes =
+ RowMaskOpnd->getImm() == 0xF && BankMaskOpnd->getImm() == 0xF;
+ (void)MaskAllLanes;
+ assert(MaskAllLanes ||
+ !(TII->isVOPC(DPPOp) ||
+ (TII->isVOP3(DPPOp) && OrigOpE32 != -1 && TII->isVOPC(OrigOpE32))) &&
+ "VOPC cannot form DPP unless mask is full");
auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
OrigMI.getDebugLoc(), TII->get(DPPOp))
@@ -234,6 +247,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
CombOldVGPR.SubReg);
++NumOperands;
+ } else if (TII->isVOPC(DPPOp) || (TII->isVOP3(DPPOp) && OrigOpE32 != -1 &&
+ TII->isVOPC(OrigOpE32))) {
+ // VOPC DPP and VOPC promoted to VOP3 DPP do not have an old operand
+ // because they write to SGPRs not VGPRs
} else {
// TODO: this discards MAC/FMA instructions for now, let's add it later
LLVM_DEBUG(dbgs() << " failed: no old operand in DPP instruction,"
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index 281474994bca..6ff349e31f22 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -249,11 +249,11 @@ def : ProcessorModel<"gfx1036", GFX10SpeedModel,
//===----------------------------------------------------------------------===//
def : ProcessorModel<"gfx1100", GFX11SpeedModel,
- FeatureISAVersion11_0.Features
+ FeatureISAVersion11_0_0.Features
>;
def : ProcessorModel<"gfx1101", GFX11SpeedModel,
- FeatureISAVersion11_0.Features
+ FeatureISAVersion11_0_1.Features
>;
def : ProcessorModel<"gfx1102", GFX11SpeedModel,
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 100410bb7644..04da14cc4916 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -733,7 +733,7 @@ void GCNScheduleDAGMILive::collectRematerializableInstructions() {
MachineOperand *Op = MRI.getOneDef(Reg);
MachineInstr *Def = Op->getParent();
- if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def, AA))
+ if (Op->getSubReg() != 0 || !isTriviallyReMaterializable(*Def))
continue;
MachineInstr *UseI = &*MRI.use_instr_nodbg_begin(Reg);
@@ -943,9 +943,8 @@ bool GCNScheduleDAGMILive::sinkTriviallyRematInsts(const GCNSubtarget &ST,
}
// Copied from MachineLICM
-bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) {
- if (!TII->isTriviallyReMaterializable(MI, AA))
+bool GCNScheduleDAGMILive::isTriviallyReMaterializable(const MachineInstr &MI) {
+ if (!TII->isTriviallyReMaterializable(MI))
return false;
for (const MachineOperand &MO : MI.operands())
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 97f94f69b70e..c3db849cf81a 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -142,7 +142,7 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// and single use outside the defining block into RematerializableInsts.
void collectRematerializableInstructions();
- bool isTriviallyReMaterializable(const MachineInstr &MI, AAResults *AA);
+ bool isTriviallyReMaterializable(const MachineInstr &MI);
// TODO: Should also attempt to reduce RP of SGPRs and AGPRs
// Attempt to reduce RP of VGPR by sinking trivially rematerializable
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index d269d0945f3b..d71f80c5f458 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -145,6 +145,7 @@ protected:
bool HasDot7Insts = false;
bool HasDot8Insts = false;
bool HasMAIInsts = false;
+ bool HasFP8Insts = false;
bool HasPkFmacF16Inst = false;
bool HasAtomicFaddRtnInsts = false;
bool HasAtomicFaddNoRtnInsts = false;
@@ -721,6 +722,10 @@ public:
return HasMAIInsts;
}
+ bool hasFP8Insts() const {
+ return HasFP8Insts;
+ }
+
bool hasPkFmacF16Inst() const {
return HasPkFmacF16Inst;
}
@@ -930,7 +935,7 @@ public:
}
bool hasUserSGPRInit16Bug() const {
- return UserSGPRInit16Bug;
+ return UserSGPRInit16Bug && isWave32();
}
bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index bd938d829953..21ff2744e5b4 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -627,7 +627,7 @@ void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo,
bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc,
unsigned OpNo) const {
- return OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP) &&
+ return OpNo == 0 && (Desc.TSFlags & SIInstrFlags::DPP) &&
(Desc.TSFlags & SIInstrFlags::VOPC) &&
(Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) ||
Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO));
@@ -644,8 +644,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
// If there are printed modifiers, printOperandAndFPInputMods or
// printOperandAndIntInputMods will be called instead
if ((OpNo == 0 ||
- (OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP)) ||
- (OpNo == 2 && (Desc.TSFlags & SIInstrFlags::DPP) && ModIdx != -1)) &&
+ (OpNo == 1 && (Desc.TSFlags & SIInstrFlags::DPP) && ModIdx != -1)) &&
(Desc.TSFlags & SIInstrFlags::VOPC) &&
(Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC) ||
Desc.hasImplicitDefOfPhysReg(AMDGPU::VCC_LO)))
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 078133469549..0e71509cf2bd 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -367,6 +367,8 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
+ PRINT_FIELD(OS, ".amdhsa_uses_dynamic_stack", KD, kernel_code_properties,
+ amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);
PRINT_FIELD(OS,
(hasArchitectedFlatScratch(STI)
? ".amdhsa_enable_private_segment"
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index f54778535b7c..3e95c55df57e 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -67,6 +67,7 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetOperations.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
@@ -81,9 +82,9 @@ static cl::opt<bool> EnableM0Merge(
cl::init(true));
namespace {
-
class SIFixSGPRCopies : public MachineFunctionPass {
MachineDominatorTree *MDT;
+ unsigned NextVGPRToSGPRCopyID;
public:
static char ID;
@@ -92,9 +93,16 @@ public:
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
- SIFixSGPRCopies() : MachineFunctionPass(ID) {}
+ SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {}
bool runOnMachineFunction(MachineFunction &MF) override;
+ unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
+ void lowerVGPR2SGPRCopies(MachineFunction &MF);
+ // Handles copies whose source register is:
+ // 1. A physical register
+ // 2. An AGPR
+ // 3. Defined by an instruction that merely moves an immediate
+ bool lowerSpecialCase(MachineInstr &MI);
MachineBasicBlock *processPHINode(MachineInstr &MI);
@@ -569,6 +577,14 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();
+ // We have to lower VGPR to SGPR copies before the main loop
+ // because the REG_SEQUENCE and PHI lowering in the main loop
+ // converts the def-use chains to VALU and closes the opportunities
+ // for keeping them scalar.
+ // TODO: REG_SEQUENCE and PHIs are semantically copies. The next patch
+ // will address their lowering and unify the processing in one main loop.
+ lowerVGPR2SGPRCopies(MF);
+
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
MachineBasicBlock *MBB = &*BI;
@@ -640,42 +656,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
- Register SrcReg = MI.getOperand(1).getReg();
- if (!SrcReg.isVirtual()) {
- MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
- if (NewBB && NewBB != MBB) {
- MBB = NewBB;
- E = MBB->end();
- BI = MachineFunction::iterator(MBB);
- BE = MF.end();
- }
- assert((!NewBB || NewBB == I->getParent()) &&
- "moveToVALU did not return the right basic block");
- break;
- }
-
- MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
- unsigned SMovOp;
- int64_t Imm;
- // If we are just copying an immediate, we can replace the copy with
- // s_mov_b32.
- if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
- MI.getOperand(1).ChangeToImmediate(Imm);
- MI.addImplicitDefUseOperands(MF);
- MI.setDesc(TII->get(SMovOp));
- break;
- }
- MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
- if (NewBB && NewBB != MBB) {
- MBB = NewBB;
- E = MBB->end();
- BI = MachineFunction::iterator(MBB);
- BE = MF.end();
- }
- assert((!NewBB || NewBB == I->getParent()) &&
- "moveToVALU did not return the right basic block");
- } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
+ if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
}
@@ -916,3 +897,269 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
}
return CreatedBB;
}
+
+bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const TargetRegisterClass *SrcRC, *DstRC;
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
+
+ // We return true to indicate that no further processing is needed
+ if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
+ return true;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
+ TII->moveToVALU(MI, MDT);
+ return true;
+ }
+
+ unsigned SMovOp;
+ int64_t Imm;
+ // If we are just copying an immediate, we can replace the copy with
+ // s_mov_b32.
+ if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
+ MI.getOperand(1).ChangeToImmediate(Imm);
+ MI.addImplicitDefUseOperands(*MBB->getParent());
+ MI.setDesc(TII->get(SMovOp));
+ return true;
+ }
+ return false;
+}
+
+class V2SCopyInfo {
+public:
+ // VGPR to SGPR copy being processed
+ MachineInstr *Copy;
+ // All SALU instructions reachable from this copy in SSA graph
+ DenseSet<MachineInstr *> SChain;
+ // Number of SGPR to VGPR copies that are used to put the SALU computation
+ // results back to VALU.
+ unsigned NumSVCopies;
+
+ unsigned Score;
+ // Actual count of v_readfirstlane_b32
+ // which need to be inserted to keep SChain SALU
+ unsigned NumReadfirstlanes;
+ // Current score state. Used to speed up selecting V2SCopyInfos for processing.
+ bool NeedToBeConvertedToVALU = false;
+ // Unique ID. Used as a key for mapping to keep permanent order.
+ unsigned ID;
+
+ // Count of other VGPR to SGPR copies that contribute to the
+ // current copy's SChain
+ unsigned SiblingPenalty = 0;
+ SetVector<unsigned> Siblings;
+ V2SCopyInfo() : Copy(nullptr), ID(0){};
+ V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
+ : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){};
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump() {
+ dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
+ << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
+ << "\nScore: " << Score << "\n";
+ }
+#endif
+};
+
+void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
+
+ DenseMap<unsigned, V2SCopyInfo> Copies;
+ DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
+
+ // The main function that computes the VGPR to SGPR copy score
+ // and decides how the copy is lowered further: v_readfirstlane_b32 or moveToVALU
+ auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
+ if (I->SChain.empty())
+ return true;
+ I->Siblings = SiblingPenalty[*std::max_element(
+ I->SChain.begin(), I->SChain.end(),
+ [&](MachineInstr *A, MachineInstr *B) -> bool {
+ return SiblingPenalty[A].size() < SiblingPenalty[B].size();
+ })];
+ I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
+ // The loop below computes the number of other VGPR to SGPR copies
+ // which contribute to the current copy's SALU chain. We assume that all the
+ // copies with the same source virtual register will be squashed to one by
+ // regalloc. We also take care of copies of different subregs of the
+ // same register.
+ SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
+ for (auto J : I->Siblings) {
+ auto InfoIt = Copies.find(J);
+ if (InfoIt != Copies.end()) {
+ MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
+ if (SiblingCopy->isImplicitDef())
+ // the COPY has already been MoveToVALUed
+ continue;
+
+ SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
+ SiblingCopy->getOperand(1).getSubReg()));
+ }
+ }
+ I->SiblingPenalty = SrcRegs.size();
+
+ unsigned Penalty =
+ I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes;
+ unsigned Profit = I->SChain.size();
+ I->Score = Penalty > Profit ? 0 : Profit - Penalty;
+ I->NeedToBeConvertedToVALU = I->Score < 3;
+ return I->NeedToBeConvertedToVALU;
+ };
+
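The heuristic above trades the size of the SALU chain that stays scalar against the cost of keeping the copy: copies back to VALU, sibling copies of the same source, and the v_readfirstlane_b32 instructions themselves. A standalone restatement of that arithmetic with the same hard-coded threshold of 3 (the empty-SChain early-out is handled separately above):

// Score = max(0, |SChain| - (NumSVCopies + SiblingPenalty + NumReadfirstlanes));
// the copy is sent to moveToVALU when Score < 3.
static bool shouldConvertToVALU(unsigned SChainSize, unsigned NumSVCopies,
                                unsigned SiblingPenalty,
                                unsigned NumReadfirstlanes) {
  unsigned Penalty = NumSVCopies + SiblingPenalty + NumReadfirstlanes;
  unsigned Score = Penalty > SChainSize ? 0 : SChainSize - Penalty;
  return Score < 3;
}

// Example: a 32-bit copy feeding 5 SALU instructions, with one copy back to
// VALU and no siblings, scores 5 - (1 + 0 + 1) = 3 and so stays scalar.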
+ auto needProcessing = [](MachineInstr &MI) -> bool {
+ switch (MI.getOpcode()) {
+ case AMDGPU::COPY:
+ case AMDGPU::WQM:
+ case AMDGPU::STRICT_WQM:
+ case AMDGPU::SOFT_WQM:
+ case AMDGPU::STRICT_WWM:
+ return true;
+ default:
+ return false;
+ }
+ };
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+ ++BI) {
+ MachineBasicBlock *MBB = &*BI;
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+ ++I) {
+ MachineInstr &MI = *I;
+ if (!needProcessing(MI))
+ continue;
+ if (lowerSpecialCase(MI))
+ continue;
+
+ // Compute the COPY width to pass to the V2SCopyInfo constructor
+ Register DstReg = MI.getOperand(0).getReg();
+
+ const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg);
+
+ V2SCopyInfo In(getNextVGPRToSGPRCopyId(), &MI,
+ TRI->getRegSizeInBits(*DstRC));
+
+ SmallVector<MachineInstr *, 8> AnalysisWorklist;
+ // Needed because the SSA is not a tree but a graph and may have
+ // forks and joins. We should not visit the same node twice.
+ DenseSet<MachineInstr *> Visited;
+ AnalysisWorklist.push_back(&MI);
+ while (!AnalysisWorklist.empty()) {
+
+ MachineInstr *Inst = AnalysisWorklist.pop_back_val();
+
+ if (!Visited.insert(Inst).second)
+ continue;
+
+ // Copies and REG_SEQUENCE do not contribute to the final assembly,
+ // so skip them, but keep the SGPR to VGPR copy bookkeeping up to date.
+ if (Inst->isCopy() || Inst->isRegSequence()) {
+ if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
+ if (!Inst->isCopy() ||
+ !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
+ In.NumSVCopies++;
+ continue;
+ }
+ }
+ }
+
+ SiblingPenalty[Inst].insert(In.ID);
+
+ SmallVector<MachineInstr *, 4> Users;
+ if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
+ (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
+ auto I = Inst->getIterator();
+ auto E = Inst->getParent()->end();
+ while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
+ if (I->readsRegister(AMDGPU::SCC))
+ Users.push_back(&*I);
+ }
+ } else if (Inst->getNumExplicitDefs() != 0) {
+ Register Reg = Inst->getOperand(0).getReg();
+ if (TRI->isSGPRReg(*MRI, Reg))
+ for (auto &U : MRI->use_instructions(Reg))
+ Users.push_back(&U);
+ }
+ for (auto U : Users) {
+ if (TII->isSALU(*U))
+ In.SChain.insert(U);
+ AnalysisWorklist.push_back(U);
+ }
+ }
+ Copies[In.ID] = In;
+ }
+ }
+
+ SmallVector<unsigned, 8> LoweringWorklist;
+ for (auto &C : Copies) {
+ if (needToBeConvertedToVALU(&C.second))
+ LoweringWorklist.push_back(C.second.ID);
+ }
+
+ while (!LoweringWorklist.empty()) {
+ unsigned CurID = LoweringWorklist.pop_back_val();
+ auto CurInfoIt = Copies.find(CurID);
+ if (CurInfoIt != Copies.end()) {
+ V2SCopyInfo C = CurInfoIt->getSecond();
+ LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
+ for (auto S : C.Siblings) {
+ auto SibInfoIt = Copies.find(S);
+ if (SibInfoIt != Copies.end()) {
+ V2SCopyInfo &SI = SibInfoIt->getSecond();
+ LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
+ if (!SI.NeedToBeConvertedToVALU) {
+ set_subtract(SI.SChain, C.SChain);
+ if (needToBeConvertedToVALU(&SI))
+ LoweringWorklist.push_back(SI.ID);
+ }
+ SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
+ }
+ }
+ LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
+ << " is being turned to VALU\n");
+ Copies.erase(C.ID);
+ TII->moveToVALU(*C.Copy, MDT);
+ }
+ }
+
+ // Now do actual lowering
+ for (auto C : Copies) {
+ MachineInstr *MI = C.second.Copy;
+ MachineBasicBlock *MBB = MI->getParent();
+ // We decided to turn this V2S copy into v_readfirstlane_b32;
+ // remove it from the V2S copies and from all its siblings.
+ LLVM_DEBUG(dbgs() << "V2S copy " << *MI
+ << " is being turned to v_readfirstlane_b32"
+ << " Score: " << C.second.Score << "\n");
+ Register DstReg = MI->getOperand(0).getReg();
+ Register SrcReg = MI->getOperand(1).getReg();
+ unsigned SubReg = MI->getOperand(1).getSubReg();
+ const TargetRegisterClass *SrcRC = TRI->getRegClassForReg(*MRI, SrcReg);
+ SrcRC = TRI->getSubRegClass(SrcRC, SubReg);
+ size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
+ if (SrcSize == 16) {
+ // HACK to handle a possible 16-bit VGPR source
+ auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
+ MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
+ } else if (SrcSize == 32) {
+ auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
+ MIB.addReg(SrcReg, 0, SubReg);
+ } else {
+ auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
+ TII->get(AMDGPU::REG_SEQUENCE), DstReg);
+ int N = TRI->getRegSizeInBits(*SrcRC) / 32;
+ for (int i = 0; i < N; i++) {
+ Register PartialSrc = TII->buildExtractSubReg(
+ Result, *MRI, MI->getOperand(1), SrcRC,
+ TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
+ Register PartialDst =
+ MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, *Result, Result->getDebugLoc(),
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
+ .addReg(PartialSrc);
+ Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
+ }
+ }
+ MI->eraseFromParent();
+ }
+}
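The lowering loop above emits one v_readfirstlane_b32 per 32-bit slice of the source, with a 16-bit source handled as a single lane read and anything wider than 32 bits stitched back together by a REG_SEQUENCE. A trivial sketch of the instruction count per copy width (function name is illustrative):

// Number of V_READFIRSTLANE_B32 instructions needed for a VGPR->SGPR copy of
// the given source width in bits; widths above 32 also need a REG_SEQUENCE.
static unsigned numReadfirstlanes(unsigned SrcSizeInBits) {
  if (SrcSizeInBits <= 32) // covers the 16-bit VGPR source special case
    return 1;
  return SrcSizeInBits / 32;
}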
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d16da2a8b86b..438e8b200ecc 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1664,6 +1664,17 @@ SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
}
+SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
+ const SDLoc &SL) const {
+
+ Function &F = DAG.getMachineFunction().getFunction();
+ Optional<uint32_t> KnownSize =
+ AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
+ if (KnownSize.has_value())
+ return DAG.getConstant(KnownSize.value(), SL, MVT::i32);
+ return SDValue();
+}
+
SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Val,
bool Signed,
@@ -2049,6 +2060,9 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasWorkGroupIDZ())
allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
+
+ if (Info.hasLDSKernelId())
+ allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
}
// Allocate special inputs passed in user SGPRs.
@@ -2102,6 +2116,12 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(FlatScratchInitReg);
}
+ if (Info.hasLDSKernelId()) {
+ Register Reg = Info.addLDSKernelId();
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+ CCInfo.AllocateReg(Reg);
+ }
+
// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
// these from the dispatch pointer.
}
@@ -2347,8 +2367,8 @@ SDValue SITargetLowering::LowerFormalArguments(
(!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
!Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
- !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
- !Info->hasWorkItemIDZ());
+ !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
+ !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
}
if (CallConv == CallingConv::AMDGPU_PS) {
@@ -2762,7 +2782,8 @@ void SITargetLowering::passSpecialInputs(
{AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
{AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
{AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,"amdgpu-no-workgroup-id-y"},
- {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"}
+ {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,"amdgpu-no-workgroup-id-z"},
+ {AMDGPUFunctionArgInfo::LDS_KERNEL_ID,"amdgpu-no-lds-kernel-id"},
};
for (auto Attr : ImplicitAttrs) {
@@ -2798,6 +2819,13 @@ void SITargetLowering::passSpecialInputs(
// The implicit arg ptr is special because it doesn't have a corresponding
// input for kernels, and is computed from the kernarg segment pointer.
InputReg = getImplicitArgPtr(DAG, DL);
+ } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
+ Optional<uint32_t> Id = AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
+ if (Id.has_value()) {
+ InputReg = DAG.getConstant(Id.value(), DL, ArgVT);
+ } else {
+ InputReg = DAG.getUNDEF(ArgVT);
+ }
} else {
// We may have proven the input wasn't needed, although the ABI is
// requiring it. We just need to allocate the register appropriately.
@@ -6887,6 +6915,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_workgroup_id_z:
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ case Intrinsic::amdgcn_lds_kernel_id: {
+ if (MFI->isEntryFunction())
+ return getLDSKernelId(DAG, DL);
+ return getPreloadedValue(DAG, *MFI, VT,
+ AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
+ }
case Intrinsic::amdgcn_workitem_id_x:
return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
case Intrinsic::amdgcn_workitem_id_y:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 4fbccf0c5850..d1fecc1afc7f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -48,6 +48,7 @@ private:
SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
SDValue Chain, uint64_t Offset) const;
SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
+ SDValue getLDSKernelId(SelectionDAG &DAG, const SDLoc &SL) const;
SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
uint64_t Offset, Align Alignment,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index b398e108bf62..7c1d8d32b624 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -85,7 +85,7 @@ class InstSI <dag outs, dag ins, string asm = "",
field bit VOPAsmPrefer32Bit = 0;
// This bit indicates that this is a VOP3 opcode which supports op_sel
- // modifier (gfx9 only).
+ // modifier.
field bit VOP3_OPSEL = 0;
// Is it possible for this instruction to be atomic?
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 799d34e32d27..8916f06598c6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -108,8 +108,8 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}
-bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const {
+bool SIInstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI) const {
if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
// Normally VALU use of exec would block the rematerialization, but that
// is OK in this case to have an implicit exec read as all VALU do.
@@ -220,16 +220,23 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
return false;
- assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
+ unsigned NumOps = getNumOperandsNoGlue(Load0);
+ if (NumOps != getNumOperandsNoGlue(Load1))
+ return false;
// Check base reg.
if (Load0->getOperand(0) != Load1->getOperand(0))
return false;
+ // Match register offsets, if both register and immediate offsets present.
+ assert(NumOps == 4 || NumOps == 5);
+ if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
+ return false;
+
const ConstantSDNode *Load0Offset =
- dyn_cast<ConstantSDNode>(Load0->getOperand(1));
+ dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
const ConstantSDNode *Load1Offset =
- dyn_cast<ConstantSDNode>(Load1->getOperand(1));
+ dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
if (!Load0Offset || !Load1Offset)
return false;
@@ -5011,10 +5018,8 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
}
if (MO->isReg()) {
- if (!DefinedRC) {
- // This operand allows any register.
- return true;
- }
+ if (!DefinedRC)
+ return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
if (!isLegalRegOperand(MRI, OpInfo, *MO))
return false;
bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 1b411eb83eb3..5840f45bdc5a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -184,8 +184,7 @@ public:
return ST;
}
- bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
bool isIgnorableUse(const MachineOperand &MO) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 23afd6556bc9..81f8dcc482da 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -324,7 +324,8 @@ class isFloatType<ValueType SrcVT> {
// XXX - do v2i16 instructions?
class isIntType<ValueType SrcVT> {
- bit ret = !or(!eq(SrcVT.Value, i16.Value),
+ bit ret = !or(!eq(SrcVT.Value, i8.Value),
+ !eq(SrcVT.Value, i16.Value),
!eq(SrcVT.Value, i32.Value),
!eq(SrcVT.Value, i64.Value),
!eq(SrcVT.Value, v4i16.Value),
@@ -1411,6 +1412,10 @@ class IntSDWAInputModsMatchClass <int opSize> : AsmOperandClass {
def Int16SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<16>;
def Int32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32>;
+def Bin32SDWAInputModsMatchClass : IntSDWAInputModsMatchClass<32> {
+ let Name = "SDWAWithBin32InputMods";
+ let ParserMethod = "parseRegOrImm";
+}
class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> :
InputMods <matchClass> {
@@ -1419,6 +1424,7 @@ class IntSDWAInputMods <IntSDWAInputModsMatchClass matchClass> :
def Int16SDWAInputMods : IntSDWAInputMods<Int16SDWAInputModsMatchClass>;
def Int32SDWAInputMods : IntSDWAInputMods<Int32SDWAInputModsMatchClass>;
+def Bin32SDWAInputMods : IntSDWAInputMods<Bin32SDWAInputModsMatchClass>;
def IntVRegInputModsMatchClass : AsmOperandClass {
let Name = "VRegWithIntInputMods";
@@ -1897,94 +1903,94 @@ class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC,
class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld> {
- dag ret = !if (!eq(NumSrcArgs, 0),
+ dag ret = !if(!eq(NumSrcArgs, 0),
// VOP1 without input operands (V_NOP)
(ins ),
- !if (!eq(NumSrcArgs, 1),
- !if (HasModifiers,
- // VOP1_DPP with modifiers
- (ins OldRC:$old, Src0Mod:$src0_modifiers,
- Src0RC:$src0)
- /* else */,
- // VOP1_DPP without modifiers
- (ins OldRC:$old, Src0RC:$src0)
- /* endif */),
- !if (!eq(NumSrcArgs, 2),
- !if (HasModifiers,
- // VOP2_DPP with modifiers
- (ins OldRC:$old,
- Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1)
- /* else */,
- // VOP2_DPP without modifiers
- (ins OldRC:$old,
- Src0RC:$src0, Src1RC:$src1)
- )
- /* NumSrcArgs == 3, VOP3 */,
- !if (HasModifiers,
- // VOP3_DPP with modifiers
- (ins OldRC:$old,
- Src0Mod:$src0_modifiers, Src0RC:$src0,
- Src1Mod:$src1_modifiers, Src1RC:$src1,
- Src2Mod:$src2_modifiers, Src2RC:$src2)
- /* else */,
- // VOP3_DPP without modifiers
- (ins OldRC:$old,
- Src0RC:$src0, Src1RC:$src1,
- Src2RC:$src2)
+ !con(
+ !if(HasOld ,(ins OldRC:$old), (ins)),
+ !if (!eq(NumSrcArgs, 1),
+ !if (HasModifiers,
+ // VOP1_DPP with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0)
+ /* else */,
+ // VOP1_DPP without modifiers
+ (ins Src0RC:$src0)
+ /* endif */),
+ !if (!eq(NumSrcArgs, 2),
+ !if (HasModifiers,
+ // VOP2_DPP with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1)
+ /* else */,
+ // VOP2_DPP without modifiers
+ (ins Src0RC:$src0, Src1RC:$src1)
+ )
+ /* NumSrcArgs == 3, VOP3 */,
+ !if (HasModifiers,
+ // VOP3_DPP with modifiers
+ (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+ Src1Mod:$src1_modifiers, Src1RC:$src1,
+ Src2Mod:$src2_modifiers, Src2RC:$src2)
+ /* else */,
+ // VOP3_DPP without modifiers
+ (ins Src0RC:$src0, Src1RC:$src1,
+ Src2RC:$src2)
+ )
+ )
+ )
)
- /* endif */)));
+ );
}
class getInsDPP <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret,
+ HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
(ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl));
}
class getInsDPP16 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret,
+ HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
(ins FI:$fi));
}
class getInsDPP8 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
- HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret,
+ HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
(ins dpp8:$dpp8, FI:$fi));
}
-class getInsVOP3DPPBase<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> {
+class getInsVOP3DPPBase<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld> {
dag old = ( ins OldRC:$old );
dag base = VOP3Base;
dag ret = !con(
- !if(!ne(NumSrcArgs, 0), old, (ins)),
+ !if(!and(HasOld,!ne(NumSrcArgs, 0)), old, (ins)),
base
);
}
-class getInsVOP3DPP<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> {
- dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs>.ret,
+class getInsVOP3DPP<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
+ dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
(ins dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl));
}
-class getInsVOP3DPP16<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> {
- dag ret = !con(getInsVOP3DPP<VOP3Base,OldRC,NumSrcArgs>.ret,
+class getInsVOP3DPP16<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
+ dag ret = !con(getInsVOP3DPP<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
(ins FI:$fi));
}
-class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs> {
- dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs>.ret,
+class getInsVOP3DPP8<dag VOP3Base, RegisterOperand OldRC, int NumSrcArgs, bit HasOld = 1> {
+ dag ret = !con(getInsVOP3DPPBase<VOP3Base,OldRC,NumSrcArgs,HasOld>.ret,
(ins dpp8:$dpp8, FI:$fi));
}
@@ -2665,6 +2671,8 @@ def VOP_V4I32_I64_I64_V4I32 : VOPProfile <[v4i32, i64, i64, v4i32]>;
def VOP_V16I32_I64_I64_V16I32 : VOPProfile <[v16i32, i64, i64, v16i32]>;
def VOP_V4F32_V2F32_V2F32_V4F32 : VOPProfile <[v4f32, v2f32, v2f32, v4f32]>;
def VOP_V16F32_V2F32_V2F32_V16F32 : VOPProfile <[v16f32, v2f32, v2f32, v16f32]>;
+def VOP_V4F32_I64_I64_V4F32 : VOPProfile <[v4f32, i64, i64, v4f32]>;
+def VOP_V16F32_I64_I64_V16F32 : VOPProfile <[v16f32, i64, i64, v16f32]>;
def VOP_V4F32_V4F16_V8F16_I32 : VOPProfile <[v4f32, v4f16, v8f16, i32]>;
def VOP_V16F32_V4F16_V8F16_I32 : VOPProfile <[v16f32, v4f16, v8f16, i32]>;
@@ -2672,6 +2680,8 @@ def VOP_V4F32_V4I16_V8I16_I32 : VOPProfile <[v4f32, v4i16, v8i16, i32]>;
def VOP_V16F32_V4I16_V8I16_I32 : VOPProfile <[v16f32, v4i16, v8i16, i32]>;
def VOP_V4I32_V2I32_V4I32_I32 : VOPProfile <[v4i32, v2i32, v4i32, i32]>;
def VOP_V16I32_V2I32_V4I32_I32 : VOPProfile <[v16i32, v2i32, v4i32, i32]>;
+def VOP_V4F32_V2I32_V4I32_I32 : VOPProfile <[v4f32, v2i32, v4i32, i32]>;
+def VOP_V16F32_V2I32_V4I32_I32 : VOPProfile <[v16f32, v2i32, v4i32, i32]>;
class Commutable_REV <string revOp, bit isOrig> {
string RevOp = revOp;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 0504c59ebd9e..9176e85568ee 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -44,6 +44,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkGroupIDY(false),
WorkGroupIDZ(false),
WorkGroupInfo(false),
+ LDSKernelId(false),
PrivateSegmentWaveByteOffset(false),
WorkItemIDX(false),
WorkItemIDY(false),
@@ -143,6 +144,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
DispatchID = true;
+
+ if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
+ LDSKernelId = true;
}
// FIXME: This attribute is a hack, we just need an analysis on the function
@@ -261,6 +265,12 @@ Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI)
return ArgInfo.ImplicitBufferPtr.getRegister();
}
+Register SIMachineFunctionInfo::addLDSKernelId() {
+ ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR());
+ NumUserSGPRs += 1;
+ return ArgInfo.LDSKernelId.getRegister();
+}
+
bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
MCPhysReg Reg) {
for (unsigned I = 0; CSRegs[I]; ++I) {
@@ -561,6 +571,7 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
+ Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index bebb13cbf09f..5105587617fd 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -191,6 +191,7 @@ struct SIArgumentInfo {
Optional<SIArgument> WorkGroupIDY;
Optional<SIArgument> WorkGroupIDZ;
Optional<SIArgument> WorkGroupInfo;
+ Optional<SIArgument> LDSKernelId;
Optional<SIArgument> PrivateSegmentWaveByteOffset;
Optional<SIArgument> ImplicitArgPtr;
@@ -215,6 +216,7 @@ template <> struct MappingTraits<SIArgumentInfo> {
YamlIO.mapOptional("workGroupIDY", AI.WorkGroupIDY);
YamlIO.mapOptional("workGroupIDZ", AI.WorkGroupIDZ);
YamlIO.mapOptional("workGroupInfo", AI.WorkGroupInfo);
+ YamlIO.mapOptional("LDSKernelId", AI.LDSKernelId);
YamlIO.mapOptional("privateSegmentWaveByteOffset",
AI.PrivateSegmentWaveByteOffset);
@@ -418,6 +420,7 @@ private:
bool WorkGroupIDY : 1;
bool WorkGroupIDZ : 1;
bool WorkGroupInfo : 1;
+ bool LDSKernelId : 1;
bool PrivateSegmentWaveByteOffset : 1;
bool WorkItemIDX : 1; // Always initialized.
@@ -608,6 +611,7 @@ public:
Register addDispatchID(const SIRegisterInfo &TRI);
Register addFlatScratchInit(const SIRegisterInfo &TRI);
Register addImplicitBufferPtr(const SIRegisterInfo &TRI);
+ Register addLDSKernelId();
/// Increment user SGPRs used for padding the argument list only.
Register addReservedUserSGPR() {
@@ -705,6 +709,8 @@ public:
return WorkGroupInfo;
}
+ bool hasLDSKernelId() const { return LDSKernelId; }
+
bool hasPrivateSegmentWaveByteOffset() const {
return PrivateSegmentWaveByteOffset;
}
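As a rough illustration of the user-SGPR bookkeeping behind addLDSKernelId() above: each preloaded kernel argument claims the next consecutive user SGPR and bumps NumUserSGPRs. The sketch below is not code from this patch; ArgDesc and UserSGPRAllocator are invented stand-ins for ArgDescriptor and SIMachineFunctionInfo.

#include <iostream>
#include <optional>

// Invented stand-in for an argument descriptor: which user SGPR index holds it.
struct ArgDesc {
  std::optional<unsigned> Reg;
};

// Toy allocator mirroring the "next user SGPR" pattern: every preloaded
// argument (dispatch id, LDS kernel id, ...) takes the next free index and
// increments the running count, just as addLDSKernelId() does.
class UserSGPRAllocator {
  unsigned NumUserSGPRs = 0;

public:
  unsigned addArg(ArgDesc &A) {
    A.Reg = NumUserSGPRs; // next free user SGPR
    return NumUserSGPRs++;
  }
  unsigned numUsed() const { return NumUserSGPRs; }
};

int main() {
  UserSGPRAllocator Alloc;
  ArgDesc DispatchID, LDSKernelId;
  Alloc.addArg(DispatchID);
  std::cout << "lds kernel id in user SGPR #" << Alloc.addArg(LDSKernelId)
            << ", total user SGPRs: " << Alloc.numUsed() << "\n";
}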
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 66bc46aaefea..19a83ad53e2e 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -12,6 +12,8 @@
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/InitializePasses.h"
using namespace llvm;
@@ -26,6 +28,10 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
const SIRegisterInfo *TRI = nullptr;
const SIInstrInfo *TII = nullptr;
const MachineRegisterInfo *MRI = nullptr;
+ MCRegister Exec;
+
+ DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
+ SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
Register isCopyFromExec(const MachineInstr &MI) const;
Register isCopyToExec(const MachineInstr &MI) const;
@@ -44,13 +50,13 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
std::function<bool(MachineInstr *)> Pred,
ArrayRef<MCRegister> NonModifiableRegs,
unsigned MaxInstructions = 20) const;
- MachineInstr *findPossibleVCMPVCMPXOptimization(MachineInstr &SaveExec,
- MCRegister Exec) const;
- bool optimizeExecSequence() const;
- bool optimizeVCmpxAndSaveexecSequence() const;
- bool optimizeSingleVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
- MachineInstr &VCmp,
- MCRegister Exec) const;
+ bool optimizeExecSequence();
+ void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
+ bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
+ MachineInstr &VCmp, MCRegister Exec) const;
+
+ void tryRecordOrSaveexecXorSequence(MachineInstr &MI);
+ bool optimizeOrSaveexecXorSequences();
public:
static char ID;
@@ -92,7 +98,7 @@ Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B32_term: {
const MachineOperand &Src = MI.getOperand(1);
- if (Src.isReg() && Src.getReg() == TRI->getExec())
+ if (Src.isReg() && Src.getReg() == Exec)
return MI.getOperand(0).getReg();
}
}
@@ -107,8 +113,7 @@ Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
case AMDGPU::S_MOV_B64:
case AMDGPU::S_MOV_B32: {
const MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() && Dst.getReg() == TRI->getExec() &&
- MI.getOperand(1).isReg())
+ if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
return MI.getOperand(1).getReg();
break;
}
@@ -394,9 +399,7 @@ bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
// =>
// x = s_<op>_saveexec_b64 y
//
-bool SIOptimizeExecMasking::optimizeExecSequence() const {
- MCRegister Exec = TRI->getExec();
-
+bool SIOptimizeExecMasking::optimizeExecSequence() {
bool Changed = false;
for (MachineBasicBlock &MBB : *MF) {
MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
@@ -551,88 +554,9 @@ bool SIOptimizeExecMasking::optimizeExecSequence() const {
return Changed;
}
-// Tries to find a possibility to optimize a v_cmp ..., s_and_saveexec sequence
-// by looking at an instance of a s_and_saveexec instruction. Returns a pointer
-// to the v_cmp instruction if it is safe to replace the sequence (see the
-// conditions in the function body). This is after register allocation, so some
-// checks on operand dependencies need to be considered.
-MachineInstr *SIOptimizeExecMasking::findPossibleVCMPVCMPXOptimization(
- MachineInstr &SaveExec, MCRegister Exec) const {
-
- MachineInstr *VCmp = nullptr;
-
- Register SaveExecDest = SaveExec.getOperand(0).getReg();
- if (!TRI->isSGPRReg(*MRI, SaveExecDest))
- return nullptr;
-
- MachineOperand *SaveExecSrc0 =
- TII->getNamedOperand(SaveExec, AMDGPU::OpName::src0);
- if (!SaveExecSrc0->isReg())
- return nullptr;
-
- // Try to find the last v_cmp instruction that defs the saveexec input
- // operand without any write to Exec or the saveexec input operand inbetween.
- VCmp = findInstrBackwards(
- SaveExec,
- [&](MachineInstr *Check) {
- return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
- Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
- },
- {Exec, SaveExecSrc0->getReg()});
-
- if (!VCmp)
- return nullptr;
-
- MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
- assert(VCmpDest && "Should have an sdst operand!");
-
- // Check if any of the v_cmp source operands is written by the saveexec.
- MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
- if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
- SaveExec.modifiesRegister(Src0->getReg(), TRI))
- return nullptr;
-
- MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
- if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
- SaveExec.modifiesRegister(Src1->getReg(), TRI))
- return nullptr;
-
- // Don't do the transformation if the destination operand is included in
- // it's MBB Live-outs, meaning it's used in any of it's successors, leading
- // to incorrect code if the v_cmp and therefore the def of
- // the dest operand is removed.
- if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
- return nullptr;
-
- // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
- // s_and_saveexec, skip the optimization.
- if (isRegisterInUseBetween(*VCmp, SaveExec, VCmpDest->getReg(), false,
- true) ||
- isRegisterInUseAfter(SaveExec, VCmpDest->getReg()))
- return nullptr;
-
- // Try to determine if there is a write to any of the VCmp
- // operands between the saveexec and the vcmp.
- // If yes, additional VGPR spilling might need to be inserted. In this case,
- // it's not worth replacing the instruction sequence.
- SmallVector<MCRegister, 2> NonDefRegs;
- if (Src0->isReg())
- NonDefRegs.push_back(Src0->getReg());
-
- if (Src1->isReg())
- NonDefRegs.push_back(Src1->getReg());
-
- if (!findInstrBackwards(
- SaveExec, [&](MachineInstr *Check) { return Check == VCmp; },
- NonDefRegs))
- return nullptr;
-
- return VCmp;
-}
-
// Inserts the optimized s_mov_b32 / v_cmpx sequence based on the
// operands extracted from a v_cmp ..., s_and_saveexec pattern.
-bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
+bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
@@ -678,50 +602,164 @@ bool SIOptimizeExecMasking::optimizeSingleVCMPSaveExecSequence(
if (Src1->isReg())
MRI->clearKillFlags(Src1->getReg());
+ SaveExecInstr.eraseFromParent();
+ VCmp.eraseFromParent();
+
return true;
}
-// After all s_op_saveexec instructions are inserted,
-// replace (on GFX10.3 and later)
+// Record (on GFX10.3 and later) occurrences of
// v_cmp_* SGPR, IMM, VGPR
// s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
-// with
+// to be replaced with
// s_mov_b32 EXEC_SGPR_DEST, exec_lo
// v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
-bool SIOptimizeExecMasking::optimizeVCmpxAndSaveexecSequence() const {
+void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
+ MachineInstr &MI) {
if (!ST->hasGFX10_3Insts())
- return false;
+ return;
- bool Changed = false;
-
- DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
- MCRegister Exec = TRI->getExec();
const unsigned AndSaveExecOpcode =
ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
- for (MachineBasicBlock &MBB : *MF) {
- for (MachineInstr &MI : MBB) {
- // Record relevant v_cmp / s_and_saveexec instruction pairs for
- // replacement.
- if (MI.getOpcode() != AndSaveExecOpcode)
- continue;
+ if (MI.getOpcode() != AndSaveExecOpcode)
+ return;
+
+ Register SaveExecDest = MI.getOperand(0).getReg();
+ if (!TRI->isSGPRReg(*MRI, SaveExecDest))
+ return;
- if (MachineInstr *VCmp = findPossibleVCMPVCMPXOptimization(MI, Exec))
- SaveExecVCmpMapping[&MI] = VCmp;
+ MachineOperand *SaveExecSrc0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ if (!SaveExecSrc0->isReg())
+ return;
+
+ // Check whether a v_cmp ..., s_and_saveexec sequence rooted at this
+ // s_and_saveexec instruction can safely be replaced (see the conditions
+ // below) and, if so, record the pair in SaveExecVCmpMapping for later
+ // rewriting. This runs after register allocation, so some checks on
+ // operand dependencies need to be considered.
+ MachineInstr *VCmp = nullptr;
+
+ // Try to find the last v_cmp instruction that defs the saveexec input
+ // operand without any write to Exec or the saveexec input operand in between.
+ VCmp = findInstrBackwards(
+ MI,
+ [&](MachineInstr *Check) {
+ return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
+ Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
+ },
+ {Exec, SaveExecSrc0->getReg()});
+
+ if (!VCmp)
+ return;
+
+ MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
+ assert(VCmpDest && "Should have an sdst operand!");
+
+ // Check if any of the v_cmp source operands is written by the saveexec.
+ MachineOperand *Src0 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src0);
+ if (Src0->isReg() && TRI->isSGPRReg(*MRI, Src0->getReg()) &&
+ MI.modifiesRegister(Src0->getReg(), TRI))
+ return;
+
+ MachineOperand *Src1 = TII->getNamedOperand(*VCmp, AMDGPU::OpName::src1);
+ if (Src1->isReg() && TRI->isSGPRReg(*MRI, Src1->getReg()) &&
+ MI.modifiesRegister(Src1->getReg(), TRI))
+ return;
+
+ // Don't do the transformation if the destination operand is included in
+ // its MBB's live-outs, meaning it is used in any of its successors, which
+ // would lead to incorrect code if the v_cmp, and therefore the def of the
+ // dest operand, were removed.
+ if (isLiveOut(*VCmp->getParent(), VCmpDest->getReg()))
+ return;
+
+ // If the v_cmp target is in use between v_cmp and s_and_saveexec or after the
+ // s_and_saveexec, skip the optimization.
+ if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
+ isRegisterInUseAfter(MI, VCmpDest->getReg()))
+ return;
+
+ // Try to determine if there is a write to any of the VCmp
+ // operands between the saveexec and the vcmp.
+ // If yes, additional VGPR spilling might need to be inserted. In this case,
+ // it's not worth replacing the instruction sequence.
+ SmallVector<MCRegister, 2> NonDefRegs;
+ if (Src0->isReg())
+ NonDefRegs.push_back(Src0->getReg());
+
+ if (Src1->isReg())
+ NonDefRegs.push_back(Src1->getReg());
+
+ if (!findInstrBackwards(
+ MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
+ return;
+
+ // VCmp is known to be non-null here; record the pair for later rewriting.
+ SaveExecVCmpMapping[&MI] = VCmp;
+}
+
+// Record occurrences of
+// s_or_saveexec s_o, s_i
+// s_xor exec, exec, s_o
+// to be replaced with
+// s_andn2_saveexec s_o, s_i.
+void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
+ const unsigned XorOpcode =
+ ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
+
+ if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
+ const MachineOperand &XorDst = MI.getOperand(0);
+ const MachineOperand &XorSrc0 = MI.getOperand(1);
+ const MachineOperand &XorSrc1 = MI.getOperand(2);
+
+ if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
+ XorSrc1.isReg() &&
+ (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
+ const unsigned OrSaveexecOpcode = ST->isWave32()
+ ? AMDGPU::S_OR_SAVEEXEC_B32
+ : AMDGPU::S_OR_SAVEEXEC_B64;
+
+ // Peek at the previous instruction and check if this is a relevant
+ // s_or_saveexec instruction.
+ MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
+ if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
+ return;
+
+ const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
+ const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
+ if (OrDst.isReg() && OrSrc0.isReg()) {
+ if ((XorSrc0.getReg() == Exec && XorSrc1.getReg() == OrDst.getReg()) ||
+ (XorSrc0.getReg() == OrDst.getReg() && XorSrc1.getReg() == Exec)) {
+ OrXors.emplace_back(&PossibleOrSaveexec, &MI);
+ }
+ }
}
}
+}
- for (const auto &Entry : SaveExecVCmpMapping) {
- MachineInstr *SaveExecInstr = Entry.getFirst();
- MachineInstr *VCmpInstr = Entry.getSecond();
+bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
+ if (OrXors.empty()) {
+ return false;
+ }
- if (optimizeSingleVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec)) {
- SaveExecInstr->eraseFromParent();
- VCmpInstr->eraseFromParent();
+ bool Changed = false;
+ const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
+ : AMDGPU::S_ANDN2_SAVEEXEC_B64;
- Changed = true;
- }
+ for (const auto &Pair : OrXors) {
+ MachineInstr *Or = nullptr;
+ MachineInstr *Xor = nullptr;
+ std::tie(Or, Xor) = Pair;
+ BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
+ TII->get(Andn2Opcode), Or->getOperand(0).getReg())
+ .addReg(Or->getOperand(1).getReg());
+
+ Or->eraseFromParent();
+ Xor->eraseFromParent();
+
+ Changed = true;
}
return Changed;
@@ -736,9 +774,42 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
TRI = ST->getRegisterInfo();
TII = ST->getInstrInfo();
MRI = &MF.getRegInfo();
+ Exec = TRI->getExec();
bool Changed = optimizeExecSequence();
- Changed |= optimizeVCmpxAndSaveexecSequence();
+
+ OrXors.clear();
+ SaveExecVCmpMapping.clear();
+ static unsigned SearchWindow = 10;
+ for (MachineBasicBlock &MBB : MF) {
+ unsigned SearchCount = 0;
+
+ for (auto &MI : llvm::reverse(MBB)) {
+ if (MI.isDebugInstr())
+ continue;
+
+ if (SearchCount >= SearchWindow) {
+ break;
+ }
+
+ tryRecordOrSaveexecXorSequence(MI);
+ tryRecordVCmpxAndSaveexecSequence(MI);
+
+ if (MI.modifiesRegister(Exec, TRI)) {
+ break;
+ }
+
+ ++SearchCount;
+ }
+ }
+
+ Changed |= optimizeOrSaveexecXorSequences();
+ for (const auto &Entry : SaveExecVCmpMapping) {
+ MachineInstr *SaveExecInstr = Entry.getFirst();
+ MachineInstr *VCmpInstr = Entry.getSecond();
+
+ Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
+ }
return Changed;
}
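The rewritten runOnMachineFunction above scans each block bottom-up inside a fixed window, skips debug instructions, records candidate pairs, and stops at any write to EXEC. The toy model below is not part of the patch and covers only the s_or_saveexec/s_xor recording; Inst and Op are invented stand-ins for MachineInstr and real opcodes.

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

enum class Op { Other, Debug, OrSaveexec, XorExec, WritesExec };

struct Inst {
  Op Kind;
};

static bool writesExec(Op K) {
  // Both the s_xor we match and any unrelated EXEC def end the scan.
  return K == Op::XorExec || K == Op::WritesExec;
}

// Bottom-up, bounded scan of one block: skip debug instructions, record
// s_or_saveexec / s_xor-with-exec pairs for later rewriting, and stop at any
// instruction that redefines EXEC or when the search window is exhausted.
static std::vector<std::pair<size_t, size_t>>
collectOrXorPairs(const std::vector<Inst> &Block, unsigned Window = 10) {
  std::vector<std::pair<size_t, size_t>> Pairs;
  unsigned Count = 0;
  for (size_t I = Block.size(); I-- > 0;) {
    const Inst &MI = Block[I];
    if (MI.Kind == Op::Debug)
      continue; // debug instructions neither match nor consume the window
    if (Count >= Window)
      break;
    if (MI.Kind == Op::XorExec && I > 0 &&
        Block[I - 1].Kind == Op::OrSaveexec)
      Pairs.emplace_back(I - 1, I); // candidate for s_andn2_saveexec
    if (writesExec(MI.Kind))
      break;
    ++Count;
  }
  return Pairs;
}

int main() {
  std::vector<Inst> Block = {
      {Op::Other}, {Op::OrSaveexec}, {Op::XorExec}, {Op::Debug}, {Op::Other}};
  for (auto [OrIdx, XorIdx] : collectOrXorPairs(Block))
    std::cout << "or/xor pair at " << OrIdx << "," << XorIdx << "\n";
}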
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 57dbad468de8..aed84437b890 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -184,6 +184,16 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
if (isDefBetween(*TRI, LIS, CCReg, *Sel, *And))
return false;
+ // Cannot safely mirror live intervals with PHI nodes, so check for these
+ // before optimization.
+ SlotIndex SelIdx = LIS->getInstructionIndex(*Sel);
+ LiveInterval *SelLI = &LIS->getInterval(SelReg);
+ if (llvm::any_of(SelLI->vnis(),
+ [](const VNInfo *VNI) {
+ return VNI->isPHIDef();
+ }))
+ return false;
+
// TODO: Guard against implicit def operands?
LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
<< *And);
@@ -204,31 +214,34 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
LLVM_DEBUG(dbgs() << "=>\n\t" << *Andn2 << '\n');
- SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp);
- SlotIndex SelIdx = LIS->getInstructionIndex(*Sel);
-
- LiveInterval *CmpLI =
- CmpReg.isVirtual() ? &LIS->getInterval(CmpReg) : nullptr;
- LiveInterval *SelLI =
- SelReg.isVirtual() ? &LIS->getInterval(SelReg) : nullptr;
-
// Update live intervals for CCReg before potentially removing CmpReg/SelReg,
// and their associated liveness information.
+ SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp);
if (CCReg.isVirtual()) {
- // Note: this ignores that SelLI might have multiple internal values
- // or splits and simply extends the live range to cover all cases
- // where the result of the v_cndmask_b32 was live (e.g. loops).
- // This could yield worse register allocation in rare edge cases.
- SlotIndex EndIdx = AndIdx.getRegSlot();
- if (SelLI && SelLI->endIndex() > EndIdx && SelLI->endIndex().isBlock())
- EndIdx = SelLI->endIndex();
+ // Apply the live ranges from SelLI to CCReg, matching any splits and
+ // extending to loop boundaries where required.
+
+ auto applyLiveRanges = [&](LiveRange &Dst, VNInfo *VNI) {
+ // Copy live ranges from SelLI, adjusting start and end as required.
+ auto DefSegment = SelLI->FindSegmentContaining(SelIdx.getRegSlot());
+ assert(DefSegment != SelLI->end() &&
+ "No live interval segment covering definition?");
+ for (auto I = DefSegment; I != SelLI->end(); ++I) {
+ SlotIndex Start = I->start < SelIdx.getRegSlot() ?
+ SelIdx.getRegSlot() : I->start;
+ SlotIndex End = I->end < AndIdx.getRegSlot() || I->end.isBlock() ?
+ I->end : AndIdx.getRegSlot();
+ Dst.addSegment(LiveRange::Segment(Start, End, VNI));
+ }
+ // If SelLI does not cover AndIdx (because Cmp killed Sel) then extend.
+ if (!SelLI->getSegmentContaining(AndIdx.getRegSlot()))
+ Dst.addSegment(LiveRange::Segment(CmpIdx.getRegSlot(), AndIdx.getRegSlot(), VNI));
+ };
LiveInterval &CCLI = LIS->getInterval(CCReg);
auto CCQ = CCLI.Query(SelIdx.getRegSlot());
- if (CCQ.valueIn()) {
- CCLI.addSegment(LiveRange::Segment(SelIdx.getRegSlot(),
- EndIdx, CCQ.valueIn()));
- }
+ if (CCQ.valueIn())
+ applyLiveRanges(CCLI, CCQ.valueIn());
if (CC->getSubReg()) {
LaneBitmask Mask = TRI->getSubRegIndexLaneMask(CC->getSubReg());
@@ -237,10 +250,8 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
Allocator, Mask,
[=](LiveInterval::SubRange &SR) {
auto CCQS = SR.Query(SelIdx.getRegSlot());
- if (CCQS.valueIn()) {
- SR.addSegment(LiveRange::Segment(
- SelIdx.getRegSlot(), EndIdx, CCQS.valueIn()));
- }
+ if (CCQS.valueIn())
+ applyLiveRanges(SR, CCQS.valueIn());
},
*LIS->getSlotIndexes(), *TRI);
CCLI.removeEmptySubRanges();
@@ -253,7 +264,8 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
// Try to remove compare. Cmp value should not used in between of cmp
// and s_and_b64 if VCC or just unused if any other register.
- if ((CmpReg.isVirtual() && CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) ||
+ LiveInterval *CmpLI = CmpReg.isVirtual() ? &LIS->getInterval(CmpReg) : nullptr;
+ if ((CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) ||
(CmpReg == Register(CondReg) &&
std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
[&](const MachineInstr &MI) {
@@ -266,18 +278,16 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
Cmp->eraseFromParent();
// Try to remove v_cndmask_b32.
- if (SelLI) {
- // Kill status must be checked before shrinking the live range.
- bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill();
- LIS->shrinkToUses(SelLI);
- bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
- if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) {
- LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
-
- LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
- LIS->RemoveMachineInstrFromMaps(*Sel);
- Sel->eraseFromParent();
- }
+ // Kill status must be checked before shrinking the live range.
+ bool IsKill = SelLI->Query(CmpIdx.getRegSlot()).isKill();
+ LIS->shrinkToUses(SelLI);
+ bool IsDead = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
+ if (MRI->use_nodbg_empty(SelReg) && (IsKill || IsDead)) {
+ LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
+
+ LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
+ LIS->RemoveMachineInstrFromMaps(*Sel);
+ Sel->eraseFromParent();
}
}
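The applyLiveRanges lambda above copies SelReg's live segments onto CCReg, clamping the first segment's start to the v_cndmask def and clamping ends that run past the s_and back to it unless they stop at a block boundary. Below is a simplified numeric model of just that clamping step, not part of the patch: plain integers stand in for SlotIndex, value numbers and subranges are ignored, and the extra "extend from CmpIdx when SelLI does not cover AndIdx" case is omitted.

#include <algorithm>
#include <iostream>
#include <vector>

// Half-open live segment [Start, End); EndIsBlock marks ends that fall on a
// basic-block boundary and therefore must not be shortened.
struct Segment {
  int Start, End;
  bool EndIsBlock;
};

// Copy the segments of Src that are live at or after DefIdx into a new range,
// clamping the start up to DefIdx and clamping ends that run past UseIdx back
// to UseIdx unless they stop at a block boundary.
static std::vector<Segment> copyClamped(const std::vector<Segment> &Src,
                                        int DefIdx, int UseIdx) {
  std::vector<Segment> Dst;
  for (const Segment &S : Src) {
    if (S.End <= DefIdx)
      continue; // entirely before the new definition point
    int Start = std::max(S.Start, DefIdx);
    int End = (S.End < UseIdx || S.EndIsBlock) ? S.End : UseIdx;
    Dst.push_back({Start, End, S.EndIsBlock});
  }
  return Dst;
}

int main() {
  // One segment split before the use, one running to a block boundary.
  std::vector<Segment> Sel = {{0, 8, false}, {12, 20, true}};
  for (const Segment &S : copyClamped(Sel, /*DefIdx=*/4, /*UseIdx=*/16))
    std::cout << "[" << S.Start << "," << S.End << ")\n"; // [4,8) then [12,20)
}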
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index b13afceba20e..553fb4cf496c 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -49,6 +49,8 @@ struct SIProgramInfo {
uint32_t AccumOffset = 0;
uint32_t TgSplit = 0;
uint32_t NumSGPR = 0;
+ unsigned SGPRSpill = 0;
+ unsigned VGPRSpill = 0;
uint32_t LDSSize = 0;
bool FlatUsed = false;
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 882d13402a19..b7e8eadfe71d 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -119,13 +119,19 @@ class SM_Probe_Pseudo <string opName, string variant, RegisterClass baseClass,
let PseudoInstr = opName # variant;
}
-class SM_Load_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> pattern=[]>
- : SM_Pseudo<opName, outs, ins, asmOps, pattern> {
- RegisterClass BaseClass;
+class SM_Load_Pseudo <string opName, RegisterClass baseClass,
+ RegisterClass dstClass, OffsetMode offsets>
+ : SM_Pseudo<opName, (outs dstClass:$sdst),
+ !con((ins baseClass:$sbase), offsets.Ins, (ins CPol:$cpol)),
+ " $sdst, $sbase, " # offsets.Asm # "$cpol", []> {
+ RegisterClass BaseClass = baseClass;
let mayLoad = 1;
let mayStore = 0;
let has_glc = 1;
let has_dlc = 1;
+ let has_offset = offsets.HasOffset;
+ let has_soffset = offsets.HasSOffset;
+ let PseudoInstr = opName # offsets.Variant;
}
class SM_Store_Pseudo <string opName, RegisterClass baseClass,
@@ -158,40 +164,9 @@ class SM_Discard_Pseudo <string opName, string variant, dag offsets,
multiclass SM_Pseudo_Loads<string opName,
RegisterClass baseClass,
RegisterClass dstClass> {
- def _IMM : SM_Load_Pseudo <opName,
- (outs dstClass:$sdst),
- (ins baseClass:$sbase, i32imm:$offset, CPol:$cpol),
- " $sdst, $sbase, $offset$cpol", []> {
- let has_offset = 1;
- let BaseClass = baseClass;
- let PseudoInstr = opName # "_IMM";
- let has_glc = 1;
- let has_dlc = 1;
- }
-
- def _SGPR : SM_Load_Pseudo <opName,
- (outs dstClass:$sdst),
- (ins baseClass:$sbase, SReg_32:$soffset, CPol:$cpol),
- " $sdst, $sbase, $soffset$cpol", []> {
- let has_soffset = 1;
- let BaseClass = baseClass;
- let PseudoInstr = opName # "_SGPR";
- let has_glc = 1;
- let has_dlc = 1;
- }
-
- def _SGPR_IMM : SM_Load_Pseudo <opName,
- (outs dstClass:$sdst),
- (ins baseClass:$sbase, SReg_32:$soffset,
- i32imm:$offset, CPol:$cpol),
- " $sdst, $sbase, $soffset$offset$cpol", []> {
- let has_offset = 1;
- let has_soffset = 1;
- let BaseClass = baseClass;
- let PseudoInstr = opName # "_SGPR_IMM";
- let has_glc = 1;
- let has_dlc = 1;
- }
+ def _IMM : SM_Load_Pseudo <opName, baseClass, dstClass, IMM_Offset>;
+ def _SGPR : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_Offset>;
+ def _SGPR_IMM : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_IMM_Offset>;
}
multiclass SM_Pseudo_Stores<string opName,
@@ -596,10 +571,10 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps>
soffset{6-0}, ?);
}
-class SMEM_Real_Load_vi<bits<8> op, string ps, dag offsets>
- : SMEM_Real_vi<op, !cast<SM_Pseudo>(ps)> {
- RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps).BaseClass;
- let InOperandList = !con((ins BaseClass:$sbase), offsets, (ins CPol:$cpol));
+class SMEM_Real_Load_vi<bits<8> op, string ps, OffsetMode offsets>
+ : SMEM_Real_vi<op, !cast<SM_Pseudo>(ps # offsets.Variant)> {
+ RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
+ let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
}
// The alternative GFX9 SGPR encoding using soffset to encode the
@@ -614,14 +589,12 @@ class SMEM_Real_SGPR_alt_gfx9 {
}
multiclass SM_Real_Loads_vi<bits<8> op, string ps> {
- def _IMM_vi : SMEM_Real_Load_vi <op, ps#"_IMM", (ins smem_offset:$offset)>;
- def _SGPR_vi : SMEM_Real_Load_vi <op, ps#"_SGPR", (ins SReg_32:$soffset)>;
- def _SGPR_alt_gfx9 : SMEM_Real_Load_vi <op, ps#"_SGPR",
- (ins SReg_32:$soffset)>,
+ def _IMM_vi : SMEM_Real_Load_vi <op, ps, IMM_Offset>;
+ def _SGPR_vi : SMEM_Real_Load_vi <op, ps, SGPR_Offset>;
+ def _SGPR_alt_gfx9 : SMEM_Real_Load_vi <op, ps, SGPR_Offset>,
SMEM_Real_SGPR_alt_gfx9;
let IsGFX9SpecificEncoding = true in
- def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi <
- op, ps#"_SGPR_IMM", (ins SReg_32:$soffset, smem_offset_mod:$offset)>;
+ def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi <op, ps, SGPR_IMM_Offset>;
}
class SMEM_Real_Store_Base_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> {
@@ -883,6 +856,7 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL
def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
+def SMRDSgprImm : ComplexPattern<iPTR, 3, "SelectSMRDSgprImm">;
def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
@@ -903,11 +877,18 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
// 3. SGPR offset
def : GCNPat <
- (smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, 0))
+ (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))
>;
- // 4. No offset
+ // 4. SGPR+IMM offset
+ def : GCNPat <
+ (smrd_load (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
+
+ // 5. No offset
def : GCNPat <
(vt (smrd_load (i64 SReg_64:$sbase))),
(vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))
@@ -1021,19 +1002,16 @@ class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps>
let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?);
}
-multiclass SM_Real_Loads_gfx10<bits<8> op, string ps,
- SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
- SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
- def _IMM_gfx10 : SMEM_Real_gfx10<op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
- }
- def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol);
- }
- def _SGPR_IMM_gfx10 : SMEM_Real_gfx10<op, !cast<SM_Load_Pseudo>(ps#_SGPR_IMM)> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset,
- smem_offset_mod:$offset, CPol:$cpol);
- }
+class SMEM_Real_Load_gfx10<bits<8> op, string ps, OffsetMode offsets>
+ : SMEM_Real_gfx10<op, !cast<SM_Pseudo>(ps # offsets.Variant)> {
+ RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
+ let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
+}
+
+multiclass SM_Real_Loads_gfx10<bits<8> op, string ps> {
+ def _IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps, IMM_Offset>;
+ def _SGPR_gfx10 : SMEM_Real_Load_gfx10<op, ps, SGPR_Offset>;
+ def _SGPR_IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps, SGPR_IMM_Offset>;
}
class SMEM_Real_Store_gfx10<bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx10<op, ps> {
@@ -1227,17 +1205,16 @@ class SMEM_Real_gfx11<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0);
}
-class SMEM_Real_Load_gfx11<bits<8> op, string ps, string opName, dag offsets> :
- SMEM_Real_gfx11<op, !cast<SM_Pseudo>(ps), opName> {
- RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps).BaseClass;
- let InOperandList = !con((ins BaseClass:$sbase), offsets, (ins CPol:$cpol));
+class SMEM_Real_Load_gfx11<bits<8> op, string ps, string opName, OffsetMode offsets> :
+ SMEM_Real_gfx11<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> {
+ RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
+ let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
}
multiclass SM_Real_Loads_gfx11<bits<8> op, string ps, string opName> {
- def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_IMM", opName, (ins smem_offset:$offset)>;
- def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_SGPR", opName, (ins SReg_32:$soffset)>;
- def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11<
- op, ps#"_SGPR_IMM", opName, (ins SReg_32:$soffset, smem_offset_mod:$offset)>;
+ def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, IMM_Offset>;
+ def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, SGPR_Offset>;
+ def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, SGPR_IMM_Offset>;
def : MnemonicAlias<!cast<SM_Pseudo>(ps#"_IMM").Mnemonic, opName>,
Requires<[isGFX11Plus]>;
}
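The SM_Pseudo_Loads change above folds three hand-written _IMM/_SGPR/_SGPR_IMM pseudo definitions into one SM_Load_Pseudo class driven by an OffsetMode descriptor. A loose C++ analogy of that refactor, not the TableGen itself and with made-up field names, looks like this:

#include <iostream>
#include <string>
#include <vector>

// Analogue of the OffsetMode descriptor: one record per addressing variant
// instead of three copy-pasted pseudo definitions.
struct OffsetMode {
  std::string Variant; // "_IMM", "_SGPR", "_SGPR_IMM"
  bool HasOffset;
  bool HasSOffset;
};

struct LoadPseudo {
  std::string Name; // opName # Variant
  bool HasOffset, HasSOffset;
};

// One "class" builds every variant; adding a new addressing mode means adding
// one descriptor, not another near-identical definition.
static std::vector<LoadPseudo> makeLoads(const std::string &OpName) {
  static const OffsetMode Modes[] = {{"_IMM", true, false},
                                     {"_SGPR", false, true},
                                     {"_SGPR_IMM", true, true}};
  std::vector<LoadPseudo> Out;
  for (const OffsetMode &M : Modes)
    Out.push_back({OpName + M.Variant, M.HasOffset, M.HasSOffset});
  return Out;
}

int main() {
  for (const LoadPseudo &P : makeLoads("S_LOAD_DWORD"))
    std::cout << P.Name << " offset=" << P.HasOffset
              << " soffset=" << P.HasSOffset << "\n";
}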
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2f334e211181..b5fb390c08e1 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -417,9 +417,9 @@ bool getMAIIsGFX940XDL(unsigned Opc) {
CanBeVOPD getCanBeVOPD(unsigned Opc) {
const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc);
if (Info)
- return {Info->CanBeVOPDX, 1};
+ return {Info->CanBeVOPDX, true};
else
- return {0, 0};
+ return {false, false};
}
unsigned getVOPDOpcode(unsigned Opc) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
index 65ed02ca62de..a2d59abd3abb 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -30,7 +30,7 @@ namespace AMDGPU {
Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
std::vector<GlobalVariable *> findVariablesToLower(Module &M,
- const Function *F = nullptr);
+ const Function *F);
/// Replace all uses of constant \p C with instructions in \p F.
void replaceConstantUsesInFunction(ConstantExpr *C, const Function *F);
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 1d374a9f90ba..73e4eb8cdc24 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -499,6 +499,59 @@ let SubtargetPredicate = isGFX9Only in {
defm V_SCREEN_PARTITION_4SE_B32 : VOP1Inst <"v_screen_partition_4se_b32", VOP_I32_I32>;
} // End SubtargetPredicate = isGFX9Only
+class VOPProfile_Base_CVT_F32_F8<ValueType vt> : VOPProfileI2F <vt, i32> {
+ let HasExtSDWA = 1;
+ let HasExtSDWA9 = 1;
+ let HasExt = 1;
+ let DstRCSDWA = getVALUDstForVT<vt>.ret;
+ let InsSDWA = (ins Bin32SDWAInputMods:$src0_modifiers, Src0SDWA:$src0,
+ clampmod:$clamp, omod:$omod, src0_sel:$src0_sel);
+ let AsmSDWA = "$vdst, $src0_modifiers$clamp$omod $src0_sel"; // No dst_sel
+ let AsmSDWA9 = AsmSDWA;
+ let EmitDstSel = 0;
+}
+
+def VOPProfileCVT_F32_F8 : VOPProfile_Base_CVT_F32_F8 <f32>;
+def VOPProfileCVT_PK_F32_F8 : VOPProfile_Base_CVT_F32_F8 <v2f32>;
+
+let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0,
+ SchedRW = [WriteFloatCvt] in {
+ defm V_CVT_F32_FP8 : VOP1Inst<"v_cvt_f32_fp8", VOPProfileCVT_F32_F8>;
+ defm V_CVT_F32_BF8 : VOP1Inst<"v_cvt_f32_bf8", VOPProfileCVT_F32_F8>;
+ defm V_CVT_PK_F32_FP8 : VOP1Inst<"v_cvt_pk_f32_fp8", VOPProfileCVT_PK_F32_F8>;
+ defm V_CVT_PK_F32_BF8 : VOP1Inst<"v_cvt_pk_f32_bf8", VOPProfileCVT_PK_F32_F8>;
+}
+
+class Cvt_F32_F8_Pat<SDPatternOperator node, int index,
+ VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
+ (f32 (node i32:$src, index)),
+ !if (index,
+ (inst_sdwa 0, $src, 0, 0, index),
+ (inst_e32 $src))
+>;
+
+foreach Index = [0, 1, 2, 3] in {
+ def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index,
+ V_CVT_F32_FP8_e32, V_CVT_F32_FP8_sdwa>;
+ def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index,
+ V_CVT_F32_BF8_e32, V_CVT_F32_BF8_sdwa>;
+}
+
+class Cvt_PK_F32_F8_Pat<SDPatternOperator node, int index,
+ VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
+ (v2f32 (node i32:$src, index)),
+ !if (index,
+ (inst_sdwa 0, $src, 0, 0, SDWA.WORD_1),
+ (inst_e32 $src))
+>;
+
+foreach Index = [0, -1] in {
+ def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_fp8, Index,
+ V_CVT_PK_F32_FP8_e32, V_CVT_PK_F32_FP8_sdwa>;
+ def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_bf8, Index,
+ V_CVT_PK_F32_BF8_e32, V_CVT_PK_F32_BF8_sdwa>;
+}
+
let SubtargetPredicate = isGFX10Plus in {
defm V_PIPEFLUSH : VOP1Inst<"v_pipeflush", VOP_NO_EXT<VOP_NONE>>;
@@ -1106,11 +1159,36 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
}
+multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> {
+ let AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9" in {
+ defm NAME : VOP1_Real_e32e64_vi <op>;
+ }
+
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+ let Inst{42-40} = 6;
+ }
+
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx9 :
+ VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
+ VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
+}
+
defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in
defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>;
+let OtherPredicates = [HasFP8Insts] in {
+defm V_CVT_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x54>;
+defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>;
+defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
+defm V_CVT_PK_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x57>;
+}
+
//===----------------------------------------------------------------------===//
// GFX10
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index dddd0aacc140..a911483cade5 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -481,6 +481,30 @@ def shl_0_to_4 : PatFrag<
}];
}
+def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
+ let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
+ FP32InputMods:$src1_modifiers, Src1RC64:$src1,
+ VGPR_32:$vdst_in, op_sel0:$op_sel);
+ let HasClamp = 0;
+ let HasExtVOP3DPP = 0;
+}
+
+def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
+ VOP3_OPSEL> {
+ let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
+ FP32InputMods:$src1_modifiers, Src1RC64:$src1,
+ FP32InputMods:$src2_modifiers, VGPR_32:$src2,
+ op_sel0:$op_sel);
+ let HasClamp = 0;
+ let HasSrc2 = 0;
+ let HasSrc2Mods = 1;
+ let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
+ getAsmVOP3OpSel<3, HasClamp,
+ HasSrc0FloatMods, HasSrc1FloatMods,
+ HasSrc2FloatMods>.ret);
+ let HasExtVOP3DPP = 0;
+}
+
let SubtargetPredicate = isGFX9Plus in {
let isCommutable = 1, isReMaterializable = 1 in {
defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -526,6 +550,43 @@ defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32
let SubtargetPredicate = isGFX940Plus in
defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I64>>;
+let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0,
+ SchedRW = [WriteFloatCvt] in {
+ let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
+ defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
+ defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;
+ }
+
+ // These instructions use op_sel in a non-standard way: they use op_sel
+ // bits 2 and 3 while having only two sources. A dummy src2 operand is
+ // therefore used to hold the op_sel value.
+ let Constraints = "$vdst = $src2", DisableEncoding = "$src2" in {
+ defm V_CVT_SR_FP8_F32 : VOP3Inst<"v_cvt_sr_fp8_f32", VOP3_CVT_SR_F8_F32_Profile>;
+ defm V_CVT_SR_BF8_F32 : VOP3Inst<"v_cvt_sr_bf8_f32", VOP3_CVT_SR_F8_F32_Profile>;
+ }
+}
+
+class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : GCNPat<
+ (i32 (node f32:$src0, f32:$src1, i32:$old, index)),
+ (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, !if(index, SRCMODS.OP_SEL_0, 0))
+>;
+
+class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat<
+ (i32 (node f32:$src0, i32:$src1, i32:$old, index)),
+ (inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
+ !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, !if(index{1}, SRCMODS.OP_SEL_0, 0))
+>;
+
+foreach Index = [0, -1] in {
+ def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
+ def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
+}
+
+foreach Index = [0, 1, 2, 3] in {
+ def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_fp8_f32, Index, V_CVT_SR_FP8_F32_e64>;
+ def : Cvt_SR_F8_F32_Pat<int_amdgcn_cvt_sr_bf8_f32, Index, V_CVT_SR_BF8_F32_e64>;
+}
+
class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
// This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
(ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2),
@@ -699,15 +760,19 @@ def : DivFmasPat<f64, V_DIV_FMAS_F64_e64, VCC_LO>;
}
class VOP3_DOT_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile<P, Features> {
- // FIXME VOP3 DPP versions are unsupported
- let HasExtVOP3DPP = 0;
let HasClamp = 0;
let HasOMod = 0;
- let InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64,
- NumSrcArgs, HasClamp, HasOMod,
- !if(isFloatType<Src0VT>.ret, FPVRegInputMods, IntOpSelMods),
- !if(isFloatType<Src1VT>.ret, FPVRegInputMods, IntOpSelMods),
- !if(isFloatType<Src2VT>.ret, FPVRegInputMods, IntOpSelMods)>.ret;
+ // Override modifiers for bf16 (stored as i16); they match the float modifiers.
+ let HasSrc0Mods = 1;
+ let HasSrc1Mods = 1;
+ let HasSrc2Mods = 1;
+ let Src0ModDPP = FPVRegInputMods;
+ let Src1ModDPP = FPVRegInputMods;
+ let Src2ModVOP3DPP = FPVRegInputMods;
+ let InsVOP3OpSel = getInsVOP3OpSel<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
+ HasClamp, HasOMod, FPVRegInputMods,
+ FPVRegInputMods, FPVRegInputMods>.ret;
+ let AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs, HasClamp, 1, 1, 1>.ret;
}
let SubtargetPredicate = isGFX11Plus in {
@@ -723,7 +788,7 @@ let SubtargetPredicate = isGFX11Plus in {
defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
} // End SubtargetPredicate = isGFX11Plus
-let SubtargetPredicate = HasDot8Insts in {
+let SubtargetPredicate = HasDot8Insts, IsDOT=1 in {
defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>, int_amdgcn_fdot2_f16_f16>;
defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, int_amdgcn_fdot2_bf16_bf16>;
}
@@ -848,9 +913,8 @@ defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11<0x262>;
defm V_MINMAX_U32 : VOP3_Realtriple_gfx11<0x263>;
defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11<0x264>;
defm V_MINMAX_I32 : VOP3_Realtriple_gfx11<0x265>;
-// FIXME VOP3 DPP Dot instructions are unsupported
-defm V_DOT2_F16_F16 : VOP3_Real_Base_gfx11<0x266>;
-defm V_DOT2_BF16_BF16 : VOP3_Real_Base_gfx11<0x267>;
+defm V_DOT2_F16_F16 : VOP3Dot_Realtriple_gfx11<0x266>;
+defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_gfx11<0x267>;
defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">;
defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">;
defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">;
@@ -1161,6 +1225,13 @@ multiclass VOP3OpSel_Real_gfx9<bits<10> op> {
VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl>;
}
+multiclass VOP3OpSel_Real_gfx9_forced_opsel2<bits<10> op> {
+ def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
+ VOP3OpSel_gfx9 <op, !cast<VOP_Pseudo>(NAME#"_e64").Pfl> {
+ let Inst{13} = src2_modifiers{2}; // op_sel(2)
+ }
+}
+
multiclass VOP3Interp_Real_vi<bits<10> op> {
def _vi : VOP3_Real<!cast<VOP_Pseudo>(NAME), SIEncodingFamily.VI>,
VOP3Interp_vi <op, !cast<VOP_Pseudo>(NAME).Pfl>;
@@ -1352,3 +1423,10 @@ defm V_CVT_PKNORM_I16_F16 : VOP3OpSel_Real_gfx9 <0x299>;
defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>;
defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>;
+
+let OtherPredicates = [HasFP8Insts] in {
+defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>;
+defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>;
+defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>;
+defm V_CVT_SR_BF8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a5>;
+}
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 59ce532af59b..f1ce613d613b 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -493,6 +493,8 @@ def VOPProfileMAI_I32_I64_X16 : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, A
def VOPProfileMAI_I32_I64_X32 : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X16 : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X32 : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
+def VOPProfileMAI_F32_I64_X32 : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
+def VOPProfileMAI_F32_I64_X16 : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, VISrc_128_f32, VDst_128>;
def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, VISrc_512_f32, VDst_512>;
@@ -515,6 +517,8 @@ def VOPProfileMAI_I32_I64_X16_VCD : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32,
def VOPProfileMAI_I32_I64_X32_VCD : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, VISrc_512_b32, VDst_512, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X16_VCD : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
+def VOPProfileMAI_F32_I64_X32_VCD : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
+def VOPProfileMAI_F32_I64_X16_VCD : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
@@ -522,6 +526,8 @@ def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I
def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_I32_16X16X64_I8 : VOPProfileSMFMAC<VOP_V4I32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>;
def VOPProfileSMFMAC_I32_32X32X32_I8 : VOPProfileSMFMAC<VOP_V16I32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
+def VOPProfileSMFMAC_F32_16X16X64_F8 : VOPProfileSMFMAC<VOP_V4F32_V2I32_V4I32_I32, AVDst_128, AVSrc_64, AVSrc_128>;
+def VOPProfileSMFMAC_F32_32X32X32_F8 : VOPProfileSMFMAC<VOP_V16F32_V2I32_V4I32_I32, AVDst_512, AVSrc_64, AVSrc_128>;
class MFMATable <bit is_mac, string Name> {
bit IsMac = is_mac;
@@ -638,6 +644,14 @@ let Predicates = [isGFX940Plus], is_gfx940_xdl = 1 in {
defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>;
defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>;
defm V_MFMA_F32_32X32X4XF32 : MAIInst<"v_mfma_f32_32x32x4xf32", "F32_V2F32_X32", int_amdgcn_mfma_f32_32x32x4_xf32>;
+ defm V_MFMA_F32_16X16X32_BF8_BF8 : MAIInst<"v_mfma_f32_16x16x32_bf8_bf8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
+ defm V_MFMA_F32_16X16X32_BF8_FP8 : MAIInst<"v_mfma_f32_16x16x32_bf8_fp8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
+ defm V_MFMA_F32_16X16X32_FP8_BF8 : MAIInst<"v_mfma_f32_16x16x32_fp8_bf8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
+ defm V_MFMA_F32_16X16X32_FP8_FP8 : MAIInst<"v_mfma_f32_16x16x32_fp8_fp8", "F32_I64_X32", int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
+ defm V_MFMA_F32_32X32X16_BF8_BF8 : MAIInst<"v_mfma_f32_32x32x16_bf8_bf8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
+ defm V_MFMA_F32_32X32X16_BF8_FP8 : MAIInst<"v_mfma_f32_32x32x16_bf8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
+ defm V_MFMA_F32_32X32X16_FP8_BF8 : MAIInst<"v_mfma_f32_32x32x16_fp8_bf8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
+ defm V_MFMA_F32_32X32X16_FP8_FP8 : MAIInst<"v_mfma_f32_32x32x16_fp8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
} // End Predicates = [isGFX940Plus], is_gfx940_xdl = 1
multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> {
@@ -654,6 +668,14 @@ defm V_SMFMAC_F32_16X16X32_BF16 : SMFMACInst<"v_smfmac_f32_16x16x32_bf16",
defm V_SMFMAC_F32_32X32X16_BF16 : SMFMACInst<"v_smfmac_f32_32x32x16_bf16", "F32_32X32X16_I16", int_amdgcn_smfmac_f32_32x32x16_bf16>;
defm V_SMFMAC_I32_16X16X64_I8 : SMFMACInst<"v_smfmac_i32_16x16x64_i8", "I32_16X16X64_I8", int_amdgcn_smfmac_i32_16x16x64_i8>;
defm V_SMFMAC_I32_32X32X32_I8 : SMFMACInst<"v_smfmac_i32_32x32x32_i8", "I32_32X32X32_I8", int_amdgcn_smfmac_i32_32x32x32_i8>;
+defm V_SMFMAC_F32_16X16X64_BF8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_bf8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
+defm V_SMFMAC_F32_16X16X64_BF8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_bf8_fp8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
+defm V_SMFMAC_F32_16X16X64_FP8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_bf8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
+defm V_SMFMAC_F32_16X16X64_FP8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x64_fp8_fp8", "F32_16X16X64_F8", int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
+defm V_SMFMAC_F32_32X32X32_BF8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_bf8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
+defm V_SMFMAC_F32_32X32X32_BF8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_bf8_fp8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
+defm V_SMFMAC_F32_32X32X32_FP8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_bf8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
+defm V_SMFMAC_F32_32X32X32_FP8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_fp8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
}
def MAIInstInfoTable : GenericTable {
@@ -1121,6 +1143,14 @@ defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x
defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
defm V_MFMA_F32_16X16X8XF32 : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">;
defm V_MFMA_F32_32X32X4XF32 : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">;
+defm V_MFMA_F32_16X16X32_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x70>;
+defm V_MFMA_F32_16X16X32_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x71>;
+defm V_MFMA_F32_16X16X32_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x72>;
+defm V_MFMA_F32_16X16X32_FP8_FP8 : VOP3P_Real_MFMA_gfx940 <0x73>;
+defm V_MFMA_F32_32X32X16_BF8_BF8 : VOP3P_Real_MFMA_gfx940 <0x74>;
+defm V_MFMA_F32_32X32X16_BF8_FP8 : VOP3P_Real_MFMA_gfx940 <0x75>;
+defm V_MFMA_F32_32X32X16_FP8_BF8 : VOP3P_Real_MFMA_gfx940 <0x76>;
+defm V_MFMA_F32_32X32X16_FP8_FP8 : VOP3P_Real_MFMA_gfx940 <0x77>;
defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">;
defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">;
@@ -1137,6 +1167,14 @@ defm V_SMFMAC_F32_16X16X32_BF16 : VOP3P_Real_SMFMAC <0x66, "v_smfmac_f32_16x1
defm V_SMFMAC_F32_32X32X16_BF16 : VOP3P_Real_SMFMAC <0x68, "v_smfmac_f32_32x32x16bf16">;
defm V_SMFMAC_I32_16X16X64_I8 : VOP3P_Real_SMFMAC <0x6a, "v_smfmac_i32_16x16x64i8">;
defm V_SMFMAC_I32_32X32X32_I8 : VOP3P_Real_SMFMAC <0x6c, "v_smfmac_i32_32x32x32i8">;
+defm V_SMFMAC_F32_16X16X64_BF8_BF8 : VOP3P_Real_SMFMAC <0x78, "v_smfmac_f32_16x16x64bf8bf8">;
+defm V_SMFMAC_F32_16X16X64_BF8_FP8 : VOP3P_Real_SMFMAC <0x79, "v_smfmac_f32_16x16x64bf8fp8">;
+defm V_SMFMAC_F32_16X16X64_FP8_BF8 : VOP3P_Real_SMFMAC <0x7a, "v_smfmac_f32_16x16x64fp8bf8">;
+defm V_SMFMAC_F32_16X16X64_FP8_FP8 : VOP3P_Real_SMFMAC <0x7b, "v_smfmac_f32_16x16x64fp8fp8">;
+defm V_SMFMAC_F32_32X32X32_BF8_BF8 : VOP3P_Real_SMFMAC <0x7c, "v_smfmac_f32_32x32x32bf8bf8">;
+defm V_SMFMAC_F32_32X32X32_BF8_FP8 : VOP3P_Real_SMFMAC <0x7d, "v_smfmac_f32_32x32x32bf8fp8">;
+defm V_SMFMAC_F32_32X32X32_FP8_BF8 : VOP3P_Real_SMFMAC <0x7e, "v_smfmac_f32_32x32x32fp8bf8">;
+defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x32x32fp8fp8">;
let SubtargetPredicate = HasPackedFP32Ops in {
defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 33d3441e94c2..d489a089ac78 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -59,15 +59,17 @@ class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt
"$src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl");
let AsmDPP8 = "$src0, $src1 $dpp8$fi";
let AsmDPP16 = AsmDPP#"$fi";
+ // VOPC DPP instructions do not need an old operand.
+ let TieRegDPP = "";
let InsDPP = getInsDPP<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP,
NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP,
- Src2ModDPP>.ret;
+ Src2ModDPP, 0/*HasOld*/>.ret;
let InsDPP16 = getInsDPP16<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP,
NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP,
- Src2ModDPP>.ret;
+ Src2ModDPP, 0/*HasOld*/>.ret;
let InsDPP8 = getInsDPP8<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP,
NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP,
- Src2ModDPP>.ret;
+ Src2ModDPP, 0/*HasOld*/>.ret;
// The destination for 32-bit encoding is implicit.
let HasDst32 = 0;
@@ -76,9 +78,9 @@ class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt
let Outs64 = (outs VOPDstS64orS32:$sdst);
let OutsVOP3DPP = Outs64;
let OutsVOP3DPP8 = Outs64;
- let InsVOP3DPP = getInsVOP3DPP<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret;
- let InsVOP3DPP16 = getInsVOP3DPP16<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret;
- let InsVOP3DPP8 = getInsVOP3DPP8<InsVOP3Base, Src0VOP3DPP, NumSrcArgs>.ret;
+ let InsVOP3DPP = getInsVOP3DPP<InsVOP3Base, Src0VOP3DPP, NumSrcArgs, 0/*HasOld*/>.ret;
+ let InsVOP3DPP16 = getInsVOP3DPP16<InsVOP3Base, Src0VOP3DPP, NumSrcArgs, 0/*HasOld*/>.ret;
+ let InsVOP3DPP8 = getInsVOP3DPP8<InsVOP3Base, Src0VOP3DPP, NumSrcArgs, 0/*HasOld*/>.ret;
list<SchedReadWrite> Schedule = sched;
}
@@ -293,7 +295,7 @@ multiclass VOPC_Pseudos <string opName,
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
let isCompare = 1;
- let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $sdst", "");
+ let Constraints = "";
}
} // end SubtargetPredicate = isGFX11Plus
@@ -711,7 +713,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType vt> :
VOPC_Profile<sched, vt, i32> {
let AsmDPP = "$src0_modifiers, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl";
let AsmDPP16 = AsmDPP#"$fi";
- let InsDPP = (ins VGPR_32:$old, FPVRegInputMods:$src0_modifiers, VGPR_32:$src0, VGPR_32:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+ let InsDPP = (ins FPVRegInputMods:$src0_modifiers, VGPR_32:$src0, VGPR_32:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsDPP16 = !con(InsDPP, (ins FI:$fi));
// DPP8 forbids modifiers and can inherit from VOPC_Profile
@@ -793,7 +795,7 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
def _e64_dpp : VOP3_DPP_Pseudo<opName, p> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = p.Schedule;
- let Constraints = !if(p.NumSrcArgs, p.TieRegDPP # " = $sdst", "");
+ let Constraints = "";
}
} // end SubtargetPredicate = isGFX11Plus
}
@@ -1068,7 +1070,6 @@ class VOPC_DPP16<bits<8> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
let Uses = ps.Uses;
let OtherPredicates = ps.OtherPredicates;
let Constraints = ps.Constraints;
- let AsmMatchConverter = "cvtVOPCNoDstDPP";
}
class VOPC_DPP16_SIMC<bits<8> op, VOP_DPP_Pseudo ps, int subtarget,
@@ -1084,7 +1085,6 @@ class VOPC_DPP8<bits<8> op, VOPC_Pseudo ps, string opName = ps.OpName>
let Uses = ps.Uses;
let OtherPredicates = ps.OtherPredicates;
let Constraints = "";
- let AsmMatchConverter = "cvtVOPCNoDstDPP8";
}
// VOPC64
@@ -1133,7 +1133,6 @@ class VOPC64_DPP16_NoDst<bits<10> op, VOP_DPP_Pseudo ps,
string opName = ps.OpName>
: VOPC64_DPP16<op, ps, opName> {
let Inst{7-0} = ? ;
- let AsmMatchConverter = "cvtVOPC64NoDstDPP";
}
class VOPC64_DPP8_Base<bits<10> op, string OpName, VOPProfile P>
@@ -1163,13 +1162,12 @@ class VOPC64_DPP8_Dst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
: VOPC64_DPP8<op, ps, opName> {
bits<8> sdst;
let Inst{7-0} = sdst;
- let Constraints = "$old = $sdst";
+ let Constraints = "";
}
class VOPC64_DPP8_NoDst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
: VOPC64_DPP8<op, ps, opName> {
let Inst{7-0} = ? ;
- let AsmMatchConverter = "cvtVOPC64NoDstDPP8";
let Constraints = "";
}
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 187485ffa3ae..b65ca2d6b1b3 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -269,6 +269,10 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
class VOP3OpSel_gfx11<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;
+class VOP3DotOpSel_gfx11<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11<op, p>{
+ let Inst{11} = ?;
+ let Inst{12} = ?;
+}
// NB: For V_INTERP* opcodes, src0 is encoded as src1 and vice versa
class VOP3Interp_vi <bits<10> op, VOPProfile P> : VOP3e_vi <op, P> {
@@ -1270,6 +1274,8 @@ multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_f
class Base_VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
: VOP3_DPP<op, opName, ps.Pfl, 1> {
+ let VOP3_OPSEL = ps.Pfl.HasOpSel;
+ let IsDOT = ps.IsDOT;
let hasSideEffects = ps.hasSideEffects;
let Defs = ps.Defs;
let SchedRW = ps.SchedRW;
@@ -1285,6 +1291,8 @@ class VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, int subtarget,
class Base_VOP3_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
: VOP3_DPP8<op, opName, ps.Pfl> {
+ let VOP3_OPSEL = ps.Pfl.HasOpSel;
+ let IsDOT = ps.IsDOT;
let hasSideEffects = ps.hasSideEffects;
let Defs = ps.Defs;
let SchedRW = ps.SchedRW;
@@ -1326,6 +1334,15 @@ let AssemblerPredicate = isGFX11Only,
VOP3e_gfx11<op, ps.Pfl>;
}
}
+ multiclass VOP3Dot_Real_Base_gfx11<bits<10> op, string opName = NAME,
+ bit isSingle = 0> {
+ defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
+ let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
+ def _e64_gfx11 :
+ VOP3_Real<ps, SIEncodingFamily.GFX11>,
+ VOP3DotOpSel_gfx11<op, ps.Pfl>;
+ }
+ }
multiclass VOP3_Real_with_name_gfx11<bits<10> op, string opName,
string asmName, bit isSingle = 0> {
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
@@ -1355,6 +1372,15 @@ let AssemblerPredicate = isGFX11Only,
let DecoderNamespace = "DPPGFX11";
}
}
+
+ multiclass VOP3Dot_Real_dpp_Base_gfx11<bits<10> op, string opName = NAME> {
+ def _e64_dpp_gfx11 : VOP3_DPP16<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), SIEncodingFamily.GFX11> {
+ let Inst{11} = ?;
+ let Inst{12} = ?;
+ let DecoderNamespace = "DPPGFX11";
+ }
+ }
+
multiclass VOP3_Real_dpp_with_name_gfx11<bits<10> op, string opName,
string asmName> {
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
@@ -1368,6 +1394,16 @@ let AssemblerPredicate = isGFX11Only,
let DecoderNamespace = "DPP8GFX11";
}
}
+
+ multiclass VOP3Dot_Real_dpp8_Base_gfx11<bits<10> op, string opName = NAME> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ def _e64_dpp8_gfx11 : Base_VOP3_DPP8<op, ps> {
+ let Inst{11} = ?;
+ let Inst{12} = ?;
+ let DecoderNamespace = "DPP8GFX11";
+ }
+ }
+
multiclass VOP3_Real_dpp8_with_name_gfx11<bits<10> op, string opName,
string asmName> {
defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
@@ -1406,6 +1442,12 @@ multiclass VOP3_Realtriple_gfx11<bits<10> op,
VOP3_Real_dpp_Base_gfx11<op, opName>,
VOP3_Real_dpp8_Base_gfx11<op, opName>;
+multiclass VOP3Dot_Realtriple_gfx11<bits<10> op,
+ bit isSingle = 0, string opName = NAME> :
+ VOP3Dot_Real_Base_gfx11<op, opName, isSingle>,
+ VOP3Dot_Real_dpp_Base_gfx11<op, opName>,
+ VOP3Dot_Real_dpp8_Base_gfx11<op, opName>;
+
multiclass VOP3Only_Realtriple_gfx11<bits<10> op> :
VOP3_Realtriple_gfx11<op, 1>;
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 80ba7b5f0d2e..183febe756c1 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -6726,8 +6726,8 @@ bool ARMBaseInstrInfo::shouldOutlineFromFunctionByDefault(
return Subtarget.isMClass() && MF.getFunction().hasMinSize();
}
-bool ARMBaseInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const {
+bool ARMBaseInstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI) const {
// Try hard to rematerialize any VCTPs because if we spill P0, it will block
// the tail predication conversion. This means that the element count
// register has to be live for longer, but that has to be better than
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index 3b8f3403e3c3..453e3fa1b99b 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -480,8 +480,7 @@ private:
MachineInstr *canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI,
const TargetInstrInfo *TII) const;
- bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
private:
/// Modeling special VFP / NEON fp MLA / MLS hazards.
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 613904f702f0..e5347ed8e53a 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1720,6 +1720,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
unsigned UxtOp,
MachineBasicBlock::iterator &NextMBBI) {
bool IsThumb = STI->isThumb();
+ bool IsThumb1Only = STI->isThumb1Only();
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
const MachineOperand &Dest = MI.getOperand(0);
@@ -1794,7 +1795,8 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB,
MIB.addImm(0); // a 32-bit Thumb strex (only) allows an offset.
MIB.add(predOps(ARMCC::AL));
- unsigned CMPri = IsThumb ? ARM::t2CMPri : ARM::CMPri;
+ unsigned CMPri =
+ IsThumb ? (IsThumb1Only ? ARM::tCMPi8 : ARM::t2CMPri) : ARM::CMPri;
BuildMI(StoreBB, DL, TII->get(CMPri))
.addReg(TempReg, RegState::Kill)
.addImm(0)
@@ -1848,6 +1850,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
bool IsThumb = STI->isThumb();
+ assert(!STI->isThumb1Only() && "CMP_SWAP_64 unsupported under Thumb1!");
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
MachineOperand &Dest = MI.getOperand(0);
@@ -3044,6 +3047,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
assert(STI->isThumb());
return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREXH, ARM::t2STREXH, ARM::tUXTH,
NextMBBI);
+ case ARM::tCMP_SWAP_32:
+ assert(STI->isThumb());
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0, NextMBBI);
case ARM::CMP_SWAP_8:
assert(!STI->isThumb());
@@ -3054,11 +3060,8 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREXH, ARM::STREXH, ARM::UXTH,
NextMBBI);
case ARM::CMP_SWAP_32:
- if (STI->isThumb())
- return ExpandCMP_SWAP(MBB, MBBI, ARM::t2LDREX, ARM::t2STREX, 0,
- NextMBBI);
- else
- return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI);
+ assert(!STI->isThumb());
+ return ExpandCMP_SWAP(MBB, MBBI, ARM::LDREX, ARM::STREX, 0, NextMBBI);
case ARM::CMP_SWAP_64:
return ExpandCMP_SWAP_64(MBB, MBBI, NextMBBI);
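
The ARM changes above let the cmpxchg pseudos expand on Thumb1-only cores: ISel now picks the new tCMP_SWAP_32 pseudo for all Thumb targets, and the compare inside the expanded load/store-exclusive loop uses tCMPi8 instead of the Thumb2-only t2CMPri when the subtarget is Thumb1-only. A minimal C++ source example that exercises this path; the exact target triple is an illustrative assumption:

#include <atomic>

// Building this for a Thumb1-only target (for example
// --target=thumbv8m.base-none-eabi) now goes through tCMP_SWAP_32 and
// emits a tCMPi8 in the expanded exclusive-load/store loop.
bool try_update(std::atomic<int> &counter, int expected, int desired) {
  return counter.compare_exchange_strong(expected, desired);
}
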
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index e0e4ffd90e0e..afe16a3cd55c 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -3131,7 +3131,7 @@ bool ARMDAGToDAGISel::tryInsertVectorElt(SDNode *N) {
// Else v8i16 pattern of an extract and an insert, with a optional vmovx for
// extracting odd lanes.
- if (VT == MVT::v8i16) {
+ if (VT == MVT::v8i16 && Subtarget->hasFullFP16()) {
SDValue Inp1 = CurDAG->getTargetExtractSubreg(
ARM::ssub_0 + ExtractLane1 / 2, dl, MVT::f32, Val1.getOperand(0));
SDValue Inp2 = CurDAG->getTargetExtractSubreg(
@@ -3151,7 +3151,7 @@ bool ARMDAGToDAGISel::tryInsertVectorElt(SDNode *N) {
// The inserted values are not extracted - if they are f16 then insert them
// directly using a VINS.
- if (VT == MVT::v8f16) {
+ if (VT == MVT::v8f16 && Subtarget->hasFullFP16()) {
SDNode *VINS = CurDAG->getMachineNode(ARM::VINSH, dl, MVT::f32, Val2, Val1);
SDValue NewIns =
CurDAG->getTargetInsertSubreg(ARM::ssub_0 + Lane2 / 2, dl, MVT::v4f32,
@@ -3512,7 +3512,7 @@ void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
else if (MemTy == MVT::i16)
Opcode = Subtarget->isThumb() ? ARM::tCMP_SWAP_16 : ARM::CMP_SWAP_16;
else if (MemTy == MVT::i32)
- Opcode = ARM::CMP_SWAP_32;
+ Opcode = Subtarget->isThumb() ? ARM::tCMP_SWAP_32 : ARM::CMP_SWAP_32;
else
llvm_unreachable("Unknown AtomicCmpSwap type");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index e6be93e6480a..743cca9ff71f 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -13572,6 +13572,10 @@ static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
bool
ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const {
+ assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
+ N->getOpcode() == ISD::SRL) &&
+ "Expected shift op");
+
if (Level == BeforeLegalizeTypes)
return true;
@@ -13605,8 +13609,38 @@ ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
return false;
}
+bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
+ const SDNode *N) const {
+ assert(N->getOpcode() == ISD::XOR &&
+ (N->getOperand(0).getOpcode() == ISD::SHL ||
+ N->getOperand(0).getOpcode() == ISD::SRL) &&
+ "Expected XOR(SHIFT) pattern");
+
+ // Only commute if the entire NOT mask is a hidden shifted mask.
+ auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
+ if (XorC && ShiftC) {
+ unsigned MaskIdx, MaskLen;
+ if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
+ unsigned ShiftAmt = ShiftC->getZExtValue();
+ unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
+ if (N->getOperand(0).getOpcode() == ISD::SHL)
+ return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
+ return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
+ }
+ }
+
+ return false;
+}
+
bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
+ assert(((N->getOpcode() == ISD::SHL &&
+ N->getOperand(0).getOpcode() == ISD::SRL) ||
+ (N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getOpcode() == ISD::SHL)) &&
+ "Expected shift-shift mask");
+
if (!Subtarget->isThumb1Only())
return true;
@@ -19962,6 +19996,14 @@ bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
}
break;
}
+ case ARMISD::VBICIMM: {
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ModImm = Op.getConstantOperandVal(1);
+ unsigned EltBits = 0;
+ uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
+ if ((OriginalDemandedBits & Mask) == 0)
+ return TLO.CombineTo(Op, Op0);
+ }
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
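
The new isDesirableToCommuteXorWithShift hook only agrees to commute xor(shift X, C) with the shift when the xor constant is a contiguous shifted mask that lines up with the shift amount. A standalone sketch of that predicate, using plain integers and GCC/Clang builtins rather than the APInt API, with illustrative names:

#include <cassert>
#include <cstdint>

// Returns true when Mask is a contiguous run of ones (a shifted mask),
// reporting the index of its lowest set bit and the run length.
static bool isShiftedMask(uint64_t Mask, unsigned &Idx, unsigned &Len) {
  if (Mask == 0)
    return false;
  Idx = __builtin_ctzll(Mask);
  Len = 64 - Idx - __builtin_clzll(Mask);
  // After shifting the lowest one down to bit 0, a contiguous run looks
  // like 2^Len - 1.
  return (((Mask >> Idx) + 1) & (Mask >> Idx)) == 0;
}

// Mirrors the shl case of ARMTargetLowering::isDesirableToCommuteXorWithShift:
// for xor((shl X, ShAmt), XorC) the NOT mask must cover exactly the bits the
// shift produced, i.e. start at ShAmt and run to the top of the type.
static bool desirableForShl(uint64_t XorC, unsigned ShAmt, unsigned BitWidth) {
  unsigned Idx, Len;
  return isShiftedMask(XorC, Idx, Len) && Idx == ShAmt &&
         Len == BitWidth - ShAmt;
}

int main() {
  // 0xFFFFFFF0 starts at bit 4 and reaches bit 31 of an i32, so commuting
  // xor((x << 4), 0xFFFFFFF0) is allowed.
  assert(desirableForShl(0xFFFFFFF0ULL, 4, 32));
  // 0x0000FF00 stops short of the top bit, so it is rejected.
  assert(!desirableForShl(0x0000FF00ULL, 4, 32));
  return 0;
}

For the srl form the same hook requires the mask to start at bit 0 instead (MaskIdx == 0 in the code above).
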
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 10f60ab93ae3..fae279ea7569 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -733,6 +733,8 @@ class VectorType;
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
+ bool isDesirableToCommuteXorWithShift(const SDNode *N) const override;
+
bool shouldFoldConstantShiftPairToMask(const SDNode *N,
CombineLevel Level) const override;
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 15c33014e988..9c03f72fe6ae 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -1882,6 +1882,7 @@ let Predicates = [HasMVEInt] in {
def : Pat<(ARMvgetlaneu (v8f16 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
// For i16 inserts whose value is extracted from a low (even) lane, we may use VINS.
+ let Predicates = [HasFullFP16] in {
def : Pat<(ARMinsertelt (v8i16 MQPR:$src1),
(ARMvgetlaneu (v8i16 MQPR:$src2), imm_even:$extlane),
imm_odd:$inslane),
@@ -1889,6 +1890,7 @@ let Predicates = [HasMVEInt] in {
(VINSH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_odd:$inslane)),
(EXTRACT_SUBREG MQPR:$src2, (SSubReg_f16_reg imm_even:$extlane))),
(SSubReg_f16_reg imm_odd:$inslane)), MQPR)>;
+ }
def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
@@ -1905,17 +1907,21 @@ let Predicates = [HasMVEInt] in {
def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm_even:$lane),
(MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS (f16 HPR:$src2), rGPR), imm:$lane)>;
+ let Predicates = [HasFullFP16] in {
def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm_odd:$lane),
(COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
(VINSH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_odd:$lane)),
(COPY_TO_REGCLASS HPR:$src2, SPR)),
(SSubReg_f16_reg imm_odd:$lane)), MQPR)>;
+ }
def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane),
(EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>;
+ let Predicates = [HasFullFP16] in {
def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane),
(COPY_TO_REGCLASS
(VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))),
HPR)>;
+ }
def : Pat<(v2f64 (scalar_to_vector (f64 DPR:$src))),
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, dsub_0)>;
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td
index 71527ae1ab11..8f7039a327b3 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -1782,11 +1782,15 @@ def tLDRConstPool
let Constraints = "@earlyclobber $Rd,@earlyclobber $temp",
mayLoad = 1, mayStore = 1 in {
-def tCMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
+def tCMP_SWAP_8 : PseudoInst<(outs GPR:$Rd, tGPR:$temp),
(ins GPR:$addr, tGPR:$desired, GPR:$new),
NoItinerary, []>, Sched<[]>;
-def tCMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, GPR:$temp),
+def tCMP_SWAP_16 : PseudoInst<(outs GPR:$Rd, tGPR:$temp),
(ins GPR:$addr, tGPR:$desired, GPR:$new),
NoItinerary, []>, Sched<[]>;
+
+def tCMP_SWAP_32 : PseudoInst<(outs GPR:$Rd, tGPR:$temp),
+ (ins GPR:$addr, GPR:$desired, GPR:$new),
+ NoItinerary, []>, Sched<[]>;
}
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index ba1d806c8d81..3c102463ba08 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -20,8 +20,8 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
@@ -33,6 +33,7 @@
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -2197,12 +2198,9 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
return true;
}
-bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
- ScalarEvolution &SE,
- AssumptionCache &AC,
- TargetLibraryInfo *TLI,
- DominatorTree *DT,
- const LoopAccessInfo *LAI) {
+bool ARMTTIImpl::preferPredicateOverEpilogue(
+ Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+ TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
if (!EnableTailPredication) {
LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
return false;
@@ -2244,7 +2242,7 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
return false;
}
- return canTailPredicateLoop(L, LI, SE, DL, LAI);
+ return canTailPredicateLoop(L, LI, SE, DL, LVL->getLAI());
}
PredicationStyle ARMTTIImpl::emitGetActiveLaneMask() const {
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index dcf82e703a7f..9c3980d79e60 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -288,12 +288,10 @@ public:
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo);
- bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
- ScalarEvolution &SE,
- AssumptionCache &AC,
- TargetLibraryInfo *TLI,
+ bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+ AssumptionCache &AC, TargetLibraryInfo *TLI,
DominatorTree *DT,
- const LoopAccessInfo *LAI);
+ LoopVectorizationLegality *LVL);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
OptimizationRemarkEmitter *ORE);
diff --git a/llvm/lib/Target/AVR/AVRSubtarget.h b/llvm/lib/Target/AVR/AVRSubtarget.h
index 2325193bac0a..3dd71243387b 100644
--- a/llvm/lib/Target/AVR/AVRSubtarget.h
+++ b/llvm/lib/Target/AVR/AVRSubtarget.h
@@ -92,15 +92,15 @@ public:
}
/// Get I/O register addresses.
- int getIORegRAMPZ(void) const { return hasELPM() ? 0x3b : -1; }
- int getIORegEIND(void) const { return hasEIJMPCALL() ? 0x3c : -1; }
- int getIORegSPL(void) const { return 0x3d; }
- int getIORegSPH(void) const { return hasSmallStack() ? -1 : 0x3e; }
- int getIORegSREG(void) const { return 0x3f; }
+ int getIORegRAMPZ() const { return hasELPM() ? 0x3b : -1; }
+ int getIORegEIND() const { return hasEIJMPCALL() ? 0x3c : -1; }
+ int getIORegSPL() const { return 0x3d; }
+ int getIORegSPH() const { return hasSmallStack() ? -1 : 0x3e; }
+ int getIORegSREG() const { return 0x3f; }
/// Get GPR aliases.
- int getRegTmpIndex(void) const { return hasTinyEncoding() ? 16 : 0; }
- int getRegZeroIndex(void) const { return hasTinyEncoding() ? 17 : 1; }
+ int getRegTmpIndex() const { return hasTinyEncoding() ? 16 : 0; }
+ int getRegZeroIndex() const { return hasTinyEncoding() ? 17 : 1; }
private:
/// The ELF e_flags architecture.
diff --git a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
index d490b385ac16..0bf739452fd2 100644
--- a/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
+++ b/llvm/lib/Target/CSKY/CSKYInstrInfo.cpp
@@ -518,7 +518,7 @@ void CSKYInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
unsigned Opcode = 0;
if (CSKY::GPRRegClass.contains(DestReg, SrcReg))
- Opcode = CSKY::MOV32;
+ Opcode = STI.hasE2() ? CSKY::MOV32 : CSKY::MOV16;
else if (v2sf && CSKY::sFPR32RegClass.contains(DestReg, SrcReg))
Opcode = CSKY::FMOV_S;
else if (v3sf && CSKY::FPR32RegClass.contains(DestReg, SrcReg))
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
index 3e09270a66d0..869433613620 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILBitcodeWriter.cpp
@@ -13,6 +13,7 @@
#include "DXILBitcodeWriter.h"
#include "DXILValueEnumerator.h"
#include "PointerTypeAnalysis.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Bitcode/BitcodeCommon.h"
#include "llvm/Bitcode/BitcodeReader.h"
@@ -2580,10 +2581,9 @@ void DXILBitcodeWriter::writeFunctionLevelValueSymbolTable(
SortedTable.push_back(VI.second->getValueName());
}
// The keys are unique, so there shouldn't be stability issues.
- std::sort(SortedTable.begin(), SortedTable.end(),
- [](const ValueName *A, const ValueName *B) {
- return A->first() < B->first();
- });
+ llvm::sort(SortedTable, [](const ValueName *A, const ValueName *B) {
+ return A->first() < B->first();
+ });
for (const ValueName *SI : SortedTable) {
auto &Name = *SI;
diff --git a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp
index 08944ee3f1fe..e2a41515de38 100644
--- a/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp
+++ b/llvm/lib/Target/DirectX/DXILWriter/DXILValueEnumerator.cpp
@@ -809,7 +809,7 @@ void ValueEnumerator::organizeMetadata() {
// - by function, then
// - by isa<MDString>
// and then sort by the original/current ID. Since the IDs are guaranteed to
- // be unique, the result of std::sort will be deterministic. There's no need
+ // be unique, the result of llvm::sort will be deterministic. There's no need
// for std::stable_sort.
llvm::sort(Order, [this](MDIndex LHS, MDIndex RHS) {
return std::make_tuple(LHS.F, getMetadataTypeOrder(LHS.get(MDs)), LHS.ID) <
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index abd84a188cfa..bd0232c71d48 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -85,7 +85,6 @@ public:
int getAllocSizeOf(const Type *Ty) const;
int getTypeAlignment(Type *Ty) const;
- VectorType *getByteVectorTy(int ScLen) const;
Constant *getNullValue(Type *Ty) const;
Constant *getFullValue(Type *Ty) const;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 4acf90bd9788..93c8864347bb 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -217,9 +217,8 @@ SDValue LoongArchTargetLowering::lowerGlobalAddress(SDValue Op,
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
unsigned ADDIOp = Subtarget.is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W;
- // FIXME: Only support PC-relative addressing to access the symbol.
- // TODO: Add target flags.
- if (!isPositionIndependent()) {
+ // TODO: Support dso_preemptable and target flags.
+ if (GV->isDSOLocal()) {
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty);
SDValue AddrHi(DAG.getMachineNode(LoongArch::PCALAU12I, DL, Ty, GA), 0);
SDValue Addr(DAG.getMachineNode(ADDIOp, DL, Ty, AddrHi, GA), 0);
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
index 468c4f43cb90..2d08d5c674bc 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp
@@ -38,9 +38,7 @@ static std::string computeDataLayout(const Triple &TT) {
static Reloc::Model getEffectiveRelocModel(const Triple &TT,
Optional<Reloc::Model> RM) {
- if (!RM.hasValue())
- return Reloc::Static;
- return *RM;
+ return RM.value_or(Reloc::Static);
}
LoongArchTargetMachine::LoongArchTargetMachine(
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index b98be4ae4b75..4dfc16526a00 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -1192,6 +1192,12 @@ bool MipsTargetLowering::hasBitTest(SDValue X, SDValue Y) const {
bool MipsTargetLowering::shouldFoldConstantShiftPairToMask(
const SDNode *N, CombineLevel Level) const {
+ assert(((N->getOpcode() == ISD::SHL &&
+ N->getOperand(0).getOpcode() == ISD::SRL) ||
+ (N->getOpcode() == ISD::SRL &&
+ N->getOperand(0).getOpcode() == ISD::SHL)) &&
+ "Expected shift-shift mask");
+
if (N->getOperand(0).getValueType().isVector())
return false;
return true;
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 9977d8ba0300..45e82e935772 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -73,8 +73,10 @@
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
+#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -354,8 +356,7 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
// PTX ABI requires all scalar return values to be at least 32
// bits in size. fp16 normally uses .b16 as its storage type in
// PTX, so its size must be adjusted here, too.
- if (size < 32)
- size = 32;
+ size = promoteScalarArgumentSize(size);
O << ".param .b" << size << " func_retval0";
} else if (isa<PointerType>(Ty)) {
@@ -384,8 +385,8 @@ void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
for (unsigned j = 0, je = elems; j != je; ++j) {
unsigned sz = elemtype.getSizeInBits();
- if (elemtype.isInteger() && (sz < 32))
- sz = 32;
+ if (elemtype.isInteger())
+ sz = promoteScalarArgumentSize(sz);
O << ".reg .b" << sz << " func_retval" << idx;
if (j < je - 1)
O << ", ";
@@ -1168,31 +1169,37 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
GVar->hasInitializer()) {
const Constant *Initializer = GVar->getInitializer();
if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) {
- AggBuffer aggBuffer(ElementSize, O, *this);
+ AggBuffer aggBuffer(ElementSize, *this);
bufferAggregateConstant(Initializer, &aggBuffer);
- if (aggBuffer.numSymbols) {
- if (static_cast<const NVPTXTargetMachine &>(TM).is64Bit()) {
- O << " .u64 ";
+ if (aggBuffer.numSymbols()) {
+ unsigned int ptrSize = MAI->getCodePointerSize();
+ if (ElementSize % ptrSize ||
+ !aggBuffer.allSymbolsAligned(ptrSize)) {
+ // Print in bytes and use the mask() operator for pointers.
+ if (!STI.hasMaskOperator())
+ report_fatal_error(
+ "initialized packed aggregate with pointers '" +
+ GVar->getName() +
+ "' requires at least PTX ISA version 7.1");
+ O << " .u8 ";
getSymbol(GVar)->print(O, MAI);
- O << "[";
- O << ElementSize / 8;
+ O << "[" << ElementSize << "] = {";
+ aggBuffer.printBytes(O);
+ O << "}";
} else {
- O << " .u32 ";
+ O << " .u" << ptrSize * 8 << " ";
getSymbol(GVar)->print(O, MAI);
- O << "[";
- O << ElementSize / 4;
+ O << "[" << ElementSize / ptrSize << "] = {";
+ aggBuffer.printWords(O);
+ O << "}";
}
- O << "]";
} else {
O << " .b8 ";
getSymbol(GVar)->print(O, MAI);
- O << "[";
- O << ElementSize;
- O << "]";
+ O << "[" << ElementSize << "] = {";
+ aggBuffer.printBytes(O);
+ O << "}";
}
- O << " = {";
- aggBuffer.print();
- O << "}";
} else {
O << " .b8 ";
getSymbol(GVar)->print(O, MAI);
@@ -1219,6 +1226,80 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
O << ";\n";
}
+void NVPTXAsmPrinter::AggBuffer::printSymbol(unsigned nSym, raw_ostream &os) {
+ const Value *v = Symbols[nSym];
+ const Value *v0 = SymbolsBeforeStripping[nSym];
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+ MCSymbol *Name = AP.getSymbol(GVar);
+ PointerType *PTy = dyn_cast<PointerType>(v0->getType());
+ // Is v0 a generic pointer?
+ bool isGenericPointer = PTy && PTy->getAddressSpace() == 0;
+ if (EmitGeneric && isGenericPointer && !isa<Function>(v)) {
+ os << "generic(";
+ Name->print(os, AP.MAI);
+ os << ")";
+ } else {
+ Name->print(os, AP.MAI);
+ }
+ } else if (const ConstantExpr *CExpr = dyn_cast<ConstantExpr>(v0)) {
+ const MCExpr *Expr = AP.lowerConstantForGV(cast<Constant>(CExpr), false);
+ AP.printMCExpr(*Expr, os);
+ } else
+ llvm_unreachable("symbol type unknown");
+}
+
+void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) {
+ unsigned int ptrSize = AP.MAI->getCodePointerSize();
+ symbolPosInBuffer.push_back(size);
+ unsigned int nSym = 0;
+ unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
+ for (unsigned int pos = 0; pos < size;) {
+ if (pos)
+ os << ", ";
+ if (pos != nextSymbolPos) {
+ os << (unsigned int)buffer[pos];
+ ++pos;
+ continue;
+ }
+ // Generate a per-byte mask() operator for the symbol, which looks like:
+ // .global .u8 addr[] = {0xFF(foo), 0xFF00(foo), 0xFF0000(foo), ...};
+ // See https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#initializers
+ std::string symText;
+ llvm::raw_string_ostream oss(symText);
+ printSymbol(nSym, oss);
+ for (unsigned i = 0; i < ptrSize; ++i) {
+ if (i)
+ os << ", ";
+ llvm::write_hex(os, 0xFFULL << i * 8, HexPrintStyle::PrefixUpper);
+ os << "(" << symText << ")";
+ }
+ pos += ptrSize;
+ nextSymbolPos = symbolPosInBuffer[++nSym];
+ assert(nextSymbolPos >= pos);
+ }
+}
+
+void NVPTXAsmPrinter::AggBuffer::printWords(raw_ostream &os) {
+ unsigned int ptrSize = AP.MAI->getCodePointerSize();
+ symbolPosInBuffer.push_back(size);
+ unsigned int nSym = 0;
+ unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
+ assert(nextSymbolPos % ptrSize == 0);
+ for (unsigned int pos = 0; pos < size; pos += ptrSize) {
+ if (pos)
+ os << ", ";
+ if (pos == nextSymbolPos) {
+ printSymbol(nSym, os);
+ nextSymbolPos = symbolPosInBuffer[++nSym];
+ assert(nextSymbolPos % ptrSize == 0);
+ assert(nextSymbolPos >= pos + ptrSize);
+ } else if (ptrSize == 4)
+ os << support::endian::read32le(&buffer[pos]);
+ else
+ os << support::endian::read64le(&buffer[pos]);
+ }
+}
+
void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) {
if (localDecls.find(f) == localDecls.end())
return;
@@ -1494,8 +1575,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
unsigned sz = 0;
if (isa<IntegerType>(Ty)) {
sz = cast<IntegerType>(Ty)->getBitWidth();
- if (sz < 32)
- sz = 32;
+ sz = promoteScalarArgumentSize(sz);
} else if (isa<PointerType>(Ty))
sz = thePointerTy.getSizeInBits();
else if (Ty->isHalfTy())
@@ -1559,8 +1639,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
for (unsigned j = 0, je = elems; j != je; ++j) {
unsigned sz = elemtype.getSizeInBits();
- if (elemtype.isInteger() && (sz < 32))
- sz = 32;
+ if (elemtype.isInteger())
+ sz = promoteScalarArgumentSize(sz);
O << "\t.reg .b" << sz << " ";
printParamName(I, paramIndex, O);
if (j < je - 1)
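
AggBuffer::printBytes represents a pointer inside a byte-wise initializer by expanding it into one mask() term per byte, e.g. .global .u8 addr[] = {0xFF(foo), 0xFF00(foo), ...}; printWords is still used when the element size is a multiple of the pointer size and every symbol is pointer-aligned. A small standalone sketch of the per-byte expansion, assuming an 8-byte pointer; the helper name is made up for illustration:

#include <cstdio>
#include <string>

// Print the PTX mask() terms that stand in for an 8-byte pointer to `sym`
// inside a .u8 initializer, mirroring AggBuffer::printBytes.
static std::string maskedPointerBytes(const std::string &sym) {
  std::string out;
  for (unsigned i = 0; i < 8; ++i) {
    if (i)
      out += ", ";
    char buf[32];
    std::snprintf(buf, sizeof(buf), "0x%llX", 0xFFULL << (i * 8));
    out += buf;
    out += "(" + sym + ")";
  }
  return out;
}

int main() {
  // Produces: 0xFF(foo), 0xFF00(foo), 0xFF0000(foo), ... up to byte 7.
  std::printf(".global .u8 addr[8] = {%s};\n",
              maskedPointerBytes("foo").c_str());
  return 0;
}
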
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index cd61e99a103a..710c089e3325 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -61,24 +61,30 @@ class MCOperand;
class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
class AggBuffer {
- // Used to buffer the emitted string for initializing global
- // aggregates.
+ // Used to buffer the emitted string for initializing global aggregates.
//
- // Normally an aggregate (array, vector or structure) is emitted
- // as a u8[]. However, if one element/field of the aggregate
- // is a non-NULL address, then the aggregate is emitted as u32[]
- // or u64[].
+ // Normally an aggregate (array, vector, or structure) is emitted as a u8[].
+  // However, if any element/field of the aggregate is a non-NULL address,
+ // and all such addresses are properly aligned, then the aggregate is
+ // emitted as u32[] or u64[]. In the case of unaligned addresses, the
+ // aggregate is emitted as u8[], and the mask() operator is used for all
+ // pointers.
//
- // We first layout the aggregate in 'buffer' in bytes, except for
- // those symbol addresses. For the i-th symbol address in the
- //aggregate, its corresponding 4-byte or 8-byte elements in 'buffer'
- // are filled with 0s. symbolPosInBuffer[i-1] records its position
- // in 'buffer', and Symbols[i-1] records the Value*.
+ // We first layout the aggregate in 'buffer' in bytes, except for those
+ // symbol addresses. For the i-th symbol address in the aggregate, its
+ // corresponding 4-byte or 8-byte elements in 'buffer' are filled with 0s.
+ // symbolPosInBuffer[i-1] records its position in 'buffer', and Symbols[i-1]
+ // records the Value*.
//
- // Once we have this AggBuffer setup, we can choose how to print
- // it out.
+ // Once we have this AggBuffer setup, we can choose how to print it out.
public:
- unsigned numSymbols; // number of symbol addresses
+ // number of symbol addresses
+ unsigned numSymbols() const { return Symbols.size(); }
+
+ bool allSymbolsAligned(unsigned ptrSize) const {
+ return llvm::all_of(symbolPosInBuffer,
+ [=](unsigned pos) { return pos % ptrSize == 0; });
+ }
private:
const unsigned size; // size of the buffer in bytes
@@ -94,15 +100,13 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
// SymbolsBeforeStripping[i].
SmallVector<const Value *, 4> SymbolsBeforeStripping;
unsigned curpos;
- raw_ostream &O;
NVPTXAsmPrinter &AP;
bool EmitGeneric;
public:
- AggBuffer(unsigned size, raw_ostream &O, NVPTXAsmPrinter &AP)
- : size(size), buffer(size), O(O), AP(AP) {
+ AggBuffer(unsigned size, NVPTXAsmPrinter &AP)
+ : size(size), buffer(size), AP(AP) {
curpos = 0;
- numSymbols = 0;
EmitGeneric = AP.EmitGeneric;
}
@@ -135,63 +139,13 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
symbolPosInBuffer.push_back(curpos);
Symbols.push_back(GVar);
SymbolsBeforeStripping.push_back(GVarBeforeStripping);
- numSymbols++;
}
- void print() {
- if (numSymbols == 0) {
- // print out in bytes
- for (unsigned i = 0; i < size; i++) {
- if (i)
- O << ", ";
- O << (unsigned int) buffer[i];
- }
- } else {
- // print out in 4-bytes or 8-bytes
- unsigned int pos = 0;
- unsigned int nSym = 0;
- unsigned int nextSymbolPos = symbolPosInBuffer[nSym];
- unsigned int nBytes = 4;
- if (static_cast<const NVPTXTargetMachine &>(AP.TM).is64Bit())
- nBytes = 8;
- for (pos = 0; pos < size; pos += nBytes) {
- if (pos)
- O << ", ";
- if (pos == nextSymbolPos) {
- const Value *v = Symbols[nSym];
- const Value *v0 = SymbolsBeforeStripping[nSym];
- if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
- MCSymbol *Name = AP.getSymbol(GVar);
- PointerType *PTy = dyn_cast<PointerType>(v0->getType());
- bool IsNonGenericPointer = false; // Is v0 a non-generic pointer?
- if (PTy && PTy->getAddressSpace() != 0) {
- IsNonGenericPointer = true;
- }
- if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
- O << "generic(";
- Name->print(O, AP.MAI);
- O << ")";
- } else {
- Name->print(O, AP.MAI);
- }
- } else if (const ConstantExpr *CExpr = dyn_cast<ConstantExpr>(v0)) {
- const MCExpr *Expr =
- AP.lowerConstantForGV(cast<Constant>(CExpr), false);
- AP.printMCExpr(*Expr, O);
- } else
- llvm_unreachable("symbol type unknown");
- nSym++;
- if (nSym >= numSymbols)
- nextSymbolPos = size + 1;
- else
- nextSymbolPos = symbolPosInBuffer[nSym];
- } else if (nBytes == 4)
- O << *(unsigned int *)(&buffer[pos]);
- else
- O << *(unsigned long long *)(&buffer[pos]);
- }
- }
- }
+ void printBytes(raw_ostream &os);
+ void printWords(raw_ostream &os);
+
+ private:
+ void printSymbol(unsigned nSym, raw_ostream &os);
};
friend class AggBuffer;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 6ad016dfa0a7..8264032b765a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -206,6 +206,40 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
}
}
+/// PromoteScalarIntegerPTX
+/// Used to make sure the arguments/returns are suitable for passing
+/// and promote them to a larger size if they're not.
+///
+/// The promoted type is placed in \p PromotedVT if the function returns true.
+static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
+ if (VT.isScalarInteger()) {
+ switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
+ default:
+ llvm_unreachable(
+ "Promotion is not suitable for scalars of size larger than 64-bits");
+ case 1:
+ *PromotedVT = MVT::i1;
+ break;
+ case 2:
+ case 4:
+ case 8:
+ *PromotedVT = MVT::i8;
+ break;
+ case 16:
+ *PromotedVT = MVT::i16;
+ break;
+ case 32:
+ *PromotedVT = MVT::i32;
+ break;
+ case 64:
+ *PromotedVT = MVT::i64;
+ break;
+ }
+ return EVT(*PromotedVT) != VT;
+ }
+ return false;
+}
+
// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
@@ -1291,8 +1325,7 @@ std::string NVPTXTargetLowering::getPrototype(
// PTX ABI requires all scalar return values to be at least 32
// bits in size. fp16 normally uses .b16 as its storage type in
// PTX, so its size must be adjusted here, too.
- if (size < 32)
- size = 32;
+ size = promoteScalarArgumentSize(size);
O << ".param .b" << size << " _";
} else if (isa<PointerType>(retTy)) {
@@ -1343,8 +1376,7 @@ std::string NVPTXTargetLowering::getPrototype(
unsigned sz = 0;
if (isa<IntegerType>(Ty)) {
sz = cast<IntegerType>(Ty)->getBitWidth();
- if (sz < 32)
- sz = 32;
+ sz = promoteScalarArgumentSize(sz);
} else if (isa<PointerType>(Ty)) {
sz = PtrVT.getSizeInBits();
} else if (Ty->isHalfTy())
@@ -1515,11 +1547,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
NeedAlign = true;
} else {
// declare .param .b<size> .param<n>;
- if ((VT.isInteger() || VT.isFloatingPoint()) && TypeSize < 4) {
+ if (VT.isInteger() || VT.isFloatingPoint()) {
// PTX ABI requires integral types to be at least 32 bits in
// size. FP16 is loaded/stored using i16, so it's handled
// here as well.
- TypeSize = 4;
+ TypeSize = promoteScalarArgumentSize(TypeSize * 8) / 8;
}
SDValue DeclareScalarParamOps[] = {
Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
@@ -1556,6 +1588,17 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
SDValue StVal = OutVals[OIdx];
+
+ MVT PromotedVT;
+ if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
+ EltVT = EVT(PromotedVT);
+ }
+ if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
+ llvm::ISD::NodeType Ext =
+ Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
+ }
+
if (IsByVal) {
auto PtrVT = getPointerTy(DL);
SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
@@ -1638,9 +1681,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Plus, this behavior is consistent with nvcc's.
if (RetTy->isFloatingPointTy() || RetTy->isPointerTy() ||
(RetTy->isIntegerTy() && !RetTy->isIntegerTy(128))) {
- // Scalar needs to be at least 32bit wide
- if (resultsz < 32)
- resultsz = 32;
+ resultsz = promoteScalarArgumentSize(resultsz);
SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
DAG.getConstant(resultsz, dl, MVT::i32),
@@ -1778,6 +1819,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
EVT TheLoadType = VTs[i];
EVT EltType = Ins[i].VT;
Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
+ MVT PromotedVT;
+
+ if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
+ TheLoadType = EVT(PromotedVT);
+ EltType = EVT(PromotedVT);
+ needTruncate = true;
+ }
+
if (ExtendIntegerRetVal) {
TheLoadType = MVT::i32;
EltType = MVT::i32;
@@ -2558,6 +2607,13 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
// v2f16 was loaded as an i32. Now we must bitcast it back.
else if (EltVT == MVT::v2f16)
Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
+
+ // If a promoted integer type is used, truncate down to the original
+ MVT PromotedVT;
+ if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
+ Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+ }
+
// Extend the element if necessary (e.g. an i8 is loaded
// into an i16 register)
if (Ins[InsIdx].VT.isInteger() &&
@@ -2627,11 +2683,26 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return Chain;
const DataLayout &DL = DAG.getDataLayout();
+ SmallVector<SDValue, 16> PromotedOutVals;
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+ SDValue PromotedOutVal = OutVals[i];
+ MVT PromotedVT;
+ if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
+ VTs[i] = EVT(PromotedVT);
+ }
+ if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
+ llvm::ISD::NodeType Ext =
+ Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
+ }
+ PromotedOutVals.push_back(PromotedOutVal);
+ }
+
auto VectorInfo = VectorizePTXValueVTs(
VTs, Offsets,
RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
@@ -2652,12 +2723,14 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
}
- SDValue RetVal = OutVals[i];
+ SDValue OutVal = OutVals[i];
+ SDValue RetVal = PromotedOutVals[i];
+
if (ExtendIntegerRetVal) {
RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
: ISD::ZERO_EXTEND,
dl, MVT::i32, RetVal);
- } else if (RetVal.getValueSizeInBits() < 16) {
+ } else if (OutVal.getValueSizeInBits() < 16) {
// Use 16-bit registers for small load-stores as it's the
// smallest general purpose register size supported by NVPTX.
RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
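
PromoteScalarIntegerPTX rounds an odd-sized scalar integer up to the nearest width PTX can actually pass (i1, i8, i16, i32 or i64); LowerCall and LowerReturn then sign- or zero-extend into the promoted type, and LowerFormalArguments truncates back. A rough standalone sketch of just the width mapping, using plain integers instead of MVTs, so it is only an approximation of the real helper:

#include <cassert>

// Round a scalar integer bit-width up to the width PTX can pass, following
// the switch in PromoteScalarIntegerPTX.
static unsigned promotedBits(unsigned Bits) {
  // Next power of two of Bits.
  unsigned P = 1;
  while (P < Bits)
    P *= 2;
  if (P == 1)
    return 1;            // i1 stays i1
  if (P <= 8)
    return 8;            // i2..i8 -> i8
  if (P == 16 || P == 32 || P == 64)
    return P;
  return 0;              // >64 bits is not handled on this path
}

int main() {
  assert(promotedBits(3) == 8);   // i3  -> i8
  assert(promotedBits(24) == 32); // i24 -> i32
  assert(promotedBits(40) == 64); // i40 -> i64
  return 0;
}
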
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 9a249d3da3d5..cea3dce3f1c5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -77,6 +77,7 @@ public:
bool hasImageHandles() const;
bool hasFP16Math() const { return SmVersion >= 53; }
bool allowFP16Math() const;
+ bool hasMaskOperator() const { return PTXVersion >= 71; }
unsigned int getSmVersion() const { return SmVersion; }
std::string getTargetName() const { return TargetName; }
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
index bf1524194cfb..6fee57b4664e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h
@@ -59,6 +59,16 @@ bool isKernelFunction(const Function &);
bool getAlign(const Function &, unsigned index, unsigned &);
bool getAlign(const CallInst &, unsigned index, unsigned &);
+// The PTX ABI requires all scalar argument/return values to have a
+// bit-size that is a power of two and at least 32 bits.
+inline unsigned promoteScalarArgumentSize(unsigned size) {
+ if (size <= 32)
+ return 32;
+ else if (size <= 64)
+ return 64;
+ else
+ return size;
+}
}
#endif
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 4247cf557c2a..14c4fd3a9ffa 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -5473,7 +5473,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
case ISD::MUL: {
SDValue Op1 = N->getOperand(1);
- if (Op1.getOpcode() != ISD::Constant || Op1.getValueType() != MVT::i64)
+ if (Op1.getOpcode() != ISD::Constant ||
+ (Op1.getValueType() != MVT::i64 && Op1.getValueType() != MVT::i32))
break;
// If the multiplier fits int16, we can handle it with mulli.
@@ -5486,13 +5487,27 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// (mul X, c1 << c2) -> (rldicr (mulli X, c1) c2). We do this in ISEL due to
// DAGCombiner prefers (shl (mul X, c1), c2) -> (mul X, c1 << c2).
uint64_t ImmSh = Imm >> Shift;
- if (isInt<16>(ImmSh)) {
- uint64_t SextImm = SignExtend64(ImmSh & 0xFFFF, 16);
+ if (!isInt<16>(ImmSh))
+ break;
+
+ uint64_t SextImm = SignExtend64(ImmSh & 0xFFFF, 16);
+ if (Op1.getValueType() == MVT::i64) {
SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i64);
SDNode *MulNode = CurDAG->getMachineNode(PPC::MULLI8, dl, MVT::i64,
N->getOperand(0), SDImm);
- CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, SDValue(MulNode, 0),
- getI32Imm(Shift, dl), getI32Imm(63 - Shift, dl));
+
+ SDValue Ops[] = {SDValue(MulNode, 0), getI32Imm(Shift, dl),
+ getI32Imm(63 - Shift, dl)};
+ CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops);
+ return;
+ } else {
+ SDValue SDImm = CurDAG->getTargetConstant(SextImm, dl, MVT::i32);
+ SDNode *MulNode = CurDAG->getMachineNode(PPC::MULLI, dl, MVT::i32,
+ N->getOperand(0), SDImm);
+
+ SDValue Ops[] = {SDValue(MulNode, 0), getI32Imm(Shift, dl),
+ getI32Imm(0, dl), getI32Imm(31 - Shift, dl)};
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
return;
}
break;
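
The ISD::MUL selection above only fires when the immediate multiplier can be written as c1 << c2 with c1 fitting a signed 16-bit immediate, so the multiply becomes mulli followed by a single shift (RLDICR for i64, and now RLWINM for the new i32 path). A sketch of that decomposition with plain integers and illustrative names:

#include <cassert>
#include <cstdint>

// Split Imm into (ImmSh << Shift) with ImmSh representable as a signed
// 16-bit immediate, the form PPCDAGToDAGISel::Select handles with
// mulli + rldicr/rlwinm. Returns false if no such split exists.
static bool splitMulImm(uint64_t Imm, int64_t &ImmSh, unsigned &Shift) {
  if (Imm == 0)
    return false;
  Shift = __builtin_ctzll(Imm);
  ImmSh = static_cast<int64_t>(Imm >> Shift);
  return ImmSh >= INT16_MIN && ImmSh <= INT16_MAX;
}

int main() {
  int64_t C;
  unsigned Sh;
  // 0x50000 = 5 << 16: becomes (rldicr/rlwinm (mulli X, 5), 16, ...).
  assert(splitMulImm(0x50000, C, Sh) && C == 5 && Sh == 16);
  // 0x123450000 shifts down to 0x12345, which does not fit simm16, so the
  // generic multiply lowering is used instead.
  assert(!splitMulImm(0x123450000ULL, C, Sh));
  return 0;
}
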
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 59486c323567..c85f57f04c7d 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1086,8 +1086,8 @@ unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
// For opcodes with the ReMaterializable flag set, this function is called to
// verify the instruction is really rematable.
-bool PPCInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AliasAnalysis *AA) const {
+bool PPCInstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
// This function should only be called for opcodes with the ReMaterializable
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index e22b0086bde8..980bb3107a8b 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -495,8 +495,7 @@ public:
unsigned &SubIdx) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
- bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index 4689c0638ca6..23703ac54d0e 100644
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -568,7 +568,7 @@ bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains(
const SCEVAddRecExpr *BasePtrSCEV = cast<SCEVAddRecExpr>(BaseSCEV);
// Make sure the base is able to expand.
- if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE))
+ if (!SCEVE.isSafeToExpand(BasePtrSCEV->getStart()))
return MadeChange;
assert(BasePtrSCEV->isAffine() &&
@@ -602,7 +602,7 @@ bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains(
// Make sure offset is able to expand. Only need to check one time as the
// offsets are reused between different chains.
if (!BaseElemIdx)
- if (!isSafeToExpand(OffsetSCEV, *SE))
+ if (!SCEVE.isSafeToExpand(OffsetSCEV))
return false;
Value *OffsetValue = SCEVE.expandCodeFor(
@@ -1018,14 +1018,13 @@ bool PPCLoopInstrFormPrep::rewriteLoadStores(
if (!BasePtrSCEV->isAffine())
return MadeChange;
- if (!isSafeToExpand(BasePtrSCEV->getStart(), *SE))
- return MadeChange;
-
- SmallPtrSet<Value *, 16> DeletedPtrs;
-
BasicBlock *Header = L->getHeader();
SCEVExpander SCEVE(*SE, Header->getModule()->getDataLayout(),
"loopprepare-formrewrite");
+ if (!SCEVE.isSafeToExpand(BasePtrSCEV->getStart()))
+ return MadeChange;
+
+ SmallPtrSet<Value *, 16> DeletedPtrs;
// For some DS form load/store instructions, it can also be an update form,
// if the stride is constant and is a multiple of 4. Use update form if
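
The prep pass now queries the SCEVExpander it already constructs instead of the free-standing isSafeToExpand helper, which is why the expander is created before the safety check. A minimal sketch of that call order against the LLVM API; the surrounding analysis setup is assumed:

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"

using namespace llvm;

// Expand Start at InsertPt only when the expander itself reports it is
// safe, mirroring the reordering in rewriteLoadStores.
static Value *expandIfSafe(ScalarEvolution &SE, const SCEV *Start, Type *Ty,
                           Instruction *InsertPt) {
  SCEVExpander Expander(SE, InsertPt->getModule()->getDataLayout(),
                        "loopprepare-sketch");
  if (!Expander.isSafeToExpand(Start))
    return nullptr;
  return Expander.expandCodeFor(Start, Ty, InsertPt);
}
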
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 917837a307ad..e6140edc8403 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -30,6 +30,9 @@ class MachineInstr;
class MachineOperand;
class PassRegistry;
+FunctionPass *createRISCVCodeGenPreparePass();
+void initializeRISCVCodeGenPreparePass(PassRegistry &);
+
bool lowerRISCVMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
AsmPrinter &AP);
bool lowerRISCVMachineOperandToMCOperand(const MachineOperand &MO,
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index e783ef38b448..8a6f69c7f7ca 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -19,6 +19,19 @@ def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">,
AssemblerPredicate<(all_of FeatureStdExtM),
"'M' (Integer Multiplication and Division)">;
+def FeatureStdExtZmmul
+ : SubtargetFeature<"zmmul", "HasStdExtZmmul", "true",
+ "'Zmmul' (Integer Multiplication)">;
+def HasStdExtZmmul : Predicate<"Subtarget->hasStdExtZmmul()">,
+ AssemblerPredicate<(all_of FeatureStdExtZmmul),
+ "'Zmmul' (Integer Multiplication)">;
+
+def HasStdExtMOrZmmul
+ : Predicate<"Subtarget->hasStdExtM() || Subtarget->hasStdExtZmmul()">,
+ AssemblerPredicate<(any_of FeatureStdExtM, FeatureStdExtZmmul),
+ "'M' (Integer Multiplication and Division) or "
+ "'Zmmul' (Integer Multiplication)">;
+
def FeatureStdExtA
: SubtargetFeature<"a", "HasStdExtA", "true",
"'A' (Atomic Instructions)">;
@@ -465,7 +478,8 @@ def TuneNoDefaultUnroll
"Disable default unroll preference.">;
def TuneSiFive7 : SubtargetFeature<"sifive7", "RISCVProcFamily", "SiFive7",
- "SiFive 7-Series processors">;
+ "SiFive 7-Series processors",
+ [TuneNoDefaultUnroll]>;
//===----------------------------------------------------------------------===//
// Named operands for CSR instructions.
@@ -499,9 +513,9 @@ def : ProcessorModel<"rocket-rv32", RocketModel, []>;
def : ProcessorModel<"rocket-rv64", RocketModel, [Feature64Bit]>;
def : ProcessorModel<"sifive-7-rv32", SiFive7Model, [],
- [TuneSiFive7, TuneNoDefaultUnroll]>;
+ [TuneSiFive7]>;
def : ProcessorModel<"sifive-7-rv64", SiFive7Model, [Feature64Bit],
- [TuneSiFive7, TuneNoDefaultUnroll]>;
+ [TuneSiFive7]>;
def : ProcessorModel<"sifive-e20", RocketModel, [FeatureStdExtM,
FeatureStdExtC]>;
@@ -528,7 +542,7 @@ def : ProcessorModel<"sifive-e76", SiFive7Model, [FeatureStdExtM,
FeatureStdExtA,
FeatureStdExtF,
FeatureStdExtC],
- [TuneSiFive7, TuneNoDefaultUnroll]>;
+ [TuneSiFive7]>;
def : ProcessorModel<"sifive-s21", RocketModel, [Feature64Bit,
FeatureStdExtM,
@@ -553,7 +567,7 @@ def : ProcessorModel<"sifive-s76", SiFive7Model, [Feature64Bit,
FeatureStdExtF,
FeatureStdExtD,
FeatureStdExtC],
- [TuneSiFive7, TuneNoDefaultUnroll]>;
+ [TuneSiFive7]>;
def : ProcessorModel<"sifive-u54", RocketModel, [Feature64Bit,
FeatureStdExtM,
@@ -568,7 +582,7 @@ def : ProcessorModel<"sifive-u74", SiFive7Model, [Feature64Bit,
FeatureStdExtF,
FeatureStdExtD,
FeatureStdExtC],
- [TuneSiFive7, TuneNoDefaultUnroll]>;
+ [TuneSiFive7]>;
//===----------------------------------------------------------------------===//
// Define the RISC-V target.
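
The new Zmmul extension provides only the multiplication half of M: multiply nodes become legal while division and remainder still expand to libcalls (the RISCVISelLowering changes later in this patch split the operation actions accordingly). A small C++ illustration; the libcall name is the usual compiler-rt one and is stated here as an assumption:

#include <cstdint>

// On an rv64i core with only Zmmul, `scaled` stays a single mul
// instruction, while `ratio` is expanded to a __divdi3-style libcall
// because the divide instructions are still unavailable.
int64_t scaled(int64_t a, int64_t b) { return a * b; }
int64_t ratio(int64_t a, int64_t b)  { return a / b; }
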
diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
new file mode 100644
index 000000000000..b700a9ede39b
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -0,0 +1,169 @@
+//===----- RISCVCodeGenPrepare.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a RISCV specific version of CodeGenPrepare.
+// It munges the code in the input function to better prepare it for
+// SelectionDAG-based code generation. This works around limitations in its
+// basic-block-at-a-time approach.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "riscv-codegenprepare"
+#define PASS_NAME "RISCV CodeGenPrepare"
+
+STATISTIC(NumZExtToSExt, "Number of ZExt instructions converted to SExt");
+
+namespace {
+
+class RISCVCodeGenPrepare : public FunctionPass {
+ const DataLayout *DL;
+ const RISCVSubtarget *ST;
+
+public:
+ static char ID;
+
+ RISCVCodeGenPrepare() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override { return PASS_NAME; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetPassConfig>();
+ }
+
+private:
+ bool optimizeZExt(ZExtInst *I);
+ bool optimizeAndExt(BinaryOperator *BO);
+};
+
+} // end anonymous namespace
+
+bool RISCVCodeGenPrepare::optimizeZExt(ZExtInst *ZExt) {
+ if (!ST->is64Bit())
+ return false;
+
+ Value *Src = ZExt->getOperand(0);
+
+ // We only care about ZExt from i32 to i64.
+ if (!ZExt->getType()->isIntegerTy(64) || !Src->getType()->isIntegerTy(32))
+ return false;
+
+ // Look for an opportunity to replace (i64 (zext (i32 X))) with a sext if we
+ // can determine that the sign bit of X is zero via a dominating condition.
+ // This often occurs with widened induction variables.
+ if (isImpliedByDomCondition(ICmpInst::ICMP_SGE, Src,
+ Constant::getNullValue(Src->getType()), ZExt,
+ *DL)) {
+ auto *SExt = new SExtInst(Src, ZExt->getType(), "", ZExt);
+ SExt->takeName(ZExt);
+ SExt->setDebugLoc(ZExt->getDebugLoc());
+
+ ZExt->replaceAllUsesWith(SExt);
+ ZExt->eraseFromParent();
+ ++NumZExtToSExt;
+ return true;
+ }
+
+ return false;
+}
+
+// Try to optimize (i64 (and (zext/sext (i32 X)), C1)) if C1 has bit 31 set,
+// but bits 63:32 are zero. If we can prove that bit 31 of X is 0, we can fill
+// the upper 32 bits with ones. A separate transform will turn (zext X) into
+// (sext X) for the same condition.
+bool RISCVCodeGenPrepare::optimizeAndExt(BinaryOperator *BO) {
+ if (!ST->is64Bit())
+ return false;
+
+ if (BO->getOpcode() != Instruction::And)
+ return false;
+
+ if (!BO->getType()->isIntegerTy(64))
+ return false;
+
+ // Left hand side should be sext or zext.
+ Instruction *LHS = dyn_cast<Instruction>(BO->getOperand(0));
+ if (!LHS || (!isa<SExtInst>(LHS) && !isa<ZExtInst>(LHS)))
+ return false;
+
+ Value *LHSSrc = LHS->getOperand(0);
+ if (!LHSSrc->getType()->isIntegerTy(32))
+ return false;
+
+ // Right hand side should be a constant.
+ Value *RHS = BO->getOperand(1);
+
+ auto *CI = dyn_cast<ConstantInt>(RHS);
+ if (!CI)
+ return false;
+ uint64_t C = CI->getZExtValue();
+
+ // Look for constants that fit in 32 bits but not simm12, and can be made
+ // into simm12 by sign extending bit 31. This will allow use of ANDI.
+  // TODO: Is it worth making simm32?
+ if (!isUInt<32>(C) || isInt<12>(C) || !isInt<12>(SignExtend64<32>(C)))
+ return false;
+
+ // If we can determine the sign bit of the input is 0, we can replace the
+ // And mask constant.
+ if (!isImpliedByDomCondition(ICmpInst::ICMP_SGE, LHSSrc,
+ Constant::getNullValue(LHSSrc->getType()),
+ LHS, *DL))
+ return false;
+
+ // Sign extend the constant and replace the And operand.
+ C = SignExtend64<32>(C);
+ BO->setOperand(1, ConstantInt::get(LHS->getType(), C));
+
+ return true;
+}
+
+bool RISCVCodeGenPrepare::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ auto &TM = TPC.getTM<RISCVTargetMachine>();
+ ST = &TM.getSubtarget<RISCVSubtarget>(F);
+
+ DL = &F.getParent()->getDataLayout();
+
+ bool MadeChange = false;
+ for (auto &BB : F) {
+ for (Instruction &I : llvm::make_early_inc_range(BB)) {
+ if (auto *ZExt = dyn_cast<ZExtInst>(&I))
+ MadeChange |= optimizeZExt(ZExt);
+ else if (I.getOpcode() == Instruction::And)
+ MadeChange |= optimizeAndExt(cast<BinaryOperator>(&I));
+ }
+ }
+
+ return MadeChange;
+}
+
+INITIALIZE_PASS_BEGIN(RISCVCodeGenPrepare, DEBUG_TYPE, PASS_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(RISCVCodeGenPrepare, DEBUG_TYPE, PASS_NAME, false, false)
+
+char RISCVCodeGenPrepare::ID = 0;
+
+FunctionPass *llvm::createRISCVCodeGenPreparePass() {
+ return new RISCVCodeGenPrepare();
+}
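
RISCVCodeGenPrepare::optimizeZExt rewrites (i64 (zext (i32 X))) into a sext when a dominating condition proves X is non-negative, because on RV64 a sign-extended 32-bit value needs no extra instruction. A small C++ source sketch of the kind of code that exposes the pattern; whether a particular clang build produces exactly this zext-after-compare shape is an assumption:

#include <cstdint>

// The loop guard keeps i non-negative in the body, so the zero-extension
// of the 32-bit index used for addressing can be turned into a sign
// extension, which is free on RV64.
int64_t sum_prefix(const int32_t *a, int32_t n) {
  int64_t s = 0;
  for (int32_t i = 0; i < n; ++i)
    s += a[static_cast<uint32_t>(i)]; // index is zero-extended to 64 bits
  return s;
}
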
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 5b823af1e9b8..d5826b46d738 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -690,6 +690,14 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
// 32 trailing ones should use srliw via tablegen pattern.
if (TrailingOnes == 32 || ShAmt >= TrailingOnes)
break;
+ // If C2 is (1 << ShAmt) use bexti if possible.
+ if (Subtarget->hasStdExtZbs() && ShAmt + 1 == TrailingOnes) {
+ SDNode *BEXTI =
+ CurDAG->getMachineNode(RISCV::BEXTI, DL, VT, N0->getOperand(0),
+ CurDAG->getTargetConstant(ShAmt, DL, VT));
+ ReplaceNode(Node, BEXTI);
+ return;
+ }
unsigned LShAmt = Subtarget->getXLen() - TrailingOnes;
SDNode *SLLI =
CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0->getOperand(0),
@@ -939,18 +947,17 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
if (!isMask_64(C2))
break;
- // This should be the only use of the AND unless we will use
- // (SRLI (SLLI X, 32), 32). We don't use a shift pair for other AND
- // constants.
- if (!N0.hasOneUse() && C2 != UINT64_C(0xFFFFFFFF))
- break;
-
- // If this can be an ANDI, ZEXT.H or ZEXT.W we don't need to do this
- // optimization.
- if (isInt<12>(C2) ||
+ // If this can be an ANDI, ZEXT.H or ZEXT.W, don't do this if the ANDI/ZEXT
+ // has multiple users or the constant is a simm12. This prevents inserting
+    // a shift while the AND/ZEXT still has other uses. Shifting a simm12 will
+ // likely make it more costly to materialize. Otherwise, using a SLLI
+ // might allow it to be compressed.
+ bool IsANDIOrZExt =
+ isInt<12>(C2) ||
(C2 == UINT64_C(0xFFFF) &&
(Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp())) ||
- (C2 == UINT64_C(0xFFFFFFFF) && Subtarget->hasStdExtZba()))
+ (C2 == UINT64_C(0xFFFFFFFF) && Subtarget->hasStdExtZba());
+ if (IsANDIOrZExt && (isInt<12>(N1C->getSExtValue()) || !N0.hasOneUse()))
break;
// We need to shift left the AND input and C1 by a total of XLen bits.
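
The new BEXTI path applies when the (srl (and X, mask), ShAmt) pair isolates exactly one bit, i.e. the mask has ShAmt + 1 trailing ones. A small C++ example of a single-bit test with that shape; the -march spelling is an assumption:

#include <cstdint>

// (x & 0x1FFF) >> 12 keeps exactly bit 12, so with Zbs enabled
// (e.g. -march=rv64gc_zbs) this can now select a single bexti x, 12
// instead of a shift-left/shift-right pair.
uint64_t bit12(uint64_t x) {
  return (x & 0x1FFF) >> 12;
}
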
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 658865703079..1702546b58a6 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -215,21 +215,26 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::MULO_I64, nullptr);
}
- if (!Subtarget.hasStdExtM()) {
- setOperationAction({ISD::MUL, ISD::MULHS, ISD::MULHU, ISD::SDIV, ISD::UDIV,
- ISD::SREM, ISD::UREM},
- XLenVT, Expand);
+ if (!Subtarget.hasStdExtM() && !Subtarget.hasStdExtZmmul()) {
+ setOperationAction({ISD::MUL, ISD::MULHS, ISD::MULHU}, XLenVT, Expand);
} else {
if (Subtarget.is64Bit()) {
setOperationAction(ISD::MUL, {MVT::i32, MVT::i128}, Custom);
-
- setOperationAction({ISD::SDIV, ISD::UDIV, ISD::UREM},
- {MVT::i8, MVT::i16, MVT::i32}, Custom);
} else {
setOperationAction(ISD::MUL, MVT::i64, Custom);
}
}
+ if (!Subtarget.hasStdExtM()) {
+ setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM},
+ XLenVT, Expand);
+ } else {
+ if (Subtarget.is64Bit()) {
+ setOperationAction({ISD::SDIV, ISD::UDIV, ISD::UREM},
+ {MVT::i8, MVT::i16, MVT::i32}, Custom);
+ }
+ }
+
setOperationAction(
{ISD::SDIVREM, ISD::UDIVREM, ISD::SMUL_LOHI, ISD::UMUL_LOHI}, XLenVT,
Expand);
@@ -294,7 +299,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, XLenVT, Custom);
}
- static constexpr ISD::NodeType FPLegalNodeTypes[] = {
+ static const unsigned FPLegalNodeTypes[] = {
ISD::FMINNUM, ISD::FMAXNUM, ISD::LRINT,
ISD::LLRINT, ISD::LROUND, ISD::LLROUND,
ISD::STRICT_LRINT, ISD::STRICT_LLRINT, ISD::STRICT_LROUND,
@@ -307,7 +312,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::SETUGE, ISD::SETULT, ISD::SETULE, ISD::SETUNE, ISD::SETGT,
ISD::SETGE, ISD::SETNE, ISD::SETO, ISD::SETUO};
- static const ISD::NodeType FPOpToExpand[] = {
+ static const unsigned FPOpToExpand[] = {
ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW,
ISD::FREM, ISD::FP16_TO_FP, ISD::FP_TO_FP16};
@@ -315,8 +320,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
if (Subtarget.hasStdExtZfh()) {
- for (auto NT : FPLegalNodeTypes)
- setOperationAction(NT, MVT::f16, Legal);
+ setOperationAction(FPLegalNodeTypes, MVT::f16, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
setCondCodeAction(FPCCToExpand, MVT::f16, Expand);
@@ -340,14 +344,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
}
if (Subtarget.hasStdExtF()) {
- for (auto NT : FPLegalNodeTypes)
- setOperationAction(NT, MVT::f32, Legal);
+ setOperationAction(FPLegalNodeTypes, MVT::f32, Legal);
setCondCodeAction(FPCCToExpand, MVT::f32, Expand);
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
- for (auto Op : FPOpToExpand)
- setOperationAction(Op, MVT::f32, Expand);
+ setOperationAction(FPOpToExpand, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
}
@@ -356,8 +358,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::i32, Custom);
if (Subtarget.hasStdExtD()) {
- for (auto NT : FPLegalNodeTypes)
- setOperationAction(NT, MVT::f64, Legal);
+ setOperationAction(FPLegalNodeTypes, MVT::f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
setCondCodeAction(FPCCToExpand, MVT::f64, Expand);
@@ -366,8 +367,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
- for (auto Op : FPOpToExpand)
- setOperationAction(Op, MVT::f64, Expand);
+ setOperationAction(FPOpToExpand, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
}
@@ -458,17 +458,22 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VP_SETCC, ISD::VP_FP_ROUND,
ISD::VP_FP_EXTEND};
+ static const unsigned IntegerVecReduceOps[] = {
+ ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND, ISD::VECREDUCE_OR,
+ ISD::VECREDUCE_XOR, ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN,
+ ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN};
+
+ static const unsigned FloatingPointVecReduceOps[] = {
+ ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_FMIN,
+ ISD::VECREDUCE_FMAX};
+
if (!Subtarget.is64Bit()) {
// We must custom-lower certain vXi64 operations on RV32 due to the vector
// element type being illegal.
setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT},
MVT::i64, Custom);
- setOperationAction({ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND,
- ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR,
- ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN,
- ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN},
- MVT::i64, Custom);
+ setOperationAction(IntegerVecReduceOps, MVT::i64, Custom);
setOperationAction({ISD::VP_REDUCE_ADD, ISD::VP_REDUCE_AND,
ISD::VP_REDUCE_OR, ISD::VP_REDUCE_XOR,
@@ -581,11 +586,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// Custom-lower reduction operations to set up the corresponding custom
// nodes' operands.
- setOperationAction({ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND,
- ISD::VECREDUCE_OR, ISD::VECREDUCE_XOR,
- ISD::VECREDUCE_SMAX, ISD::VECREDUCE_SMIN,
- ISD::VECREDUCE_UMAX, ISD::VECREDUCE_UMIN},
- VT, Custom);
+ setOperationAction(IntegerVecReduceOps, VT, Custom);
setOperationAction(IntegerVPOps, VT, Custom);
@@ -661,9 +662,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND},
VT, Custom);
- setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD,
- ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAX},
- VT, Custom);
+ setOperationAction(FloatingPointVecReduceOps, VT, Custom);
// Expand FP operations that need libcalls.
setOperationAction(ISD::FREM, VT, Expand);
@@ -905,17 +904,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FFLOOR, ISD::FROUND},
VT, Custom);
- for (auto CC : VFPCCToExpand)
- setCondCodeAction(CC, VT, Expand);
+ setCondCodeAction(VFPCCToExpand, VT, Expand);
setOperationAction({ISD::VSELECT, ISD::SELECT}, VT, Custom);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::BITCAST, VT, Custom);
- setOperationAction({ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD,
- ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAX},
- VT, Custom);
+ setOperationAction(FloatingPointVecReduceOps, VT, Custom);
setOperationAction(FloatingPointVPOps, VT, Custom);
}
@@ -943,7 +939,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setJumpIsExpensive();
setTargetDAGCombine({ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::AND,
- ISD::OR, ISD::XOR});
+ ISD::OR, ISD::XOR, ISD::SETCC});
if (Subtarget.is64Bit())
setTargetDAGCombine(ISD::SRA);
@@ -1374,6 +1370,23 @@ unsigned RISCVTargetLowering::getNumRegistersForCallingConv(LLVMContext &Context
// with 1/-1.
static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS,
ISD::CondCode &CC, SelectionDAG &DAG) {
+ // If this is a single bit test that can't be handled by ANDI, shift the
+ // bit to be tested to the MSB and perform a signed compare with 0.
+ if (isIntEqualitySetCC(CC) && isNullConstant(RHS) &&
+ LHS.getOpcode() == ISD::AND && LHS.hasOneUse() &&
+ isa<ConstantSDNode>(LHS.getOperand(1))) {
+ uint64_t Mask = LHS.getConstantOperandVal(1);
+ if (isPowerOf2_64(Mask) && !isInt<12>(Mask)) {
+ CC = CC == ISD::SETEQ ? ISD::SETGE : ISD::SETLT;
+ unsigned ShAmt = LHS.getValueSizeInBits() - 1 - Log2_64(Mask);
+ LHS = LHS.getOperand(0);
+ if (ShAmt != 0)
+ LHS = DAG.getNode(ISD::SHL, DL, LHS.getValueType(), LHS,
+ DAG.getConstant(ShAmt, DL, LHS.getValueType()));
+ return;
+ }
+ }
+
// Convert X > -1 to X >= 0.
if (CC == ISD::SETGT && isAllOnesConstant(RHS)) {
RHS = DAG.getConstant(0, DL, RHS.getValueType());
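
As a quick aside on the single-bit-test rewrite added above: for a power-of-two mask whose value does not fit in a simm12, testing the bit against zero is the same as shifting that bit into the sign position and doing a signed compare with zero. A standalone C++ sketch of that identity (illustrative only, no LLVM APIs; the loop bounds and test values are arbitrary):

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned XLen = 64;
      for (unsigned Bit = 11; Bit < XLen; ++Bit) {  // masks too large for ANDI
        uint64_t Mask = UINT64_C(1) << Bit;
        const uint64_t Xs[] = {0, Mask, ~Mask, UINT64_C(0x123456789abcdef0)};
        for (uint64_t X : Xs) {
          bool SetEQ = (X & Mask) == 0;             // original (seteq ... 0) form
          unsigned ShAmt = XLen - 1 - Bit;
          bool SetGE = (int64_t)(X << ShAmt) >= 0;  // rewritten (setge ... 0) form
          assert(SetEQ == SetGE);
        }
      }
      return 0;
    }
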
@@ -3707,10 +3720,7 @@ SDValue RISCVTargetLowering::lowerGlobalAddress(SDValue Op,
SDLoc DL(Op);
GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
assert(N->getOffset() == 0 && "unexpected offset in global node");
-
- const GlobalValue *GV = N->getGlobal();
- bool IsLocal = getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
- return getAddr(N, DAG, IsLocal);
+ return getAddr(N, DAG, N->getGlobal()->isDSOLocal());
}
SDValue RISCVTargetLowering::lowerBlockAddress(SDValue Op,
@@ -8130,6 +8140,50 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG) {
return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false);
}
+// Replace (seteq (i64 (and X, 0xffffffff)), C1) with
+// (seteq (i64 (sext_inreg (X, i32)), C1')) where C1' is C1 sign extended from
+// bit 31. Same for setne. C1' may be cheaper to materialize and the sext_inreg
+// can become a sext.w instead of a shift pair.
+static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT OpVT = N0.getValueType();
+
+ if (OpVT != MVT::i64 || !Subtarget.is64Bit())
+ return SDValue();
+
+ // RHS needs to be a constant.
+ auto *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (!N1C)
+ return SDValue();
+
+ // LHS needs to be (and X, 0xffffffff).
+ if (N0.getOpcode() != ISD::AND || !N0.hasOneUse() ||
+ !isa<ConstantSDNode>(N0.getOperand(1)) ||
+ N0.getConstantOperandVal(1) != UINT64_C(0xffffffff))
+ return SDValue();
+
+ // Looking for an equality compare.
+ ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ if (!isIntEqualitySetCC(Cond))
+ return SDValue();
+
+ const APInt &C1 = cast<ConstantSDNode>(N1)->getAPIntValue();
+
+ SDLoc dl(N);
+ // If the constant is larger than 2^32 - 1 it is impossible for both sides
+ // to be equal.
+ if (C1.getActiveBits() > 32)
+ return DAG.getBoolConstant(Cond == ISD::SETNE, dl, VT, OpVT);
+
+ SDValue SExtOp = DAG.getNode(ISD::SIGN_EXTEND_INREG, N, OpVT,
+ N0.getOperand(0), DAG.getValueType(MVT::i32));
+ return DAG.getSetCC(dl, VT, SExtOp, DAG.getConstant(C1.trunc(32).sext(64),
+ dl, OpVT), Cond);
+}
+
static SDValue
performSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
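
The comment above states the reasoning; as a standalone check (illustrative C++ only, not part of the patch, arbitrary test values): when C1 has at most 32 active bits, comparing (X & 0xffffffff) with C1 is equivalent to comparing the 32-bit sign extensions of X and C1, which is what lets the AND become a sext_inreg with a re-sign-extended constant.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t Xs[] = {0, 1, 0x7fffffffu, 0x80000000u, 0xffffffffu,
                             UINT64_C(0x1ffffffff), UINT64_C(0xdeadbeefcafef00d)};
      const uint64_t C1s[] = {0, 1, 0x7fffffffu, 0x80000000u, 0xffffffffu};
      for (uint64_t X : Xs)
        for (uint64_t C1 : C1s) {
          bool Original  = (X & UINT64_C(0xffffffff)) == C1;
          bool Rewritten = (int64_t)(int32_t)X == (int64_t)(int32_t)C1;
          assert(Original == Rewritten);
        }
      return 0;
    }
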
@@ -8658,6 +8712,75 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(32 - ShAmt, DL, MVT::i64));
}
+// Perform common combines for BR_CC and SELECT_CC conditions.
+static bool combine_CC(SDValue &LHS, SDValue &RHS, SDValue &CC, const SDLoc &DL,
+ SelectionDAG &DAG, const RISCVSubtarget &Subtarget) {
+ ISD::CondCode CCVal = cast<CondCodeSDNode>(CC)->get();
+ if (!ISD::isIntEqualitySetCC(CCVal))
+ return false;
+
+ // Fold ((setlt X, Y), 0, ne) -> (X, Y, lt)
+ // Sometimes the setcc is introduced after br_cc/select_cc has been formed.
+ if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) &&
+ LHS.getOperand(0).getValueType() == Subtarget.getXLenVT()) {
+ // If we're looking for eq 0 instead of ne 0, we need to invert the
+ // condition.
+ bool Invert = CCVal == ISD::SETEQ;
+ CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
+ if (Invert)
+ CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
+
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
+
+ CC = DAG.getCondCode(CCVal);
+ return true;
+ }
+
+ // Fold ((xor X, Y), 0, eq/ne) -> (X, Y, eq/ne)
+ if (LHS.getOpcode() == ISD::XOR && isNullConstant(RHS)) {
+ RHS = LHS.getOperand(1);
+ LHS = LHS.getOperand(0);
+ return true;
+ }
+
+ // Fold ((srl (and X, 1<<C), C), 0, eq/ne) -> ((shl X, XLen-1-C), 0, ge/lt)
+ if (isNullConstant(RHS) && LHS.getOpcode() == ISD::SRL && LHS.hasOneUse() &&
+ LHS.getOperand(1).getOpcode() == ISD::Constant) {
+ SDValue LHS0 = LHS.getOperand(0);
+ if (LHS0.getOpcode() == ISD::AND &&
+ LHS0.getOperand(1).getOpcode() == ISD::Constant) {
+ uint64_t Mask = LHS0.getConstantOperandVal(1);
+ uint64_t ShAmt = LHS.getConstantOperandVal(1);
+ if (isPowerOf2_64(Mask) && Log2_64(Mask) == ShAmt) {
+ CCVal = CCVal == ISD::SETEQ ? ISD::SETGE : ISD::SETLT;
+ CC = DAG.getCondCode(CCVal);
+
+ ShAmt = LHS.getValueSizeInBits() - 1 - ShAmt;
+ LHS = LHS0.getOperand(0);
+ if (ShAmt != 0)
+ LHS =
+ DAG.getNode(ISD::SHL, DL, LHS.getValueType(), LHS0.getOperand(0),
+ DAG.getConstant(ShAmt, DL, LHS.getValueType()));
+ return true;
+ }
+ }
+ }
+
+ // (X, 1, setne) -> (X, 0, seteq) if we can prove X is 0/1.
+ // This can occur when legalizing some floating point comparisons.
+ APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
+ if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) {
+ CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
+ CC = DAG.getCondCode(CCVal);
+ RHS = DAG.getConstant(0, DL, LHS.getValueType());
+ return true;
+ }
+
+ return false;
+}
+
SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
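
Two of the combine_CC folds rest on simple identities that can be checked in isolation (illustrative C++ only, not part of the patch): xor against zero is just equality, and once X is known to be 0 or 1, the setne-1 form is the inverted seteq-0 form. The srl/and bit-test fold uses the same shift-to-sign-bit identity sketched after translateSetCCForBranch above.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t Vals[] = {0, 1, 42, ~UINT64_C(0)};
      for (uint64_t X : Vals)
        for (uint64_t Y : Vals)
          assert(((X ^ Y) == 0) == (X == Y));  // (xor X, Y) == 0  <=>  X == Y

      for (uint64_t X = 0; X <= 1; ++X)        // MaskedValueIsZero above bit 0
        assert((X != 1) == (X == 0));          // (X, 1, setne) <=> (X, 0, seteq)
      return 0;
    }
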
@@ -8872,6 +8995,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FMAXNUM:
case ISD::FMINNUM:
return combineBinOpToReduce(N, DAG);
+ case ISD::SETCC:
+ return performSETCCCombine(N, DAG, Subtarget);
case ISD::SIGN_EXTEND_INREG:
return performSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
case ISD::ZERO_EXTEND:
@@ -8900,110 +9025,32 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
// Transform
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ SDValue CC = N->getOperand(2);
SDValue TrueV = N->getOperand(3);
SDValue FalseV = N->getOperand(4);
+ SDLoc DL(N);
// If the True and False values are the same, we don't need a select_cc.
if (TrueV == FalseV)
return TrueV;
- ISD::CondCode CCVal = cast<CondCodeSDNode>(N->getOperand(2))->get();
- if (!ISD::isIntEqualitySetCC(CCVal))
- break;
-
- // Fold (select_cc (setlt X, Y), 0, ne, trueV, falseV) ->
- // (select_cc X, Y, lt, trueV, falseV)
- // Sometimes the setcc is introduced after select_cc has been formed.
- if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) &&
- LHS.getOperand(0).getValueType() == Subtarget.getXLenVT()) {
- // If we're looking for eq 0 instead of ne 0, we need to invert the
- // condition.
- bool Invert = CCVal == ISD::SETEQ;
- CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
- if (Invert)
- CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
-
- SDLoc DL(N);
- RHS = LHS.getOperand(1);
- LHS = LHS.getOperand(0);
- translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
-
- SDValue TargetCC = DAG.getCondCode(CCVal);
- return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0),
- {LHS, RHS, TargetCC, TrueV, FalseV});
- }
-
- // Fold (select_cc (xor X, Y), 0, eq/ne, trueV, falseV) ->
- // (select_cc X, Y, eq/ne, trueV, falseV)
- if (LHS.getOpcode() == ISD::XOR && isNullConstant(RHS))
- return DAG.getNode(RISCVISD::SELECT_CC, SDLoc(N), N->getValueType(0),
- {LHS.getOperand(0), LHS.getOperand(1),
- N->getOperand(2), TrueV, FalseV});
- // (select_cc X, 1, setne, trueV, falseV) ->
- // (select_cc X, 0, seteq, trueV, falseV) if we can prove X is 0/1.
- // This can occur when legalizing some floating point comparisons.
- APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
- if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) {
- SDLoc DL(N);
- CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
- SDValue TargetCC = DAG.getCondCode(CCVal);
- RHS = DAG.getConstant(0, DL, LHS.getValueType());
+ if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget))
return DAG.getNode(RISCVISD::SELECT_CC, DL, N->getValueType(0),
- {LHS, RHS, TargetCC, TrueV, FalseV});
- }
+ {LHS, RHS, CC, TrueV, FalseV});
- break;
+ return SDValue();
}
case RISCVISD::BR_CC: {
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
- ISD::CondCode CCVal = cast<CondCodeSDNode>(N->getOperand(3))->get();
- if (!ISD::isIntEqualitySetCC(CCVal))
- break;
-
- // Fold (br_cc (setlt X, Y), 0, ne, dest) ->
- // (br_cc X, Y, lt, dest)
- // Sometimes the setcc is introduced after br_cc has been formed.
- if (LHS.getOpcode() == ISD::SETCC && isNullConstant(RHS) &&
- LHS.getOperand(0).getValueType() == Subtarget.getXLenVT()) {
- // If we're looking for eq 0 instead of ne 0, we need to invert the
- // condition.
- bool Invert = CCVal == ISD::SETEQ;
- CCVal = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
- if (Invert)
- CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
-
- SDLoc DL(N);
- RHS = LHS.getOperand(1);
- LHS = LHS.getOperand(0);
- translateSetCCForBranch(DL, LHS, RHS, CCVal, DAG);
+ SDValue CC = N->getOperand(3);
+ SDLoc DL(N);
+ if (combine_CC(LHS, RHS, CC, DL, DAG, Subtarget))
return DAG.getNode(RISCVISD::BR_CC, DL, N->getValueType(0),
- N->getOperand(0), LHS, RHS, DAG.getCondCode(CCVal),
- N->getOperand(4));
- }
-
- // Fold (br_cc (xor X, Y), 0, eq/ne, dest) ->
- // (br_cc X, Y, eq/ne, trueV, falseV)
- if (LHS.getOpcode() == ISD::XOR && isNullConstant(RHS))
- return DAG.getNode(RISCVISD::BR_CC, SDLoc(N), N->getValueType(0),
- N->getOperand(0), LHS.getOperand(0), LHS.getOperand(1),
- N->getOperand(3), N->getOperand(4));
-
- // (br_cc X, 1, setne, br_cc) ->
- // (br_cc X, 0, seteq, br_cc) if we can prove X is 0/1.
- // This can occur when legalizing some floating point comparisons.
- APInt Mask = APInt::getBitsSetFrom(LHS.getValueSizeInBits(), 1);
- if (isOneConstant(RHS) && DAG.MaskedValueIsZero(LHS, Mask)) {
- SDLoc DL(N);
- CCVal = ISD::getSetCCInverse(CCVal, LHS.getValueType());
- SDValue TargetCC = DAG.getCondCode(CCVal);
- RHS = DAG.getConstant(0, DL, LHS.getValueType());
- return DAG.getNode(RISCVISD::BR_CC, DL, N->getValueType(0),
- N->getOperand(0), LHS, RHS, TargetCC,
- N->getOperand(4));
- }
- break;
+ N->getOperand(0), LHS, RHS, CC, N->getOperand(4));
+
+ return SDValue();
}
case ISD::BITREVERSE:
return performBITREVERSECombine(N, DAG, Subtarget);
@@ -9299,6 +9346,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
bool RISCVTargetLowering::isDesirableToCommuteWithShift(
const SDNode *N, CombineLevel Level) const {
+ assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
+ N->getOpcode() == ISD::SRL) &&
+ "Expected shift op");
+
// The following folds are only desirable if `(OP _, c1 << c2)` can be
// materialised in fewer instructions than `(OP _, c1)`:
//
@@ -9357,7 +9408,8 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant(
return false;
// Only handle AND for now.
- if (Op.getOpcode() != ISD::AND)
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode != ISD::AND && Opcode != ISD::OR && Opcode != ISD::XOR)
return false;
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
@@ -9376,12 +9428,13 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant(
auto IsLegalMask = [ShrunkMask, ExpandedMask](const APInt &Mask) -> bool {
return ShrunkMask.isSubsetOf(Mask) && Mask.isSubsetOf(ExpandedMask);
};
- auto UseMask = [Mask, Op, VT, &TLO](const APInt &NewMask) -> bool {
+ auto UseMask = [Mask, Op, &TLO](const APInt &NewMask) -> bool {
if (NewMask == Mask)
return true;
SDLoc DL(Op);
- SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
- SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
+ SDValue NewC = TLO.DAG.getConstant(NewMask, DL, Op.getValueType());
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
+ Op.getOperand(0), NewC);
return TLO.CombineTo(Op, NewOp);
};
@@ -9390,18 +9443,21 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant(
if (ShrunkMask.isSignedIntN(12))
return false;
- // Preserve (and X, 0xffff) when zext.h is supported.
- if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp()) {
- APInt NewMask = APInt(Mask.getBitWidth(), 0xffff);
- if (IsLegalMask(NewMask))
- return UseMask(NewMask);
- }
+ // And has a few special cases for zext.
+ if (Opcode == ISD::AND) {
+ // Preserve (and X, 0xffff) when zext.h is supported.
+ if (Subtarget.hasStdExtZbb() || Subtarget.hasStdExtZbp()) {
+ APInt NewMask = APInt(Mask.getBitWidth(), 0xffff);
+ if (IsLegalMask(NewMask))
+ return UseMask(NewMask);
+ }
- // Try to preserve (and X, 0xffffffff), the (zext_inreg X, i32) pattern.
- if (VT == MVT::i64) {
- APInt NewMask = APInt(64, 0xffffffff);
- if (IsLegalMask(NewMask))
- return UseMask(NewMask);
+ // Try to preserve (and X, 0xffffffff), the (zext_inreg X, i32) pattern.
+ if (VT == MVT::i64) {
+ APInt NewMask = APInt(64, 0xffffffff);
+ if (IsLegalMask(NewMask))
+ return UseMask(NewMask);
+ }
}
// For the remaining optimizations, we need to be able to make a negative
@@ -9414,10 +9470,11 @@ bool RISCVTargetLowering::targetShrinkDemandedConstant(
// Try to make a 12 bit negative immediate. If that fails try to make a 32
// bit negative immediate unless the shrunk immediate already fits in 32 bits.
+ // If we can't create a simm12, we shouldn't change opaque constants.
APInt NewMask = ShrunkMask;
if (MinSignedBits <= 12)
NewMask.setBitsFrom(11);
- else if (MinSignedBits <= 32 && !ShrunkMask.isSignedIntN(32))
+ else if (!C->isOpaque() && MinSignedBits <= 32 && !ShrunkMask.isSignedIntN(32))
NewMask.setBitsFrom(31);
else
return false;
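
A small standalone check of the arithmetic behind the simm12 path above (illustrative C++ only, not part of the patch; 0x880 is an arbitrary example mask): setting bits [11, 63] keeps every bit the shrunk mask keeps while producing a constant that sign-extends from bit 11, i.e. a valid 12-bit signed immediate.

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t ShrunkMask = 0x880;                      // example mask, not a simm12
      uint64_t NewMask = ShrunkMask | ~UINT64_C(0x7ff); // NewMask.setBitsFrom(11)
      // Still a superset of the shrunk mask, so the demanded bits are preserved.
      assert((ShrunkMask & NewMask) == ShrunkMask);
      // The widened constant fits the simm12 range [-2048, 2047].
      int64_t Imm = (int64_t)NewMask;
      assert(Imm >= -2048 && Imm <= 2047);
      return 0;
    }
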
@@ -10015,15 +10072,15 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI,
LastSelectPseudo = &*SequenceMBBI;
SequenceMBBI->collectDebugValues(SelectDebugValues);
SelectDests.insert(SequenceMBBI->getOperand(0).getReg());
- } else {
- if (SequenceMBBI->hasUnmodeledSideEffects() ||
- SequenceMBBI->mayLoadOrStore())
- break;
- if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) {
- return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg());
- }))
- break;
+ continue;
}
+ if (SequenceMBBI->hasUnmodeledSideEffects() ||
+ SequenceMBBI->mayLoadOrStore())
+ break;
+ if (llvm::any_of(SequenceMBBI->operands(), [&](MachineOperand &MO) {
+ return MO.isReg() && MO.isUse() && SelectDests.count(MO.getReg());
+ }))
+ break;
}
const RISCVInstrInfo &TII = *Subtarget.getInstrInfo();
@@ -12159,7 +12216,8 @@ bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const {
// FIXME: This doesn't work for zve32, but that's already broken
// elsewhere for the same reason.
assert(Subtarget.getRealMinVLen() >= 64 && "zve32* unsupported");
- assert(RISCV::RVVBitsPerBlock == 64 && "RVVBitsPerBlock changed, audit needed");
+ static_assert(RISCV::RVVBitsPerBlock == 64,
+ "RVVBitsPerBlock changed, audit needed");
return true;
}
@@ -12214,10 +12272,12 @@ bool RISCVTargetLowering::shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned)
bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const {
// Check integral scalar types.
+ const bool HasExtMOrZmmul =
+ Subtarget.hasStdExtM() || Subtarget.hasStdExtZmmul();
if (VT.isScalarInteger()) {
// Omit the optimization if the sub target has the M extension and the data
// size exceeds XLen.
- if (Subtarget.hasStdExtM() && VT.getSizeInBits() > Subtarget.getXLen())
+ if (HasExtMOrZmmul && VT.getSizeInBits() > Subtarget.getXLen())
return false;
if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
// Break the MUL to a SLLI and an ADD/SUB.
@@ -12232,7 +12292,7 @@ bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
return true;
// Omit the following optimization if the sub target has the M extension
// and the data size >= XLen.
- if (Subtarget.hasStdExtM() && VT.getSizeInBits() >= Subtarget.getXLen())
+ if (HasExtMOrZmmul && VT.getSizeInBits() >= Subtarget.getXLen())
return false;
// Break the MUL to two SLLI instructions and an ADD/SUB, if Imm needs
// a pair of LUI/ADDI.
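
The decomposition the comments above refer to is plain shift-and-add arithmetic; a standalone sketch (illustrative C++ only, not part of the patch, arbitrary test values):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t Xs[] = {3, 1000, UINT64_C(0x123456789)};
      for (unsigned N = 1; N < 12; ++N)
        for (uint64_t X : Xs) {
          assert(X * ((UINT64_C(1) << N) + 1) == (X << N) + X);  // SLLI + ADD
          assert(X * ((UINT64_C(1) << N) - 1) == (X << N) - X);  // SLLI + SUB
        }
      return 0;
    }
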
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 685604ad9a59..75a79895330f 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -637,6 +637,64 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
}
}
+MachineInstr *RISCVInstrInfo::foldMemoryOperandImpl(
+ MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
+ VirtRegMap *VRM) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ // The below optimizations narrow the load so they are only valid for little
+ // endian.
+ // TODO: Support big endian by adding an offset into the frame object?
+ if (MF.getDataLayout().isBigEndian())
+ return nullptr;
+
+ // Fold load from stack followed by sext.w into lw.
+ // TODO: Fold with sext.b, sext.h, zext.b, zext.h, zext.w?
+ if (Ops.size() != 1 || Ops[0] != 1)
+ return nullptr;
+
+ unsigned LoadOpc;
+ switch (MI.getOpcode()) {
+ default:
+ if (RISCV::isSEXT_W(MI)) {
+ LoadOpc = RISCV::LW;
+ break;
+ }
+ if (RISCV::isZEXT_W(MI)) {
+ LoadOpc = RISCV::LWU;
+ break;
+ }
+ if (RISCV::isZEXT_B(MI)) {
+ LoadOpc = RISCV::LBU;
+ break;
+ }
+ return nullptr;
+ case RISCV::SEXT_H:
+ LoadOpc = RISCV::LH;
+ break;
+ case RISCV::SEXT_B:
+ LoadOpc = RISCV::LB;
+ break;
+ case RISCV::ZEXT_H_RV32:
+ case RISCV::ZEXT_H_RV64:
+ LoadOpc = RISCV::LHU;
+ break;
+ }
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FrameIndex),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlign(FrameIndex));
+
+ Register DstReg = MI.getOperand(0).getReg();
+ return BuildMI(*MI.getParent(), InsertPt, MI.getDebugLoc(), get(LoadOpc),
+ DstReg)
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addMemOperand(MMO);
+}
+
void RISCVInstrInfo::movImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, Register DstReg, uint64_t Val,
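
The equivalence the new foldMemoryOperandImpl relies on can be sketched on the host (illustrative C++ only, not part of the patch; the slot value is arbitrary): on a little-endian machine, reloading the full 64-bit slot and sign-extending its low 32 bits gives the same result as a 32-bit signed load of the slot, which is also why big-endian targets are rejected above.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint64_t Slot = UINT64_C(0x11223344aabbccdd); // value stored in the slot
      int64_t ViaSextW = (int64_t)(int32_t)Slot;    // LD followed by sext.w

      int32_t Narrow;
      std::memcpy(&Narrow, &Slot, sizeof(Narrow));  // LW of the low-addressed bytes
      int64_t ViaLW = Narrow;

      assert(ViaSextW == ViaLW);  // both 0xffffffffaabbccdd on little endian
      return 0;
    }
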
@@ -1799,17 +1857,30 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF,
.addReg(VL, RegState::Kill)
.addImm(ShiftAmount)
.setMIFlag(Flag);
- } else if ((NumOfVReg == 3 || NumOfVReg == 5 || NumOfVReg == 9) &&
- STI.hasStdExtZba()) {
- // We can use Zba SHXADD instructions for multiply in some cases.
- // TODO: Generalize to SHXADD+SLLI.
+ } else if (STI.hasStdExtZba() &&
+ ((NumOfVReg % 3 == 0 && isPowerOf2_64(NumOfVReg / 3)) ||
+ (NumOfVReg % 5 == 0 && isPowerOf2_64(NumOfVReg / 5)) ||
+ (NumOfVReg % 9 == 0 && isPowerOf2_64(NumOfVReg / 9)))) {
+ // We can use Zba SHXADD+SLLI instructions for multiply in some cases.
unsigned Opc;
- switch (NumOfVReg) {
- default: llvm_unreachable("Unexpected number of vregs");
- case 3: Opc = RISCV::SH1ADD; break;
- case 5: Opc = RISCV::SH2ADD; break;
- case 9: Opc = RISCV::SH3ADD; break;
+ uint32_t ShiftAmount;
+ if (NumOfVReg % 9 == 0) {
+ Opc = RISCV::SH3ADD;
+ ShiftAmount = Log2_64(NumOfVReg / 9);
+ } else if (NumOfVReg % 5 == 0) {
+ Opc = RISCV::SH2ADD;
+ ShiftAmount = Log2_64(NumOfVReg / 5);
+ } else if (NumOfVReg % 3 == 0) {
+ Opc = RISCV::SH1ADD;
+ ShiftAmount = Log2_64(NumOfVReg / 3);
+ } else {
+ llvm_unreachable("Unexpected number of vregs");
}
+ if (ShiftAmount)
+ BuildMI(MBB, II, DL, get(RISCV::SLLI), VL)
+ .addReg(VL, RegState::Kill)
+ .addImm(ShiftAmount)
+ .setMIFlag(Flag);
BuildMI(MBB, II, DL, get(Opc), VL)
.addReg(VL, RegState::Kill)
.addReg(VL)
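
The generalized SHXADD+SLLI sequence above computes VL * (F << K) for F in {3, 5, 9} by shifting first and then applying SHxADD; a standalone check of that factoring (illustrative C++ only, not part of the patch, arbitrary test values):

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned Factors[] = {3, 5, 9};
      const unsigned ShXs[]    = {1, 2, 3};        // SH1ADD, SH2ADD, SH3ADD
      const uint64_t VLs[]     = {1, 7, 12345};
      for (unsigned I = 0; I < 3; ++I)
        for (unsigned K = 0; K < 8; ++K)
          for (uint64_t VL : VLs) {
            uint64_t NumOfVReg = (uint64_t)Factors[I] << K;
            uint64_t T = VL << K;                  // SLLI VL, K
            uint64_t Result = (T << ShXs[I]) + T;  // SHxADD T, T
            assert(Result == VL * NumOfVReg);
          }
      return 0;
    }
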
@@ -1839,10 +1910,11 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF,
} else {
Register N = MRI.createVirtualRegister(&RISCV::GPRRegClass);
movImm(MBB, II, DL, N, NumOfVReg, Flag);
- if (!STI.hasStdExtM())
+ if (!STI.hasStdExtM() && !STI.hasStdExtZmmul())
MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{
MF.getFunction(),
- "M-extension must be enabled to calculate the vscaled size/offset."});
+ "M- or Zmmul-extension must be enabled to calculate the vscaled size/"
+ "offset."});
BuildMI(MBB, II, DL, get(RISCV::MUL), VL)
.addReg(VL, RegState::Kill)
.addReg(N, RegState::Kill)
@@ -1852,6 +1924,24 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF,
return VL;
}
+// Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
+bool RISCV::isSEXT_W(const MachineInstr &MI) {
+ return MI.getOpcode() == RISCV::ADDIW && MI.getOperand(1).isReg() &&
+ MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0;
+}
+
+// Returns true if this is the zext.w pattern, adduw rd, rs1, x0.
+bool RISCV::isZEXT_W(const MachineInstr &MI) {
+ return MI.getOpcode() == RISCV::ADD_UW && MI.getOperand(1).isReg() &&
+ MI.getOperand(2).isReg() && MI.getOperand(2).getReg() == RISCV::X0;
+}
+
+// Returns true if this is the zext.b pattern, andi rd, rs1, 255.
+bool RISCV::isZEXT_B(const MachineInstr &MI) {
+ return MI.getOpcode() == RISCV::ANDI && MI.getOperand(1).isReg() &&
+ MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 255;
+}
+
static bool isRVVWholeLoadStore(unsigned Opcode) {
switch (Opcode) {
default:
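
What the three recognizers above accept is, in value terms, plain sign- and zero-extension; a standalone sketch (illustrative C++ only, not part of the patch; the input value is arbitrary):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t X = UINT64_C(0xfedcba9886543210);
      uint64_t SextW = (uint64_t)(int64_t)(int32_t)X; // addiw rd, rs1, 0
      uint64_t ZextW = X & UINT64_C(0xffffffff);      // add.uw rd, rs1, x0
      uint64_t ZextB = X & UINT64_C(0xff);            // andi rd, rs1, 255
      assert(SextW == UINT64_C(0xffffffff86543210));
      assert(ZextW == UINT64_C(0x86543210));
      assert(ZextB == UINT64_C(0x10));
      return 0;
    }
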
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 5368437618bd..4aa9ded5b3a2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -69,6 +69,14 @@ public:
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
+ using TargetInstrInfo::foldMemoryOperandImpl;
+ MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineBasicBlock::iterator InsertPt,
+ int FrameIndex,
+ LiveIntervals *LIS = nullptr,
+ VirtRegMap *VRM = nullptr) const override;
+
// Materializes the given integer Val into DstReg.
void movImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, Register DstReg, uint64_t Val,
@@ -183,6 +191,11 @@ protected:
namespace RISCV {
+// Returns true if this is the sext.w pattern, addiw rd, rs1, 0.
+bool isSEXT_W(const MachineInstr &MI);
+bool isZEXT_W(const MachineInstr &MI);
+bool isZEXT_B(const MachineInstr &MI);
+
// Returns true if the given MI is an RVV instruction opcode for which we may
// expect to see a FrameIndex operand.
bool isRVVSpill(const MachineInstr &MI);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 06a90438838e..78fd09fbf387 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1278,6 +1278,13 @@ def : Pat<(setgt GPR:$rs1, simm12_minus1_nonzero:$imm),
def : Pat<(setugt GPR:$rs1, simm12_minus1_nonzero:$imm),
(XORI (SLTIU GPR:$rs1, (ImmPlus1 simm12_minus1_nonzero:$imm)), 1)>;
+// If negating a pattern that requires an XORI above, we can fold the XORI with
+// the NEG. The XORI is equivalent to 1-X and negating gives X-1.
+def : Pat<(ineg (setuge GPR:$rs1, GPR:$rs2)), (ADDI (SLTU GPR:$rs1, GPR:$rs2), -1)>;
+def : Pat<(ineg (setule GPR:$rs1, GPR:$rs2)), (ADDI (SLTU GPR:$rs2, GPR:$rs1), -1)>;
+def : Pat<(ineg (setge GPR:$rs1, GPR:$rs2)), (ADDI (SLT GPR:$rs1, GPR:$rs2), -1)>;
+def : Pat<(ineg (setle GPR:$rs1, GPR:$rs2)), (ADDI (SLT GPR:$rs2, GPR:$rs1), -1)>;
+
def IntCCtoRISCVCC : SDNodeXForm<riscv_selectcc, [{
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
RISCVCC::CondCode BrCC = getRISCVCCForIntCC(CC);
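
The "1-X versus X-1" reasoning in the comment above can be checked directly for the setuge case (illustrative C++ only, not part of the patch, arbitrary test values): negating the XORI(SLTU, 1) form gives the same value as ADDI(SLTU, -1).

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t Vals[] = {0, 1, 5, UINT64_C(0x8000000000000000), ~UINT64_C(0)};
      for (uint64_t A : Vals)
        for (uint64_t B : Vals) {
          int64_t Neg  = -(int64_t)(A >= B ? 1 : 0);   // ineg (setuge A, B)
          int64_t Fold = (int64_t)(A < B ? 1 : 0) - 1; // ADDI (SLTU A, B), -1
          assert(Neg == Fold);
        }
      return 0;
    }
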
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
index 72ba8460116f..662604b138d2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -24,7 +24,7 @@ def riscv_remuw : SDNode<"RISCVISD::REMUW", SDT_RISCVIntBinOpW>;
// Instructions
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtM] in {
+let Predicates = [HasStdExtMOrZmmul] in {
def MUL : ALU_rr<0b0000001, 0b000, "mul", /*Commutable*/1>,
Sched<[WriteIMul, ReadIMul, ReadIMul]>;
def MULH : ALU_rr<0b0000001, 0b001, "mulh", /*Commutable*/1>,
@@ -33,6 +33,9 @@ def MULHSU : ALU_rr<0b0000001, 0b010, "mulhsu">,
Sched<[WriteIMul, ReadIMul, ReadIMul]>;
def MULHU : ALU_rr<0b0000001, 0b011, "mulhu", /*Commutable*/1>,
Sched<[WriteIMul, ReadIMul, ReadIMul]>;
+} // Predicates = [HasStdExtMOrZmmul]
+
+let Predicates = [HasStdExtM] in {
def DIV : ALU_rr<0b0000001, 0b100, "div">,
Sched<[WriteIDiv, ReadIDiv, ReadIDiv]>;
def DIVU : ALU_rr<0b0000001, 0b101, "divu">,
@@ -43,9 +46,12 @@ def REMU : ALU_rr<0b0000001, 0b111, "remu">,
Sched<[WriteIDiv, ReadIDiv, ReadIDiv]>;
} // Predicates = [HasStdExtM]
-let Predicates = [HasStdExtM, IsRV64] in {
+let Predicates = [HasStdExtMOrZmmul, IsRV64] in {
def MULW : ALUW_rr<0b0000001, 0b000, "mulw", /*Commutable*/1>,
Sched<[WriteIMul32, ReadIMul32, ReadIMul32]>;
+} // Predicates = [HasStdExtMOrZmmul, IsRV64]
+
+let Predicates = [HasStdExtM, IsRV64] in {
def DIVW : ALUW_rr<0b0000001, 0b100, "divw">,
Sched<[WriteIDiv32, ReadIDiv32, ReadIDiv32]>;
def DIVUW : ALUW_rr<0b0000001, 0b101, "divuw">,
@@ -60,21 +66,25 @@ def REMUW : ALUW_rr<0b0000001, 0b111, "remuw">,
// Pseudo-instructions and codegen patterns
//===----------------------------------------------------------------------===//
-let Predicates = [HasStdExtM] in {
+let Predicates = [HasStdExtMOrZmmul] in {
def : PatGprGpr<mul, MUL>;
def : PatGprGpr<mulhs, MULH>;
def : PatGprGpr<mulhu, MULHU>;
def : PatGprGpr<riscv_mulhsu, MULHSU>;
+} // Predicates = [HasStdExtMOrZmmul]
+
+let Predicates = [HasStdExtM] in {
def : PatGprGpr<sdiv, DIV>;
def : PatGprGpr<udiv, DIVU>;
def : PatGprGpr<srem, REM>;
def : PatGprGpr<urem, REMU>;
} // Predicates = [HasStdExtM]
-let Predicates = [HasStdExtM, IsRV64] in {
// Select W instructions if only the lower 32-bits of the result are used.
+let Predicates = [HasStdExtMOrZmmul, IsRV64] in
def : PatGprGpr<binop_allwusers<mul>, MULW>;
+let Predicates = [HasStdExtM, IsRV64] in {
def : PatGprGpr<riscv_divw, DIVW>;
def : PatGprGpr<riscv_divuw, DIVUW>;
def : PatGprGpr<riscv_remuw, REMUW>;
@@ -96,11 +106,11 @@ def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))),
(REMW GPR:$rs1, GPR:$rs2)>;
} // Predicates = [HasStdExtM, IsRV64]
-let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in {
+let Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba] in {
// Special case for calculating the full 64-bit product of a 32x32 unsigned
// multiply where the inputs aren't known to be zero extended. We can shift the
// inputs left by 32 and use a MULHU. This saves two SRLIs needed to finish
// zeroing the upper 32 bits.
def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))),
(MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>;
-} // Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba]
+} // Predicates = [HasStdExtMOrZmmul, IsRV64, NotHasStdExtZba]
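
The MULHU trick in the pattern above follows from the 128-bit product being exactly (a * b) << 64 once both inputs are shifted left by 32; a standalone check (illustrative C++ only, not part of the patch, uses the GCC/Clang __int128 extension, arbitrary test values):

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t Vals[] = {0, 1, UINT64_C(0x7fffffff), UINT64_C(0xffffffff),
                               UINT64_C(0x123456789abcdef0)}; // upper bits ignored
      for (uint64_t A : Vals)
        for (uint64_t B : Vals) {
          uint64_t Expected =
              (A & UINT64_C(0xffffffff)) * (B & UINT64_C(0xffffffff));
          unsigned __int128 Prod =
              (unsigned __int128)(A << 32) * (unsigned __int128)(B << 32);
          assert((uint64_t)(Prod >> 64) == Expected);         // MULHU result
        }
      return 0;
    }
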
diff --git a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
index dadf8f81a2c0..920729e9ebbf 100644
--- a/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSExtWRemoval.cpp
@@ -443,8 +443,7 @@ bool RISCVSExtWRemoval::runOnMachineFunction(MachineFunction &MF) {
MachineInstr *MI = &*I++;
// We're looking for the sext.w pattern ADDIW rd, rs1, 0.
- if (MI->getOpcode() != RISCV::ADDIW || !MI->getOperand(2).isImm() ||
- MI->getOperand(2).getImm() != 0 || !MI->getOperand(1).isReg())
+ if (!RISCV::isSEXT_W(*MI))
continue;
// Input should be a virtual register.
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index 7589b44b81d3..0446edefa979 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -202,11 +202,9 @@ bool RISCVSubtarget::useRVVForFixedLengthVectors() const {
}
bool RISCVSubtarget::enableSubRegLiveness() const {
- if (EnableSubRegLiveness.getNumOccurrences())
- return EnableSubRegLiveness;
- // Enable subregister liveness for RVV to better handle LMUL>1 and segment
- // load/store.
- return hasVInstructions();
+ // FIXME: Enable subregister liveness by default for RVV to better handle
+ // LMUL>1 and segment load/store.
+ return EnableSubRegLiveness;
}
void RISCVSubtarget::getPostRAMutations(
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 831f7fadaa62..6eb949fa551c 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -89,6 +89,7 @@ private:
bool HasStdExtZicbom = false;
bool HasStdExtZicboz = false;
bool HasStdExtZicbop = false;
+ bool HasStdExtZmmul = false;
bool HasRV64 = false;
bool IsRV32E = false;
bool EnableLinkerRelax = false;
@@ -184,6 +185,7 @@ public:
bool hasStdExtZicbom() const { return HasStdExtZicbom; }
bool hasStdExtZicboz() const { return HasStdExtZicboz; }
bool hasStdExtZicbop() const { return HasStdExtZicbop; }
+ bool hasStdExtZmmul() const { return HasStdExtZmmul; }
bool is64Bit() const { return HasRV64; }
bool isRV32E() const { return IsRV32E; }
bool enableLinkerRelax() const { return EnableLinkerRelax; }
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index b2707b753e87..50fcb00e6c63 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -49,6 +49,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
initializeGlobalISel(*PR);
initializeRISCVMakeCompressibleOptPass(*PR);
initializeRISCVGatherScatterLoweringPass(*PR);
+ initializeRISCVCodeGenPreparePass(*PR);
initializeRISCVMergeBaseOffsetOptPass(*PR);
initializeRISCVSExtWRemovalPass(*PR);
initializeRISCVExpandPseudoPass(*PR);
@@ -187,7 +188,11 @@ TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
void RISCVPassConfig::addIRPasses() {
addPass(createAtomicExpandPass());
- addPass(createRISCVGatherScatterLoweringPass());
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createRISCVGatherScatterLoweringPass());
+
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createRISCVCodeGenPreparePass());
TargetPassConfig::addIRPasses();
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 29d3c5e491de..f9cd5ffb512b 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -65,7 +65,7 @@ InstructionCost RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
return TTI::TCC_Free;
// zext.w
- if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZbb())
+ if (Imm == UINT64_C(0xffffffff) && ST->hasStdExtZba())
return TTI::TCC_Free;
LLVM_FALLTHROUGH;
case Instruction::Add:
@@ -198,6 +198,9 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// vid.v v9
// vrsub.vx v10, v9, a0
// vrgather.vv v9, v8, v10
+ if (Tp->getElementType()->isIntegerTy(1))
+ // Mask operations additionally require an extend and a truncate.
+ return LT.first * 9;
return LT.first * 6;
}
}
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
index 1a3e35a5f901..220fd76305aa 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp
@@ -1068,5 +1068,15 @@ StringRef getKernelProfilingInfoName(KernelProfilingInfo e) {
}
llvm_unreachable("Unexpected operand");
}
+
+std::string getExtInstSetName(InstructionSet e) {
+ switch (e) {
+ CASE(InstructionSet, OpenCL_std)
+ CASE(InstructionSet, GLSL_std_450)
+ CASE(InstructionSet, SPV_AMD_shader_trinary_minmax)
+ break;
+ }
+ llvm_unreachable("Unexpected operand");
+}
} // namespace SPIRV
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
index 2aa9f076c78e..9482723993a2 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h
@@ -706,6 +706,19 @@ enum class KernelProfilingInfo : uint32_t {
CmdExecTime = 0x1,
};
StringRef getKernelProfilingInfoName(KernelProfilingInfo e);
+
+enum class InstructionSet : uint32_t {
+ OpenCL_std = 0,
+ GLSL_std_450 = 1,
+ SPV_AMD_shader_trinary_minmax = 2,
+};
+std::string getExtInstSetName(InstructionSet e);
+
+// TODO: implement other mnemonics.
+enum class Opcode : uint32_t {
+ InBoundsPtrAccessChain = 70,
+ PtrCastToGeneric = 121,
+};
} // namespace SPIRV
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index 3105baa02c90..d60e61f36270 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -59,7 +59,7 @@ void SPIRVInstPrinter::printOpConstantVarOps(const MCInst *MI,
}
void SPIRVInstPrinter::recordOpExtInstImport(const MCInst *MI) {
- llvm_unreachable("Unimplemented recordOpExtInstImport");
+ // TODO: insert {Reg, Set} into ExtInstSetIDs map.
}
void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address,
@@ -176,7 +176,18 @@ void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address,
}
void SPIRVInstPrinter::printOpExtInst(const MCInst *MI, raw_ostream &O) {
- llvm_unreachable("Unimplemented printOpExtInst");
+ // The fixed operands have already been printed, so we just need to decide
+ // what type of ExtInst operands to print based on the instruction set and number.
+ MCInstrDesc MCDesc = MII.get(MI->getOpcode());
+ unsigned NumFixedOps = MCDesc.getNumOperands();
+ const auto NumOps = MI->getNumOperands();
+ if (NumOps == NumFixedOps)
+ return;
+
+ O << ' ';
+
+ // TODO: implement special printing for OpenCLExtInst::vstor*.
+ printRemainingVariableOps(MI, NumFixedOps, O, true);
}
void SPIRVInstPrinter::printOpDecorate(const MCInst *MI, raw_ostream &O) {
diff --git a/llvm/lib/Target/SPIRV/SPIRV.h b/llvm/lib/Target/SPIRV/SPIRV.h
index 8da54a5d6e61..5a7f2e51afb8 100644
--- a/llvm/lib/Target/SPIRV/SPIRV.h
+++ b/llvm/lib/Target/SPIRV/SPIRV.h
@@ -19,6 +19,7 @@ class SPIRVSubtarget;
class InstructionSelector;
class RegisterBankInfo;
+ModulePass *createSPIRVPrepareFunctionsPass();
FunctionPass *createSPIRVPreLegalizerPass();
FunctionPass *createSPIRVEmitIntrinsicsPass(SPIRVTargetMachine *TM);
InstructionSelector *
diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
index 605bf949187f..6d60bd5e3c97 100644
--- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp
@@ -21,6 +21,7 @@
#include "SPIRVUtils.h"
#include "TargetInfo/SPIRVTargetInfo.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -58,9 +59,14 @@ public:
void outputModuleSection(SPIRV::ModuleSectionType MSType);
void outputEntryPoints();
void outputDebugSourceAndStrings(const Module &M);
+ void outputOpExtInstImports(const Module &M);
void outputOpMemoryModel();
void outputOpFunctionEnd();
void outputExtFuncDecls();
+ void outputExecutionModeFromMDNode(Register Reg, MDNode *Node,
+ SPIRV::ExecutionMode EM);
+ void outputExecutionMode(const Module &M);
+ void outputAnnotations(const Module &M);
void outputModuleSections();
void emitInstruction(const MachineInstr *MI) override;
@@ -127,6 +133,8 @@ void SPIRVAsmPrinter::emitFunctionBodyEnd() {
}
void SPIRVAsmPrinter::emitOpLabel(const MachineBasicBlock &MBB) {
+ if (MAI->MBBsToSkip.contains(&MBB))
+ return;
MCInst LabelInst;
LabelInst.setOpcode(SPIRV::OpLabel);
LabelInst.addOperand(MCOperand::createReg(MAI->getOrCreateMBBRegister(MBB)));
@@ -237,6 +245,13 @@ void SPIRVAsmPrinter::outputModuleSection(SPIRV::ModuleSectionType MSType) {
}
void SPIRVAsmPrinter::outputDebugSourceAndStrings(const Module &M) {
+ // Output OpSourceExtensions.
+ for (auto &Str : MAI->SrcExt) {
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpSourceExtension);
+ addStringImm(Str.first(), Inst);
+ outputMCInst(Inst);
+ }
// Output OpSource.
MCInst Inst;
Inst.setOpcode(SPIRV::OpSource);
@@ -246,6 +261,19 @@ void SPIRVAsmPrinter::outputDebugSourceAndStrings(const Module &M) {
outputMCInst(Inst);
}
+void SPIRVAsmPrinter::outputOpExtInstImports(const Module &M) {
+ for (auto &CU : MAI->ExtInstSetMap) {
+ unsigned Set = CU.first;
+ Register Reg = CU.second;
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpExtInstImport);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ addStringImm(getExtInstSetName(static_cast<SPIRV::InstructionSet>(Set)),
+ Inst);
+ outputMCInst(Inst);
+ }
+}
+
void SPIRVAsmPrinter::outputOpMemoryModel() {
MCInst Inst;
Inst.setOpcode(SPIRV::OpMemoryModel);
@@ -301,6 +329,135 @@ void SPIRVAsmPrinter::outputExtFuncDecls() {
}
}
+// Encode LLVM type by SPIR-V execution mode VecTypeHint.
+static unsigned encodeVecTypeHint(Type *Ty) {
+ if (Ty->isHalfTy())
+ return 4;
+ if (Ty->isFloatTy())
+ return 5;
+ if (Ty->isDoubleTy())
+ return 6;
+ if (IntegerType *IntTy = dyn_cast<IntegerType>(Ty)) {
+ switch (IntTy->getIntegerBitWidth()) {
+ case 8:
+ return 0;
+ case 16:
+ return 1;
+ case 32:
+ return 2;
+ case 64:
+ return 3;
+ default:
+ llvm_unreachable("invalid integer type");
+ }
+ }
+ if (FixedVectorType *VecTy = dyn_cast<FixedVectorType>(Ty)) {
+ Type *EleTy = VecTy->getElementType();
+ unsigned Size = VecTy->getNumElements();
+ return Size << 16 | encodeVecTypeHint(EleTy);
+ }
+ llvm_unreachable("invalid type");
+}
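
The encoding produced by encodeVecTypeHint packs the element count above bit 16 and the scalar type code in the low bits; for example a <4 x float> hint encodes as (4 << 16) | 5. A trivial standalone check (illustrative C++ only, not part of the patch):

    #include <cassert>

    int main() {
      unsigned FloatCode = 5;     // float, per encodeVecTypeHint above
      unsigned NumElements = 4;
      unsigned Encoded = NumElements << 16 | FloatCode;
      assert(Encoded == 0x40005u);
      return 0;
    }
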
+
+static void addOpsFromMDNode(MDNode *MDN, MCInst &Inst,
+ SPIRV::ModuleAnalysisInfo *MAI) {
+ for (const MDOperand &MDOp : MDN->operands()) {
+ if (auto *CMeta = dyn_cast<ConstantAsMetadata>(MDOp)) {
+ Constant *C = CMeta->getValue();
+ if (ConstantInt *Const = dyn_cast<ConstantInt>(C)) {
+ Inst.addOperand(MCOperand::createImm(Const->getZExtValue()));
+ } else if (auto *CE = dyn_cast<Function>(C)) {
+ Register FuncReg = MAI->getFuncReg(CE->getName().str());
+ assert(FuncReg.isValid());
+ Inst.addOperand(MCOperand::createReg(FuncReg));
+ }
+ }
+ }
+}
+
+void SPIRVAsmPrinter::outputExecutionModeFromMDNode(Register Reg, MDNode *Node,
+ SPIRV::ExecutionMode EM) {
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpExecutionMode);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ Inst.addOperand(MCOperand::createImm(static_cast<unsigned>(EM)));
+ addOpsFromMDNode(Node, Inst, MAI);
+ outputMCInst(Inst);
+}
+
+void SPIRVAsmPrinter::outputExecutionMode(const Module &M) {
+ NamedMDNode *Node = M.getNamedMetadata("spirv.ExecutionMode");
+ if (Node) {
+ for (unsigned i = 0; i < Node->getNumOperands(); i++) {
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpExecutionMode);
+ addOpsFromMDNode(cast<MDNode>(Node->getOperand(i)), Inst, MAI);
+ outputMCInst(Inst);
+ }
+ }
+ for (auto FI = M.begin(), E = M.end(); FI != E; ++FI) {
+ const Function &F = *FI;
+ if (F.isDeclaration())
+ continue;
+ Register FReg = MAI->getFuncReg(F.getGlobalIdentifier());
+ assert(FReg.isValid());
+ if (MDNode *Node = F.getMetadata("reqd_work_group_size"))
+ outputExecutionModeFromMDNode(FReg, Node,
+ SPIRV::ExecutionMode::LocalSize);
+ if (MDNode *Node = F.getMetadata("work_group_size_hint"))
+ outputExecutionModeFromMDNode(FReg, Node,
+ SPIRV::ExecutionMode::LocalSizeHint);
+ if (MDNode *Node = F.getMetadata("intel_reqd_sub_group_size"))
+ outputExecutionModeFromMDNode(FReg, Node,
+ SPIRV::ExecutionMode::SubgroupSize);
+ if (MDNode *Node = F.getMetadata("vec_type_hint")) {
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpExecutionMode);
+ Inst.addOperand(MCOperand::createReg(FReg));
+ unsigned EM = static_cast<unsigned>(SPIRV::ExecutionMode::VecTypeHint);
+ Inst.addOperand(MCOperand::createImm(EM));
+ unsigned TypeCode = encodeVecTypeHint(getMDOperandAsType(Node, 0));
+ Inst.addOperand(MCOperand::createImm(TypeCode));
+ outputMCInst(Inst);
+ }
+ }
+}
+
+void SPIRVAsmPrinter::outputAnnotations(const Module &M) {
+ outputModuleSection(SPIRV::MB_Annotations);
+ // Process llvm.global.annotations special global variable.
+ for (auto F = M.global_begin(), E = M.global_end(); F != E; ++F) {
+ if ((*F).getName() != "llvm.global.annotations")
+ continue;
+ const GlobalVariable *V = &(*F);
+ const ConstantArray *CA = cast<ConstantArray>(V->getOperand(0));
+ for (Value *Op : CA->operands()) {
+ ConstantStruct *CS = cast<ConstantStruct>(Op);
+ // The first field of the struct contains a pointer to
+ // the annotated variable.
+ Value *AnnotatedVar = CS->getOperand(0)->stripPointerCasts();
+ if (!isa<Function>(AnnotatedVar))
+ llvm_unreachable("Unsupported value in llvm.global.annotations");
+ Function *Func = cast<Function>(AnnotatedVar);
+ Register Reg = MAI->getFuncReg(Func->getGlobalIdentifier());
+
+ // The second field contains a pointer to a global annotation string.
+ GlobalVariable *GV =
+ cast<GlobalVariable>(CS->getOperand(1)->stripPointerCasts());
+
+ StringRef AnnotationString;
+ getConstantStringInfo(GV, AnnotationString);
+ MCInst Inst;
+ Inst.setOpcode(SPIRV::OpDecorate);
+ Inst.addOperand(MCOperand::createReg(Reg));
+ unsigned Dec = static_cast<unsigned>(SPIRV::Decoration::UserSemantic);
+ Inst.addOperand(MCOperand::createImm(Dec));
+ addStringImm(AnnotationString, Inst);
+ outputMCInst(Inst);
+ }
+ }
+}
+
void SPIRVAsmPrinter::outputModuleSections() {
const Module *M = MMI->getModule();
// Get the global subtarget to output module-level info.
@@ -311,13 +468,14 @@ void SPIRVAsmPrinter::outputModuleSections() {
// Output instructions according to the Logical Layout of a Module:
// TODO: 1,2. All OpCapability instructions, then optional OpExtension
// instructions.
- // TODO: 3. Optional OpExtInstImport instructions.
+ // 3. Optional OpExtInstImport instructions.
+ outputOpExtInstImports(*M);
// 4. The single required OpMemoryModel instruction.
outputOpMemoryModel();
// 5. All entry point declarations, using OpEntryPoint.
outputEntryPoints();
// 6. Execution-mode declarations, using OpExecutionMode or OpExecutionModeId.
- // TODO:
+ outputExecutionMode(*M);
// 7a. Debug: all OpString, OpSourceExtension, OpSource, and
// OpSourceContinued, without forward references.
outputDebugSourceAndStrings(*M);
@@ -326,7 +484,7 @@ void SPIRVAsmPrinter::outputModuleSections() {
// 7c. Debug: all OpModuleProcessed instructions.
outputModuleSection(SPIRV::MB_DebugModuleProcessed);
// 8. All annotation instructions (all decorations).
- outputModuleSection(SPIRV::MB_Annotations);
+ outputAnnotations(*M);
// 9. All type declarations (OpTypeXXX instructions), all constant
// instructions, and all global variable declarations. This section is
// the first section to allow use of: OpLine and OpNoLine debug information;
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 5b6b82aebf30..e8fedfeffde7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -24,9 +24,8 @@
using namespace llvm;
SPIRVCallLowering::SPIRVCallLowering(const SPIRVTargetLowering &TLI,
- const SPIRVSubtarget &ST,
SPIRVGlobalRegistry *GR)
- : CallLowering(&TLI), ST(ST), GR(GR) {}
+ : CallLowering(&TLI), GR(GR) {}
bool SPIRVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val, ArrayRef<Register> VRegs,
@@ -36,11 +35,13 @@ bool SPIRVCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
// TODO: handle the case of multiple registers.
if (VRegs.size() > 1)
return false;
- if (Val)
+ if (Val) {
+ const auto &STI = MIRBuilder.getMF().getSubtarget();
return MIRBuilder.buildInstr(SPIRV::OpReturnValue)
.addUse(VRegs[0])
- .constrainAllUses(MIRBuilder.getTII(), *ST.getRegisterInfo(),
- *ST.getRegBankInfo());
+ .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+ *STI.getRegBankInfo());
+ }
MIRBuilder.buildInstr(SPIRV::OpReturn);
return true;
}
@@ -63,6 +64,56 @@ static uint32_t getFunctionControl(const Function &F) {
return FuncControl;
}
+static ConstantInt *getConstInt(MDNode *MD, unsigned NumOp) {
+ if (MD->getNumOperands() > NumOp) {
+ auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(NumOp));
+ if (CMeta)
+ return dyn_cast<ConstantInt>(CMeta->getValue());
+ }
+ return nullptr;
+}
+
+// This code restores the function argument/return value types for composite
+// cases, because the final types should still be aggregates whereas they are
+// lowered to i32 during the translation to cope with aggregate flattening etc.
+static FunctionType *getOriginalFunctionType(const Function &F) {
+ auto *NamedMD = F.getParent()->getNamedMetadata("spv.cloned_funcs");
+ if (NamedMD == nullptr)
+ return F.getFunctionType();
+
+ Type *RetTy = F.getFunctionType()->getReturnType();
+ SmallVector<Type *, 4> ArgTypes;
+ for (auto &Arg : F.args())
+ ArgTypes.push_back(Arg.getType());
+
+ auto ThisFuncMDIt =
+ std::find_if(NamedMD->op_begin(), NamedMD->op_end(), [&F](MDNode *N) {
+ return isa<MDString>(N->getOperand(0)) &&
+ cast<MDString>(N->getOperand(0))->getString() == F.getName();
+ });
+ // TODO: probably one function can have numerous type mutations,
+ // so we should support this.
+ if (ThisFuncMDIt != NamedMD->op_end()) {
+ auto *ThisFuncMD = *ThisFuncMDIt;
+ MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(1));
+ assert(MD && "MDNode operand is expected");
+ ConstantInt *Const = getConstInt(MD, 0);
+ if (Const) {
+ auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1));
+ assert(CMeta && "ConstantAsMetadata operand is expected");
+ assert(Const->getSExtValue() >= -1);
+ // Currently -1 indicates return value, greater values mean
+ // argument numbers.
+ if (Const->getSExtValue() == -1)
+ RetTy = CMeta->getType();
+ else
+ ArgTypes[Const->getSExtValue()] = CMeta->getType();
+ }
+ }
+
+ return FunctionType::get(RetTy, ArgTypes, F.isVarArg());
+}
+
bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
ArrayRef<ArrayRef<Register>> VRegs,
@@ -71,7 +122,8 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
GR->setCurrentFunc(MIRBuilder.getMF());
// Assign types and names to all args, and store their types for later.
- SmallVector<Register, 4> ArgTypeVRegs;
+ FunctionType *FTy = getOriginalFunctionType(F);
+ SmallVector<SPIRVType *, 4> ArgTypeVRegs;
if (VRegs.size() > 0) {
unsigned i = 0;
for (const auto &Arg : F.args()) {
@@ -79,9 +131,18 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
// TODO: handle the case of multiple registers.
if (VRegs[i].size() > 1)
return false;
- auto *SpirvTy =
- GR->assignTypeToVReg(Arg.getType(), VRegs[i][0], MIRBuilder);
- ArgTypeVRegs.push_back(GR->getSPIRVTypeID(SpirvTy));
+ Type *ArgTy = FTy->getParamType(i);
+ SPIRV::AccessQualifier AQ = SPIRV::AccessQualifier::ReadWrite;
+ MDNode *Node = F.getMetadata("kernel_arg_access_qual");
+ if (Node && i < Node->getNumOperands()) {
+ StringRef AQString = cast<MDString>(Node->getOperand(i))->getString();
+ if (AQString.compare("read_only") == 0)
+ AQ = SPIRV::AccessQualifier::ReadOnly;
+ else if (AQString.compare("write_only") == 0)
+ AQ = SPIRV::AccessQualifier::WriteOnly;
+ }
+ auto *SpirvTy = GR->assignTypeToVReg(ArgTy, VRegs[i][0], MIRBuilder, AQ);
+ ArgTypeVRegs.push_back(SpirvTy);
if (Arg.hasName())
buildOpName(VRegs[i][0], Arg.getName(), MIRBuilder);
@@ -92,8 +153,10 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
SPIRV::Decoration::MaxByteOffset, {DerefBytes});
}
if (Arg.hasAttribute(Attribute::Alignment)) {
+ auto Alignment = static_cast<unsigned>(
+ Arg.getAttribute(Attribute::Alignment).getValueAsInt());
buildOpDecorate(VRegs[i][0], MIRBuilder, SPIRV::Decoration::Alignment,
- {static_cast<unsigned>(Arg.getParamAlignment())});
+ {Alignment});
}
if (Arg.hasAttribute(Attribute::ReadOnly)) {
auto Attr =
@@ -107,6 +170,38 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
buildOpDecorate(VRegs[i][0], MIRBuilder,
SPIRV::Decoration::FuncParamAttr, {Attr});
}
+ if (Arg.hasAttribute(Attribute::NoAlias)) {
+ auto Attr =
+ static_cast<unsigned>(SPIRV::FunctionParameterAttribute::NoAlias);
+ buildOpDecorate(VRegs[i][0], MIRBuilder,
+ SPIRV::Decoration::FuncParamAttr, {Attr});
+ }
+ Node = F.getMetadata("kernel_arg_type_qual");
+ if (Node && i < Node->getNumOperands()) {
+ StringRef TypeQual = cast<MDString>(Node->getOperand(i))->getString();
+ if (TypeQual.compare("volatile") == 0)
+ buildOpDecorate(VRegs[i][0], MIRBuilder, SPIRV::Decoration::Volatile,
+ {});
+ }
+ Node = F.getMetadata("spirv.ParameterDecorations");
+ if (Node && i < Node->getNumOperands() &&
+ isa<MDNode>(Node->getOperand(i))) {
+ MDNode *MD = cast<MDNode>(Node->getOperand(i));
+ for (const MDOperand &MDOp : MD->operands()) {
+ MDNode *MD2 = dyn_cast<MDNode>(MDOp);
+ assert(MD2 && "Metadata operand is expected");
+ ConstantInt *Const = getConstInt(MD2, 0);
+ assert(Const && "MDOperand should be ConstantInt");
+ auto Dec = static_cast<SPIRV::Decoration>(Const->getZExtValue());
+ std::vector<uint32_t> DecVec;
+ for (unsigned j = 1; j < MD2->getNumOperands(); j++) {
+ ConstantInt *Const = getConstInt(MD2, j);
+ assert(Const && "MDOperand should be ConstantInt");
+ DecVec.push_back(static_cast<uint32_t>(Const->getZExtValue()));
+ }
+ buildOpDecorate(VRegs[i][0], MIRBuilder, Dec, DecVec);
+ }
+ }
++i;
}
}
@@ -117,30 +212,30 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
MRI->setRegClass(FuncVReg, &SPIRV::IDRegClass);
if (F.isDeclaration())
GR->add(&F, &MIRBuilder.getMF(), FuncVReg);
-
- auto *FTy = F.getFunctionType();
- auto FuncTy = GR->assignTypeToVReg(FTy, FuncVReg, MIRBuilder);
+ SPIRVType *RetTy = GR->getOrCreateSPIRVType(FTy->getReturnType(), MIRBuilder);
+ SPIRVType *FuncTy = GR->getOrCreateOpTypeFunctionWithArgs(
+ FTy, RetTy, ArgTypeVRegs, MIRBuilder);
// Build the OpTypeFunction declaring it.
- Register ReturnTypeID = FuncTy->getOperand(1).getReg();
uint32_t FuncControl = getFunctionControl(F);
MIRBuilder.buildInstr(SPIRV::OpFunction)
.addDef(FuncVReg)
- .addUse(ReturnTypeID)
+ .addUse(GR->getSPIRVTypeID(RetTy))
.addImm(FuncControl)
.addUse(GR->getSPIRVTypeID(FuncTy));
// Add OpFunctionParameters.
- const unsigned NumArgs = ArgTypeVRegs.size();
- for (unsigned i = 0; i < NumArgs; ++i) {
+ int i = 0;
+ for (const auto &Arg : F.args()) {
assert(VRegs[i].size() == 1 && "Formal arg has multiple vregs");
MRI->setRegClass(VRegs[i][0], &SPIRV::IDRegClass);
MIRBuilder.buildInstr(SPIRV::OpFunctionParameter)
.addDef(VRegs[i][0])
- .addUse(ArgTypeVRegs[i]);
+ .addUse(GR->getSPIRVTypeID(ArgTypeVRegs[i]));
if (F.isDeclaration())
- GR->add(F.getArg(i), &MIRBuilder.getMF(), VRegs[i][0]);
+ GR->add(&Arg, &MIRBuilder.getMF(), VRegs[i][0]);
+ i++;
}
// Name the function.
if (F.hasName())
@@ -169,48 +264,51 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// TODO: handle the case of multiple registers.
if (Info.OrigRet.Regs.size() > 1)
return false;
+ MachineFunction &MF = MIRBuilder.getMF();
+ GR->setCurrentFunc(MF);
+ FunctionType *FTy = nullptr;
+ const Function *CF = nullptr;
- GR->setCurrentFunc(MIRBuilder.getMF());
- Register ResVReg =
- Info.OrigRet.Regs.empty() ? Register(0) : Info.OrigRet.Regs[0];
// Emit a regular OpFunctionCall. If it's an externally declared function,
- // be sure to emit its type and function declaration here. It will be
- // hoisted globally later.
+ // be sure to emit its type and function declaration here. It will be hoisted
+ // globally later.
if (Info.Callee.isGlobal()) {
- auto *CF = dyn_cast_or_null<const Function>(Info.Callee.getGlobal());
+ CF = dyn_cast_or_null<const Function>(Info.Callee.getGlobal());
// TODO: support constexpr casts and indirect calls.
if (CF == nullptr)
return false;
- if (CF->isDeclaration()) {
- // Emit the type info and forward function declaration to the first MBB
- // to ensure VReg definition dependencies are valid across all MBBs.
- MachineBasicBlock::iterator OldII = MIRBuilder.getInsertPt();
- MachineBasicBlock &OldBB = MIRBuilder.getMBB();
- MachineBasicBlock &FirstBB = *MIRBuilder.getMF().getBlockNumbered(0);
- MIRBuilder.setInsertPt(FirstBB, FirstBB.instr_end());
-
- SmallVector<ArrayRef<Register>, 8> VRegArgs;
- SmallVector<SmallVector<Register, 1>, 8> ToInsert;
- for (const Argument &Arg : CF->args()) {
- if (MIRBuilder.getDataLayout().getTypeStoreSize(Arg.getType()).isZero())
- continue; // Don't handle zero sized types.
- ToInsert.push_back({MIRBuilder.getMRI()->createGenericVirtualRegister(
- LLT::scalar(32))});
- VRegArgs.push_back(ToInsert.back());
- }
- // TODO: Reuse FunctionLoweringInfo.
- FunctionLoweringInfo FuncInfo;
- lowerFormalArguments(MIRBuilder, *CF, VRegArgs, FuncInfo);
- MIRBuilder.setInsertPt(OldBB, OldII);
+ FTy = getOriginalFunctionType(*CF);
+ }
+
+ Register ResVReg =
+ Info.OrigRet.Regs.empty() ? Register(0) : Info.OrigRet.Regs[0];
+ if (CF && CF->isDeclaration() &&
+ !GR->find(CF, &MIRBuilder.getMF()).isValid()) {
+ // Emit the type info and forward function declaration to the first MBB
+ // to ensure VReg definition dependencies are valid across all MBBs.
+ MachineIRBuilder FirstBlockBuilder;
+ FirstBlockBuilder.setMF(MF);
+ FirstBlockBuilder.setMBB(*MF.getBlockNumbered(0));
+
+ SmallVector<ArrayRef<Register>, 8> VRegArgs;
+ SmallVector<SmallVector<Register, 1>, 8> ToInsert;
+ for (const Argument &Arg : CF->args()) {
+ if (MIRBuilder.getDataLayout().getTypeStoreSize(Arg.getType()).isZero())
+ continue; // Don't handle zero sized types.
+ ToInsert.push_back(
+ {MIRBuilder.getMRI()->createGenericVirtualRegister(LLT::scalar(32))});
+ VRegArgs.push_back(ToInsert.back());
}
+ // TODO: Reuse FunctionLoweringInfo
+ FunctionLoweringInfo FuncInfo;
+ lowerFormalArguments(FirstBlockBuilder, *CF, VRegArgs, FuncInfo);
}
// Make sure there's a valid return reg, even for functions returning void.
- if (!ResVReg.isValid()) {
+ if (!ResVReg.isValid())
ResVReg = MIRBuilder.getMRI()->createVirtualRegister(&SPIRV::IDRegClass);
- }
SPIRVType *RetType =
- GR->assignTypeToVReg(Info.OrigRet.Ty, ResVReg, MIRBuilder);
+ GR->assignTypeToVReg(FTy->getReturnType(), ResVReg, MIRBuilder);
// Emit the OpFunctionCall and its args.
auto MIB = MIRBuilder.buildInstr(SPIRV::OpFunctionCall)
@@ -224,6 +322,7 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
return false;
MIB.addUse(Arg.Regs[0]);
}
- return MIB.constrainAllUses(MIRBuilder.getTII(), *ST.getRegisterInfo(),
- *ST.getRegBankInfo());
+ const auto &STI = MF.getSubtarget();
+ return MIB.constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
+ *STI.getRegBankInfo());
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.h b/llvm/lib/Target/SPIRV/SPIRVCallLowering.h
index c179bb35154b..c2d6ad82d507 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.h
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.h
@@ -13,23 +13,21 @@
#ifndef LLVM_LIB_TARGET_SPIRV_SPIRVCALLLOWERING_H
#define LLVM_LIB_TARGET_SPIRV_SPIRVCALLLOWERING_H
+#include "SPIRVGlobalRegistry.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
namespace llvm {
class SPIRVGlobalRegistry;
-class SPIRVSubtarget;
class SPIRVTargetLowering;
class SPIRVCallLowering : public CallLowering {
private:
- const SPIRVSubtarget &ST;
// Used to create and assign function, argument, and return type information.
SPIRVGlobalRegistry *GR;
public:
- SPIRVCallLowering(const SPIRVTargetLowering &TLI, const SPIRVSubtarget &ST,
- SPIRVGlobalRegistry *GR);
+ SPIRVCallLowering(const SPIRVTargetLowering &TLI, SPIRVGlobalRegistry *GR);
// Builds OpReturn or OpReturnValue.
bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
index 57cd4bafd351..1926977ea66e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.cpp
@@ -92,4 +92,4 @@ void SPIRVGeneralDuplicatesTracker::buildDepsGraph(
}
}
}
-} \ No newline at end of file
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
index 58ae1f86ce42..ab22c3d2a647 100644
--- a/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
+++ b/llvm/lib/Target/SPIRV/SPIRVDuplicatesTracker.h
@@ -169,6 +169,8 @@ public:
Register find(const Argument *Arg, const MachineFunction *MF) {
return AT.find(const_cast<Argument *>(Arg), MF);
}
+
+ const SPIRVDuplicatesTracker<Type> *getTypes() { return &TT; }
};
} // namespace llvm
-#endif \ No newline at end of file
+#endif // LLVM_LIB_TARGET_SPIRV_SPIRVDUPLICATESTRACKER_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 9624482e3622..0075f547b6d6 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -87,6 +87,7 @@ public:
Instruction *visitLoadInst(LoadInst &I);
Instruction *visitStoreInst(StoreInst &I);
Instruction *visitAllocaInst(AllocaInst &I);
+ Instruction *visitAtomicCmpXchgInst(AtomicCmpXchgInst &I);
bool runOnFunction(Function &F) override;
};
} // namespace
@@ -103,7 +104,7 @@ static inline bool isAssignTypeInstr(const Instruction *I) {
static bool isMemInstrToReplace(Instruction *I) {
return isa<StoreInst>(I) || isa<LoadInst>(I) || isa<InsertValueInst>(I) ||
- isa<ExtractValueInst>(I);
+ isa<ExtractValueInst>(I) || isa<AtomicCmpXchgInst>(I);
}
static bool isAggrToReplace(const Value *V) {
@@ -134,13 +135,14 @@ void SPIRVEmitIntrinsics::replaceMemInstrUses(Instruction *Old,
Instruction *New) {
while (!Old->user_empty()) {
auto *U = Old->user_back();
- if (isMemInstrToReplace(U) || isa<ReturnInst>(U)) {
- U->replaceUsesOfWith(Old, New);
- } else if (isAssignTypeInstr(U)) {
+ if (isAssignTypeInstr(U)) {
IRB->SetInsertPoint(U);
SmallVector<Value *, 2> Args = {New, U->getOperand(1)};
IRB->CreateIntrinsic(Intrinsic::spv_assign_type, {New->getType()}, Args);
U->eraseFromParent();
+ } else if (isMemInstrToReplace(U) || isa<ReturnInst>(U) ||
+ isa<CallInst>(U)) {
+ U->replaceUsesOfWith(Old, New);
} else {
llvm_unreachable("illegal aggregate intrinsic user");
}
@@ -301,10 +303,10 @@ Instruction *SPIRVEmitIntrinsics::visitStoreInst(StoreInst &I) {
MachineMemOperand::Flags Flags =
TLI->getStoreMemOperandFlags(I, F->getParent()->getDataLayout());
auto *PtrOp = I.getPointerOperand();
- auto *NewI =
- IRB->CreateIntrinsic(Intrinsic::spv_store, {PtrOp->getType()},
- {I.getValueOperand(), PtrOp, IRB->getInt16(Flags),
- IRB->getInt8(I.getAlign().value())});
+ auto *NewI = IRB->CreateIntrinsic(
+ Intrinsic::spv_store, {I.getValueOperand()->getType(), PtrOp->getType()},
+ {I.getValueOperand(), PtrOp, IRB->getInt16(Flags),
+ IRB->getInt8(I.getAlign().value())});
I.eraseFromParent();
return NewI;
}
@@ -314,6 +316,22 @@ Instruction *SPIRVEmitIntrinsics::visitAllocaInst(AllocaInst &I) {
return &I;
}
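+// Lower atomic cmpxchg to the spv_cmpxchg intrinsic with the operands
+// (ptr, cmp, val, scope, success-semantics, failure-semantics); the
+// {value, success} aggregate result is rebuilt during instruction selection.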
+Instruction *SPIRVEmitIntrinsics::visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ assert(I.getType()->isAggregateType() && "Aggregate result is expected");
+ SmallVector<Value *> Args;
+ for (auto &Op : I.operands())
+ Args.push_back(Op);
+ Args.push_back(IRB->getInt32(I.getSyncScopeID()));
+ Args.push_back(IRB->getInt32(
+ static_cast<uint32_t>(getMemSemantics(I.getSuccessOrdering()))));
+ Args.push_back(IRB->getInt32(
+ static_cast<uint32_t>(getMemSemantics(I.getFailureOrdering()))));
+ auto *NewI = IRB->CreateIntrinsic(Intrinsic::spv_cmpxchg,
+ {I.getPointerOperand()->getType()}, {Args});
+ replaceMemInstrUses(&I, NewI);
+ return NewI;
+}
+
void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV) {
// Skip special artificial variable llvm.global.annotations.
if (GV.getName() == "llvm.global.annotations")
@@ -351,14 +369,13 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I) {
// Check GetElementPtrConstantExpr case.
(isa<ConstantExpr>(Op) && isa<GEPOperator>(Op))) {
IRB->SetInsertPoint(I);
- buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op);
+ if (isa<UndefValue>(Op) && Op->getType()->isAggregateType())
+ buildIntrWithMD(Intrinsic::spv_assign_type, {IRB->getInt32Ty()}, Op,
+ UndefValue::get(IRB->getInt32Ty()));
+ else
+ buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op);
}
}
- // StoreInst's operand type can be changed in the next stage so we need to
- // store it in the set.
- if (isa<StoreInst>(I) &&
- cast<StoreInst>(I)->getValueOperand()->getType()->isAggregateType())
- AggrStores.insert(I);
}
void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I) {
@@ -378,7 +395,7 @@ void SPIRVEmitIntrinsics::processInstrAfterVisit(Instruction *I) {
if ((isa<ConstantAggregateZero>(Op) && Op->getType()->isVectorTy()) ||
isa<PHINode>(I) || isa<SwitchInst>(I))
TrackConstants = false;
- if (isa<ConstantData>(Op) && TrackConstants) {
+ if ((isa<ConstantData>(Op) || isa<ConstantExpr>(Op)) && TrackConstants) {
unsigned OpNo = Op.getOperandNo();
if (II && ((II->getIntrinsicID() == Intrinsic::spv_gep && OpNo == 0) ||
(II->paramHasAttr(OpNo, Attribute::ImmArg))))
@@ -405,8 +422,20 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) {
AggrConsts.clear();
AggrStores.clear();
- IRB->SetInsertPoint(&Func.getEntryBlock().front());
+ // A StoreInst's operand type can be changed by later transformations, so we
+ // need to record such stores in the set. Also record stores whose types have
+ // already been transformed.
+ for (auto &I : instructions(Func)) {
+ StoreInst *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ continue;
+ Type *ElTy = SI->getValueOperand()->getType();
+ PointerType *PTy = cast<PointerType>(SI->getOperand(1)->getType());
+ if (ElTy->isAggregateType() || ElTy->isVectorTy() ||
+ !PTy->isOpaqueOrPointeeTypeMatches(ElTy))
+ AggrStores.insert(&I);
+ }
+ IRB->SetInsertPoint(&Func.getEntryBlock().front());
for (auto &GV : Func.getParent()->globals())
processGlobalValue(GV);
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 5f890c003cbc..5c8fa7adfbdf 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -24,6 +24,24 @@ using namespace llvm;
SPIRVGlobalRegistry::SPIRVGlobalRegistry(unsigned PointerSize)
: PointerSize(PointerSize) {}
+SPIRVType *SPIRVGlobalRegistry::assignIntTypeToVReg(unsigned BitWidth,
+ Register VReg,
+ MachineInstr &I,
+ const SPIRVInstrInfo &TII) {
+ SPIRVType *SpirvType = getOrCreateSPIRVIntegerType(BitWidth, I, TII);
+ assignSPIRVTypeToVReg(SpirvType, VReg, *CurMF);
+ return SpirvType;
+}
+
+SPIRVType *SPIRVGlobalRegistry::assignVectTypeToVReg(
+ SPIRVType *BaseType, unsigned NumElements, Register VReg, MachineInstr &I,
+ const SPIRVInstrInfo &TII) {
+ SPIRVType *SpirvType =
+ getOrCreateSPIRVVectorType(BaseType, NumElements, I, TII);
+ assignSPIRVTypeToVReg(SpirvType, VReg, *CurMF);
+ return SpirvType;
+}
+
SPIRVType *SPIRVGlobalRegistry::assignTypeToVReg(
const Type *Type, Register VReg, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier AccessQual, bool EmitIR) {
@@ -96,6 +114,65 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeVector(uint32_t NumElems,
return MIB;
}
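+// Finds (or reserves) the VReg that will hold the integer constant Val of the
+// given SPIR-V type, defaulting to i32 when SpvType is null. Returns the
+// register, the matching LLVM ConstantInt, and whether the register was newly
+// created, in which case the caller still has to emit its defining OpConstant.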
+std::tuple<Register, ConstantInt *, bool>
+SPIRVGlobalRegistry::getOrCreateConstIntReg(uint64_t Val, SPIRVType *SpvType,
+ MachineIRBuilder *MIRBuilder,
+ MachineInstr *I,
+ const SPIRVInstrInfo *TII) {
+ const IntegerType *LLVMIntTy;
+ if (SpvType)
+ LLVMIntTy = cast<IntegerType>(getTypeForSPIRVType(SpvType));
+ else
+ LLVMIntTy = IntegerType::getInt32Ty(CurMF->getFunction().getContext());
+ bool NewInstr = false;
+ // Find a constant in DT or build a new one.
+ ConstantInt *CI = ConstantInt::get(const_cast<IntegerType *>(LLVMIntTy), Val);
+ Register Res = DT.find(CI, CurMF);
+ if (!Res.isValid()) {
+ unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32;
+ LLT LLTy = LLT::scalar(32);
+ Res = CurMF->getRegInfo().createGenericVirtualRegister(LLTy);
+ if (MIRBuilder)
+ assignTypeToVReg(LLVMIntTy, Res, *MIRBuilder);
+ else
+ assignIntTypeToVReg(BitWidth, Res, *I, *TII);
+ DT.add(CI, CurMF, Res);
+ NewInstr = true;
+ }
+ return std::make_tuple(Res, CI, NewInstr);
+}
+
+Register SPIRVGlobalRegistry::getOrCreateConstInt(uint64_t Val, MachineInstr &I,
+ SPIRVType *SpvType,
+ const SPIRVInstrInfo &TII) {
+ assert(SpvType);
+ ConstantInt *CI;
+ Register Res;
+ bool New;
+ std::tie(Res, CI, New) =
+ getOrCreateConstIntReg(Val, SpvType, nullptr, &I, &TII);
+ // If the found Res register is the one defined by the passed G_CONSTANT
+ // machine instruction, a new constant instruction still has to be created.
+ if (!New && (!I.getOperand(0).isReg() || Res != I.getOperand(0).getReg()))
+ return Res;
+ MachineInstrBuilder MIB;
+ MachineBasicBlock &BB = *I.getParent();
+ if (Val) {
+ MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI))
+ .addDef(Res)
+ .addUse(getSPIRVTypeID(SpvType));
+ addNumImm(APInt(getScalarOrVectorBitWidth(SpvType), Val), MIB);
+ } else {
+ MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
+ .addDef(Res)
+ .addUse(getSPIRVTypeID(SpvType));
+ }
+ const auto &ST = CurMF->getSubtarget();
+ constrainSelectedInstRegOperands(*MIB, *ST.getInstrInfo(),
+ *ST.getRegisterInfo(), *ST.getRegBankInfo());
+ return Res;
+}
+
Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val,
MachineIRBuilder &MIRBuilder,
SPIRVType *SpvType,
@@ -112,14 +189,32 @@ Register SPIRVGlobalRegistry::buildConstantInt(uint64_t Val,
Register Res = DT.find(ConstInt, &MF);
if (!Res.isValid()) {
unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32;
- Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth));
- assignTypeToVReg(LLVMIntTy, Res, MIRBuilder);
- if (EmitIR)
+ LLT LLTy = LLT::scalar(EmitIR ? BitWidth : 32);
+ Res = MF.getRegInfo().createGenericVirtualRegister(LLTy);
+ assignTypeToVReg(LLVMIntTy, Res, MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, EmitIR);
+ DT.add(ConstInt, &MIRBuilder.getMF(), Res);
+ if (EmitIR) {
MIRBuilder.buildConstant(Res, *ConstInt);
- else
- MIRBuilder.buildInstr(SPIRV::OpConstantI)
- .addDef(Res)
- .addImm(ConstInt->getSExtValue());
+ } else {
+ MachineInstrBuilder MIB;
+ if (Val) {
+ assert(SpvType);
+ MIB = MIRBuilder.buildInstr(SPIRV::OpConstantI)
+ .addDef(Res)
+ .addUse(getSPIRVTypeID(SpvType));
+ addNumImm(APInt(BitWidth, Val), MIB);
+ } else {
+ assert(SpvType);
+ MIB = MIRBuilder.buildInstr(SPIRV::OpConstantNull)
+ .addDef(Res)
+ .addUse(getSPIRVTypeID(SpvType));
+ }
+ const auto &Subtarget = CurMF->getSubtarget();
+ constrainSelectedInstRegOperands(*MIB, *Subtarget.getInstrInfo(),
+ *Subtarget.getRegisterInfo(),
+ *Subtarget.getRegBankInfo());
+ }
}
return Res;
}
@@ -142,11 +237,63 @@ Register SPIRVGlobalRegistry::buildConstantFP(APFloat Val,
unsigned BitWidth = SpvType ? getScalarOrVectorBitWidth(SpvType) : 32;
Res = MF.getRegInfo().createGenericVirtualRegister(LLT::scalar(BitWidth));
assignTypeToVReg(LLVMFPTy, Res, MIRBuilder);
+ DT.add(ConstFP, &MF, Res);
MIRBuilder.buildFConstant(Res, *ConstFP);
}
return Res;
}
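+// Finds in DT, or builds, an integer vector constant splatting Val across all
+// elements of SpvType; a zero splat is emitted as OpConstantNull rather than
+// an OpConstantComposite of zeros.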
+Register
+SPIRVGlobalRegistry::getOrCreateConsIntVector(uint64_t Val, MachineInstr &I,
+ SPIRVType *SpvType,
+ const SPIRVInstrInfo &TII) {
+ const Type *LLVMTy = getTypeForSPIRVType(SpvType);
+ assert(LLVMTy->isVectorTy());
+ const FixedVectorType *LLVMVecTy = cast<FixedVectorType>(LLVMTy);
+ Type *LLVMBaseTy = LLVMVecTy->getElementType();
+ // Find a constant vector in DT or build a new one.
+ const auto ConstInt = ConstantInt::get(LLVMBaseTy, Val);
+ auto ConstVec =
+ ConstantVector::getSplat(LLVMVecTy->getElementCount(), ConstInt);
+ Register Res = DT.find(ConstVec, CurMF);
+ if (!Res.isValid()) {
+ unsigned BitWidth = getScalarOrVectorBitWidth(SpvType);
+ SPIRVType *SpvBaseType = getOrCreateSPIRVIntegerType(BitWidth, I, TII);
+ // SpvScalConst should be created before SpvVecConst to avoid an
+ // undefined-ID error during validation.
+ // TODO: this can be moved below once sorting of types/consts/defs is
+ // implemented.
+ Register SpvScalConst;
+ if (Val)
+ SpvScalConst = getOrCreateConstInt(Val, I, SpvBaseType, TII);
+ // TODO: maybe use bitwidth of base type.
+ LLT LLTy = LLT::scalar(32);
+ Register SpvVecConst =
+ CurMF->getRegInfo().createGenericVirtualRegister(LLTy);
+ const unsigned ElemCnt = SpvType->getOperand(2).getImm();
+ assignVectTypeToVReg(SpvBaseType, ElemCnt, SpvVecConst, I, TII);
+ DT.add(ConstVec, CurMF, SpvVecConst);
+ MachineInstrBuilder MIB;
+ MachineBasicBlock &BB = *I.getParent();
+ if (Val) {
+ MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantComposite))
+ .addDef(SpvVecConst)
+ .addUse(getSPIRVTypeID(SpvType));
+ for (unsigned i = 0; i < ElemCnt; ++i)
+ MIB.addUse(SpvScalConst);
+ } else {
+ MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
+ .addDef(SpvVecConst)
+ .addUse(getSPIRVTypeID(SpvType));
+ }
+ const auto &Subtarget = CurMF->getSubtarget();
+ constrainSelectedInstRegOperands(*MIB, *Subtarget.getInstrInfo(),
+ *Subtarget.getRegisterInfo(),
+ *Subtarget.getRegBankInfo());
+ return SpvVecConst;
+ }
+ return Res;
+}
+
Register SPIRVGlobalRegistry::buildGlobalVariable(
Register ResVReg, SPIRVType *BaseType, StringRef Name,
const GlobalValue *GV, SPIRV::StorageClass Storage,
@@ -169,7 +316,13 @@ Register SPIRVGlobalRegistry::buildGlobalVariable(
}
GV = GVar;
}
- Register Reg;
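+ // If this global has already been materialized in the current function,
+ // reuse its register (copying into ResVReg if needed) instead of emitting a
+ // second OpVariable.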
+ Register Reg = DT.find(GVar, &MIRBuilder.getMF());
+ if (Reg.isValid()) {
+ if (Reg != ResVReg)
+ MIRBuilder.buildCopy(ResVReg, Reg);
+ return ResVReg;
+ }
+
auto MIB = MIRBuilder.buildInstr(SPIRV::OpVariable)
.addDef(ResVReg)
.addUse(getSPIRVTypeID(BaseType))
@@ -234,14 +387,76 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems,
return MIB;
}
+SPIRVType *SPIRVGlobalRegistry::getOpTypeOpaque(const StructType *Ty,
+ MachineIRBuilder &MIRBuilder) {
+ assert(Ty->hasName());
+ const StringRef Name = Ty->hasName() ? Ty->getName() : "";
+ Register ResVReg = createTypeVReg(MIRBuilder);
+ auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeOpaque).addDef(ResVReg);
+ addStringImm(Name, MIB);
+ buildOpName(ResVReg, Name, MIRBuilder);
+ return MIB;
+}
+
+SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(const StructType *Ty,
+ MachineIRBuilder &MIRBuilder,
+ bool EmitIR) {
+ SmallVector<Register, 4> FieldTypes;
+ for (const auto &Elem : Ty->elements()) {
+ SPIRVType *ElemTy = findSPIRVType(Elem, MIRBuilder);
+ assert(ElemTy && ElemTy->getOpcode() != SPIRV::OpTypeVoid &&
+ "Invalid struct element type");
+ FieldTypes.push_back(getSPIRVTypeID(ElemTy));
+ }
+ Register ResVReg = createTypeVReg(MIRBuilder);
+ auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeStruct).addDef(ResVReg);
+ for (const auto &Ty : FieldTypes)
+ MIB.addUse(Ty);
+ if (Ty->hasName())
+ buildOpName(ResVReg, Ty->getName(), MIRBuilder);
+ if (Ty->isPacked())
+ buildOpDecorate(ResVReg, MIRBuilder, SPIRV::Decoration::CPacked, {});
+ return MIB;
+}
+
+static bool isOpenCLBuiltinType(const StructType *SType) {
+ return SType->isOpaque() && SType->hasName() &&
+ SType->getName().startswith("opencl.");
+}
+
+static bool isSPIRVBuiltinType(const StructType *SType) {
+ return SType->isOpaque() && SType->hasName() &&
+ SType->getName().startswith("spirv.");
+}
+
+static bool isSpecialType(const Type *Ty) {
+ if (auto PType = dyn_cast<PointerType>(Ty)) {
+ if (!PType->isOpaque())
+ Ty = PType->getNonOpaquePointerElementType();
+ }
+ if (auto SType = dyn_cast<StructType>(Ty))
+ return isOpenCLBuiltinType(SType) || isSPIRVBuiltinType(SType);
+ return false;
+}
+
SPIRVType *SPIRVGlobalRegistry::getOpTypePointer(SPIRV::StorageClass SC,
SPIRVType *ElemType,
- MachineIRBuilder &MIRBuilder) {
- auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypePointer)
- .addDef(createTypeVReg(MIRBuilder))
- .addImm(static_cast<uint32_t>(SC))
- .addUse(getSPIRVTypeID(ElemType));
- return MIB;
+ MachineIRBuilder &MIRBuilder,
+ Register Reg) {
+ if (!Reg.isValid())
+ Reg = createTypeVReg(MIRBuilder);
+ return MIRBuilder.buildInstr(SPIRV::OpTypePointer)
+ .addDef(Reg)
+ .addImm(static_cast<uint32_t>(SC))
+ .addUse(getSPIRVTypeID(ElemType));
+}
+
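+// Note that OpTypeForwardPointer has no result id of its own: its first
+// operand is the id that the eventual OpTypePointer will define, which is why
+// getSPIRVTypeID special-cases this opcode.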
+SPIRVType *
+SPIRVGlobalRegistry::getOpTypeForwardPointer(SPIRV::StorageClass SC,
+ MachineIRBuilder &MIRBuilder) {
+ return MIRBuilder.buildInstr(SPIRV::OpTypeForwardPointer)
+ .addUse(createTypeVReg(MIRBuilder))
+ .addImm(static_cast<uint32_t>(SC));
}
SPIRVType *SPIRVGlobalRegistry::getOpTypeFunction(
@@ -255,10 +470,49 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeFunction(
return MIB;
}
+SPIRVType *SPIRVGlobalRegistry::getOrCreateOpTypeFunctionWithArgs(
+ const Type *Ty, SPIRVType *RetType,
+ const SmallVectorImpl<SPIRVType *> &ArgTypes,
+ MachineIRBuilder &MIRBuilder) {
+ Register Reg = DT.find(Ty, &MIRBuilder.getMF());
+ if (Reg.isValid())
+ return getSPIRVTypeForVReg(Reg);
+ SPIRVType *SpirvType = getOpTypeFunction(RetType, ArgTypes, MIRBuilder);
+ return finishCreatingSPIRVType(Ty, SpirvType);
+}
+
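+// Helper used while a type is being constructed: returns the SPIR-V type
+// already registered in DT, a pending OpTypeForwardPointer if one was created
+// for this type, or otherwise recursively creates the type.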
+SPIRVType *SPIRVGlobalRegistry::findSPIRVType(const Type *Ty,
+ MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier AccQual,
+ bool EmitIR) {
+ Register Reg = DT.find(Ty, &MIRBuilder.getMF());
+ if (Reg.isValid())
+ return getSPIRVTypeForVReg(Reg);
+ if (ForwardPointerTypes.find(Ty) != ForwardPointerTypes.end())
+ return ForwardPointerTypes[Ty];
+ return restOfCreateSPIRVType(Ty, MIRBuilder, AccQual, EmitIR);
+}
+
+Register SPIRVGlobalRegistry::getSPIRVTypeID(const SPIRVType *SpirvType) const {
+ assert(SpirvType && "Attempting to get type id for nullptr type.");
+ if (SpirvType->getOpcode() == SPIRV::OpTypeForwardPointer)
+ return SpirvType->uses().begin()->getReg();
+ return SpirvType->defs().begin()->getReg();
+}
+
SPIRVType *SPIRVGlobalRegistry::createSPIRVType(const Type *Ty,
MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier AccQual,
bool EmitIR) {
+ assert(!isSpecialType(Ty));
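+ // Before creating a new type, check whether this LLVM type has already been
+ // translated for the current machine function and reuse it if so.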
+ auto &TypeToSPIRVTypeMap = DT.getTypes()->getAllUses();
+ auto t = TypeToSPIRVTypeMap.find(Ty);
+ if (t != TypeToSPIRVTypeMap.end()) {
+ auto tt = t->second.find(&MIRBuilder.getMF());
+ if (tt != t->second.end())
+ return getSPIRVTypeForVReg(tt->second);
+ }
+
if (auto IType = dyn_cast<IntegerType>(Ty)) {
const unsigned Width = IType->getBitWidth();
return Width == 1 ? getOpTypeBool(MIRBuilder)
@@ -269,21 +523,25 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(const Type *Ty,
if (Ty->isVoidTy())
return getOpTypeVoid(MIRBuilder);
if (Ty->isVectorTy()) {
- auto El = getOrCreateSPIRVType(cast<FixedVectorType>(Ty)->getElementType(),
- MIRBuilder);
+ SPIRVType *El =
+ findSPIRVType(cast<FixedVectorType>(Ty)->getElementType(), MIRBuilder);
return getOpTypeVector(cast<FixedVectorType>(Ty)->getNumElements(), El,
MIRBuilder);
}
if (Ty->isArrayTy()) {
- auto *El = getOrCreateSPIRVType(Ty->getArrayElementType(), MIRBuilder);
+ SPIRVType *El = findSPIRVType(Ty->getArrayElementType(), MIRBuilder);
return getOpTypeArray(Ty->getArrayNumElements(), El, MIRBuilder, EmitIR);
}
- assert(!isa<StructType>(Ty) && "Unsupported StructType");
+ if (auto SType = dyn_cast<StructType>(Ty)) {
+ if (SType->isOpaque())
+ return getOpTypeOpaque(SType, MIRBuilder);
+ return getOpTypeStruct(SType, MIRBuilder, EmitIR);
+ }
if (auto FType = dyn_cast<FunctionType>(Ty)) {
- SPIRVType *RetTy = getOrCreateSPIRVType(FType->getReturnType(), MIRBuilder);
+ SPIRVType *RetTy = findSPIRVType(FType->getReturnType(), MIRBuilder);
SmallVector<SPIRVType *, 4> ParamTypes;
for (const auto &t : FType->params()) {
- ParamTypes.push_back(getOrCreateSPIRVType(t, MIRBuilder));
+ ParamTypes.push_back(findSPIRVType(t, MIRBuilder));
}
return getOpTypeFunction(RetTy, ParamTypes, MIRBuilder);
}
@@ -292,24 +550,51 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(const Type *Ty,
// At the moment, all opaque pointers correspond to i8 element type.
// TODO: change the implementation once opaque pointers are supported
// in the SPIR-V specification.
- if (PType->isOpaque()) {
+ if (PType->isOpaque())
SpvElementType = getOrCreateSPIRVIntegerType(8, MIRBuilder);
- } else {
- Type *ElemType = PType->getNonOpaquePointerElementType();
- // TODO: support OpenCL and SPIRV builtins like image2d_t that are passed
- // as pointers, but should be treated as custom types like OpTypeImage.
- assert(!isa<StructType>(ElemType) && "Unsupported StructType pointer");
-
- // Otherwise, treat it as a regular pointer type.
- SpvElementType = getOrCreateSPIRVType(
- ElemType, MIRBuilder, SPIRV::AccessQualifier::ReadWrite, EmitIR);
- }
+ else
+ SpvElementType =
+ findSPIRVType(PType->getNonOpaquePointerElementType(), MIRBuilder,
+ SPIRV::AccessQualifier::ReadWrite, EmitIR);
auto SC = addressSpaceToStorageClass(PType->getAddressSpace());
- return getOpTypePointer(SC, SpvElementType, MIRBuilder);
+ // A null element type means we have hit a loop in the type definitions;
+ // make and return the corresponding OpTypeForwardPointer.
+ if (SpvElementType == nullptr) {
+ if (ForwardPointerTypes.find(Ty) == ForwardPointerTypes.end())
+ ForwardPointerTypes[PType] = getOpTypeForwardPointer(SC, MIRBuilder);
+ return ForwardPointerTypes[PType];
+ }
+ Register Reg(0);
+ // If a forward pointer is associated with this type, reuse its register
+ // operand when creating the OpTypePointer.
+ if (ForwardPointerTypes.find(PType) != ForwardPointerTypes.end())
+ Reg = getSPIRVTypeID(ForwardPointerTypes[PType]);
+
+ return getOpTypePointer(SC, SpvElementType, MIRBuilder, Reg);
}
llvm_unreachable("Unable to convert LLVM type to SPIRVType");
}
+SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(
+ const Type *Ty, MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier AccessQual, bool EmitIR) {
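+ // Returning nullptr for a non-pointer type that is already being processed
+ // signals a cycle; the pointer handling in createSPIRVType then falls back
+ // to an OpTypeForwardPointer.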
+ if (TypesInProcessing.count(Ty) && !Ty->isPointerTy())
+ return nullptr;
+ TypesInProcessing.insert(Ty);
+ SPIRVType *SpirvType = createSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR);
+ TypesInProcessing.erase(Ty);
+ VRegToTypeMap[&MIRBuilder.getMF()][getSPIRVTypeID(SpirvType)] = SpirvType;
+ SPIRVToLLVMType[SpirvType] = Ty;
+ Register Reg = DT.find(Ty, &MIRBuilder.getMF());
+ // Do not add OpTypeForwardPointer to DT; the corresponding normal pointer
+ // type will be added later. Special types have already been added to DT.
+ if (SpirvType->getOpcode() != SPIRV::OpTypeForwardPointer && !Reg.isValid() &&
+ !isSpecialType(Ty))
+ DT.add(Ty, &MIRBuilder.getMF(), getSPIRVTypeID(SpirvType));
+
+ return SpirvType;
+}
+
SPIRVType *SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg) const {
auto t = VRegToTypeMap.find(CurMF);
if (t != VRegToTypeMap.end()) {
@@ -321,13 +606,26 @@ SPIRVType *SPIRVGlobalRegistry::getSPIRVTypeForVReg(Register VReg) const {
}
SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(
- const Type *Type, MachineIRBuilder &MIRBuilder,
+ const Type *Ty, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier AccessQual, bool EmitIR) {
- Register Reg = DT.find(Type, &MIRBuilder.getMF());
+ Register Reg = DT.find(Ty, &MIRBuilder.getMF());
if (Reg.isValid())
return getSPIRVTypeForVReg(Reg);
- SPIRVType *SpirvType = createSPIRVType(Type, MIRBuilder, AccessQual, EmitIR);
- return restOfCreateSPIRVType(Type, SpirvType);
+ TypesInProcessing.clear();
+ SPIRVType *STy = restOfCreateSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR);
+ // Create normal pointer types for the corresponding OpTypeForwardPointers.
+ for (auto &CU : ForwardPointerTypes) {
+ const Type *Ty2 = CU.first;
+ SPIRVType *STy2 = CU.second;
+ if ((Reg = DT.find(Ty2, &MIRBuilder.getMF())).isValid())
+ STy2 = getSPIRVTypeForVReg(Reg);
+ else
+ STy2 = restOfCreateSPIRVType(Ty2, MIRBuilder, AccessQual, EmitIR);
+ if (Ty == Ty2)
+ STy = STy2;
+ }
+ ForwardPointerTypes.clear();
+ return STy;
}
bool SPIRVGlobalRegistry::isScalarOfType(Register VReg,
@@ -393,8 +691,8 @@ SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(unsigned BitWidth,
MIRBuilder);
}
-SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(const Type *LLVMTy,
- SPIRVType *SpirvType) {
+SPIRVType *SPIRVGlobalRegistry::finishCreatingSPIRVType(const Type *LLVMTy,
+ SPIRVType *SpirvType) {
assert(CurMF == SpirvType->getMF());
VRegToTypeMap[CurMF][getSPIRVTypeID(SpirvType)] = SpirvType;
SPIRVToLLVMType[SpirvType] = LLVMTy;
@@ -413,7 +711,7 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVIntegerType(
.addDef(createTypeVReg(CurMF->getRegInfo()))
.addImm(BitWidth)
.addImm(0);
- return restOfCreateSPIRVType(LLVMTy, MIB);
+ return finishCreatingSPIRVType(LLVMTy, MIB);
}
SPIRVType *
@@ -423,6 +721,19 @@ SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder) {
MIRBuilder);
}
+SPIRVType *
+SPIRVGlobalRegistry::getOrCreateSPIRVBoolType(MachineInstr &I,
+ const SPIRVInstrInfo &TII) {
+ Type *LLVMTy = IntegerType::get(CurMF->getFunction().getContext(), 1);
+ Register Reg = DT.find(LLVMTy, CurMF);
+ if (Reg.isValid())
+ return getSPIRVTypeForVReg(Reg);
+ MachineBasicBlock &BB = *I.getParent();
+ auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeBool))
+ .addDef(createTypeVReg(CurMF->getRegInfo()));
+ return finishCreatingSPIRVType(LLVMTy, MIB);
+}
+
SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
SPIRVType *BaseType, unsigned NumElements, MachineIRBuilder &MIRBuilder) {
return getOrCreateSPIRVType(
@@ -436,12 +747,15 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
const SPIRVInstrInfo &TII) {
Type *LLVMTy = FixedVectorType::get(
const_cast<Type *>(getTypeForSPIRVType(BaseType)), NumElements);
+ Register Reg = DT.find(LLVMTy, CurMF);
+ if (Reg.isValid())
+ return getSPIRVTypeForVReg(Reg);
MachineBasicBlock &BB = *I.getParent();
auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypeVector))
.addDef(createTypeVReg(CurMF->getRegInfo()))
.addUse(getSPIRVTypeID(BaseType))
.addImm(NumElements);
- return restOfCreateSPIRVType(LLVMTy, MIB);
+ return finishCreatingSPIRVType(LLVMTy, MIB);
}
SPIRVType *
@@ -460,10 +774,39 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVPointerType(
Type *LLVMTy =
PointerType::get(const_cast<Type *>(getTypeForSPIRVType(BaseType)),
storageClassToAddressSpace(SC));
+ Register Reg = DT.find(LLVMTy, CurMF);
+ if (Reg.isValid())
+ return getSPIRVTypeForVReg(Reg);
MachineBasicBlock &BB = *I.getParent();
auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpTypePointer))
.addDef(createTypeVReg(CurMF->getRegInfo()))
.addImm(static_cast<uint32_t>(SC))
.addUse(getSPIRVTypeID(BaseType));
- return restOfCreateSPIRVType(LLVMTy, MIB);
+ return finishCreatingSPIRVType(LLVMTy, MIB);
+}
+
+Register SPIRVGlobalRegistry::getOrCreateUndef(MachineInstr &I,
+ SPIRVType *SpvType,
+ const SPIRVInstrInfo &TII) {
+ assert(SpvType);
+ const Type *LLVMTy = getTypeForSPIRVType(SpvType);
+ assert(LLVMTy);
+ // Find a constant in DT or build a new one.
+ UndefValue *UV = UndefValue::get(const_cast<Type *>(LLVMTy));
+ Register Res = DT.find(UV, CurMF);
+ if (Res.isValid())
+ return Res;
+ LLT LLTy = LLT::scalar(32);
+ Res = CurMF->getRegInfo().createGenericVirtualRegister(LLTy);
+ assignSPIRVTypeToVReg(SpvType, Res, *CurMF);
+ DT.add(UV, CurMF, Res);
+
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpUndef))
+ .addDef(Res)
+ .addUse(getSPIRVTypeID(SpvType));
+ const auto &ST = CurMF->getSubtarget();
+ constrainSelectedInstRegOperands(*MIB, *ST.getInstrInfo(),
+ *ST.getRegisterInfo(), *ST.getRegBankInfo());
+ return Res;
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index 13dcc20a3e0a..59ac2712a02f 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -30,7 +30,7 @@ class SPIRVGlobalRegistry {
// Do not confuse this with DuplicatesTracker as DT maps Type* to <MF, Reg>
// where Reg = OpType...
// while VRegToTypeMap tracks SPIR-V type assigned to other regs (i.e. not
- // type-declaring ones)
+ // type-declaring ones).
DenseMap<const MachineFunction *, DenseMap<Register, SPIRVType *>>
VRegToTypeMap;
@@ -38,6 +38,9 @@ class SPIRVGlobalRegistry {
DenseMap<SPIRVType *, const Type *> SPIRVToLLVMType;
+ SmallPtrSet<const Type *, 4> TypesInProcessing;
+ DenseMap<const Type *, SPIRVType *> ForwardPointerTypes;
+
// Number of bits pointers and size_t integers require.
const unsigned PointerSize;
@@ -46,6 +49,14 @@ class SPIRVGlobalRegistry {
createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier AQ = SPIRV::AccessQualifier::ReadWrite,
bool EmitIR = true);
+ SPIRVType *findSPIRVType(
+ const Type *Ty, MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier accessQual = SPIRV::AccessQualifier::ReadWrite,
+ bool EmitIR = true);
+ SPIRVType *restOfCreateSPIRVType(const Type *Type,
+ MachineIRBuilder &MIRBuilder,
+ SPIRV::AccessQualifier AccessQual,
+ bool EmitIR);
public:
SPIRVGlobalRegistry(unsigned PointerSize);
@@ -91,6 +102,11 @@ public:
const Type *Type, Register VReg, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier AQ = SPIRV::AccessQualifier::ReadWrite,
bool EmitIR = true);
+ SPIRVType *assignIntTypeToVReg(unsigned BitWidth, Register VReg,
+ MachineInstr &I, const SPIRVInstrInfo &TII);
+ SPIRVType *assignVectTypeToVReg(SPIRVType *BaseType, unsigned NumElements,
+ Register VReg, MachineInstr &I,
+ const SPIRVInstrInfo &TII);
// In cases where the SPIR-V type is already known, this function can be
// used to map it to the given VReg via an ASSIGN_TYPE instruction.
@@ -123,10 +139,7 @@ public:
}
// Return the VReg holding the result of the given OpTypeXXX instruction.
- Register getSPIRVTypeID(const SPIRVType *SpirvType) const {
- assert(SpirvType && "Attempting to get type id for nullptr type.");
- return SpirvType->defs().begin()->getReg();
- }
+ Register getSPIRVTypeID(const SPIRVType *SpirvType) const;
void setCurrentFunc(MachineFunction &MF) { CurMF = &MF; }
@@ -167,19 +180,38 @@ private:
SPIRVType *getOpTypeArray(uint32_t NumElems, SPIRVType *ElemType,
MachineIRBuilder &MIRBuilder, bool EmitIR = true);
+ SPIRVType *getOpTypeOpaque(const StructType *Ty,
+ MachineIRBuilder &MIRBuilder);
+
+ SPIRVType *getOpTypeStruct(const StructType *Ty, MachineIRBuilder &MIRBuilder,
+ bool EmitIR = true);
+
SPIRVType *getOpTypePointer(SPIRV::StorageClass SC, SPIRVType *ElemType,
- MachineIRBuilder &MIRBuilder);
+ MachineIRBuilder &MIRBuilder, Register Reg);
+
+ SPIRVType *getOpTypeForwardPointer(SPIRV::StorageClass SC,
+ MachineIRBuilder &MIRBuilder);
SPIRVType *getOpTypeFunction(SPIRVType *RetType,
const SmallVectorImpl<SPIRVType *> &ArgTypes,
MachineIRBuilder &MIRBuilder);
- SPIRVType *restOfCreateSPIRVType(const Type *LLVMTy, SPIRVType *SpirvType);
+ std::tuple<Register, ConstantInt *, bool> getOrCreateConstIntReg(
+ uint64_t Val, SPIRVType *SpvType, MachineIRBuilder *MIRBuilder,
+ MachineInstr *I = nullptr, const SPIRVInstrInfo *TII = nullptr);
+ SPIRVType *finishCreatingSPIRVType(const Type *LLVMTy, SPIRVType *SpirvType);
public:
Register buildConstantInt(uint64_t Val, MachineIRBuilder &MIRBuilder,
SPIRVType *SpvType = nullptr, bool EmitIR = true);
+ Register getOrCreateConstInt(uint64_t Val, MachineInstr &I,
+ SPIRVType *SpvType, const SPIRVInstrInfo &TII);
Register buildConstantFP(APFloat Val, MachineIRBuilder &MIRBuilder,
SPIRVType *SpvType = nullptr);
+ Register getOrCreateConsIntVector(uint64_t Val, MachineInstr &I,
+ SPIRVType *SpvType,
+ const SPIRVInstrInfo &TII);
+ Register getOrCreateUndef(MachineInstr &I, SPIRVType *SpvType,
+ const SPIRVInstrInfo &TII);
Register
buildGlobalVariable(Register Reg, SPIRVType *BaseType, StringRef Name,
const GlobalValue *GV, SPIRV::StorageClass Storage,
@@ -193,19 +225,24 @@ public:
SPIRVType *getOrCreateSPIRVIntegerType(unsigned BitWidth, MachineInstr &I,
const SPIRVInstrInfo &TII);
SPIRVType *getOrCreateSPIRVBoolType(MachineIRBuilder &MIRBuilder);
+ SPIRVType *getOrCreateSPIRVBoolType(MachineInstr &I,
+ const SPIRVInstrInfo &TII);
SPIRVType *getOrCreateSPIRVVectorType(SPIRVType *BaseType,
unsigned NumElements,
MachineIRBuilder &MIRBuilder);
SPIRVType *getOrCreateSPIRVVectorType(SPIRVType *BaseType,
unsigned NumElements, MachineInstr &I,
const SPIRVInstrInfo &TII);
-
SPIRVType *getOrCreateSPIRVPointerType(
SPIRVType *BaseType, MachineIRBuilder &MIRBuilder,
SPIRV::StorageClass SClass = SPIRV::StorageClass::Function);
SPIRVType *getOrCreateSPIRVPointerType(
SPIRVType *BaseType, MachineInstr &I, const SPIRVInstrInfo &TII,
SPIRV::StorageClass SClass = SPIRV::StorageClass::Function);
+ SPIRVType *getOrCreateOpTypeFunctionWithArgs(
+ const Type *Ty, SPIRVType *RetType,
+ const SmallVectorImpl<SPIRVType *> &ArgTypes,
+ MachineIRBuilder &MIRBuilder);
};
} // end namespace llvm
#endif // LLLVM_LIB_TARGET_SPIRV_SPIRVTYPEMANAGER_H
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
index 754906308114..66d8b17b4296 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp
@@ -52,7 +52,7 @@ bool SPIRVInstrInfo::isTypeDeclInstr(const MachineInstr &MI) const {
auto DefRegClass = MRI.getRegClassOrNull(MI.getOperand(0).getReg());
return DefRegClass && DefRegClass->getID() == SPIRV::TYPERegClass.getID();
} else {
- return false;
+ return MI.getOpcode() == SPIRV::OpTypeForwardPointer;
}
}
@@ -193,3 +193,15 @@ void SPIRVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
auto &MRI = I->getMF()->getRegInfo();
MRI.replaceRegWith(DstOp.getReg(), SrcOp.getReg());
}
+
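+// Fold the GET_* pseudos away after register allocation by rewriting all uses
+// of their result to the source operand's register and erasing the pseudo.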
+bool SPIRVInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+ if (MI.getOpcode() == SPIRV::GET_ID || MI.getOpcode() == SPIRV::GET_fID ||
+ MI.getOpcode() == SPIRV::GET_pID || MI.getOpcode() == SPIRV::GET_vfID ||
+ MI.getOpcode() == SPIRV::GET_vID) {
+ auto &MRI = MI.getMF()->getRegInfo();
+ MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
+ MI.eraseFromParent();
+ return true;
+ }
+ return false;
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
index 2600d9cfca2e..334351c8eeae 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.h
@@ -48,6 +48,7 @@ public:
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
+ bool expandPostRAPseudo(MachineInstr &MI) const override;
};
} // namespace llvm
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
index d6fec5fd0785..d1c20795f804 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
+++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td
@@ -449,6 +449,7 @@ def OpCopyLogical: UnOp<"OpCopyLogical", 400>;
def OpSNegate: UnOp<"OpSNegate", 126>;
def OpFNegate: UnOpTyped<"OpFNegate", 127, fID, fneg>;
+def OpFNegateV: UnOpTyped<"OpFNegate", 127, vfID, fneg>;
defm OpIAdd: BinOpTypedGen<"OpIAdd", 128, add, 0, 1>;
defm OpFAdd: BinOpTypedGen<"OpFAdd", 129, fadd, 1, 1>;
@@ -618,8 +619,10 @@ def OpAtomicCompareExchange: Op<230, (outs ID:$res),
(ins TYPE:$ty, ID:$ptr, ID:$sc, ID:$eq,
ID:$neq, ID:$val, ID:$cmp),
"$res = OpAtomicCompareExchange $ty $ptr $sc $eq $neq $val $cmp">;
-// TODO Currently the following deprecated opcode is missing:
-// OpAtomicCompareExchangeWeak
+def OpAtomicCompareExchangeWeak: Op<231, (outs ID:$res),
+ (ins TYPE:$ty, ID:$ptr, ID:$sc, ID:$eq,
+ ID:$neq, ID:$val, ID:$cmp),
+ "$res = OpAtomicCompareExchangeWeak $ty $ptr $sc $eq $neq $val $cmp">;
def OpAtomicIIncrement: AtomicOp<"OpAtomicIIncrement", 232>;
def OpAtomicIDecrement: AtomicOp<"OpAtomicIDecrement", 233>;
@@ -660,6 +663,11 @@ def OpMemoryNamedBarrier: Op<329, (outs), (ins ID:$barr, ID:$mem, ID:$sem),
// 3.42.21. Group and Subgroup Instructions
+def OpGroupAsyncCopy: Op<259, (outs ID:$res), (ins TYPE:$ty, ID:$scope,
+ ID:$dst, ID:$src, ID:$nelts, ID:$stride, ID:$event),
+ "$res = OpGroupAsyncCopy $ty $scope $dst $src $nelts $stride $event">;
+def OpGroupWaitEvents: Op<260, (outs), (ins ID:$scope, ID:$nelts, ID:$elist),
+ "OpGroupWaitEvents $scope $nelts $elist">;
def OpGroupAll: Op<261, (outs ID:$res), (ins TYPE:$ty, ID:$scope, ID:$pr),
"$res = OpGroupAll $ty $scope $pr">;
def OpGroupAny: Op<262, (outs ID:$res), (ins TYPE:$ty, ID:$scope, ID:$pr),
@@ -680,6 +688,18 @@ def OpGroupUMax: OpGroup<"UMax", 270>;
def OpGroupSMax: OpGroup<"SMax", 271>;
// TODO: 3.42.22. Device-Side Enqueue Instructions
+def OpRetainEvent: Op<297, (outs), (ins ID:$event), "OpRetainEvent $event">;
+def OpReleaseEvent: Op<298, (outs), (ins ID:$event), "OpReleaseEvent $event">;
+def OpCreateUserEvent: Op<299, (outs ID:$res), (ins TYPE:$type),
+ "$res = OpCreateUserEvent $type">;
+def OpIsValidEvent: Op<300, (outs ID:$res), (ins TYPE:$type, ID:$event),
+ "$res = OpIsValidEvent $type $event ">;
+def OpSetUserEventStatus: Op<301, (outs), (ins ID:$event, ID:$status),
+ "OpSetUserEventStatus $event $status">;
+def OpCaptureEventProfilingInfo: Op<302, (outs),
+ (ins ID:$event, ID:$info, ID:$value),
+ "OpCaptureEventProfilingInfo $event $info $value">;
+
// TODO: 3.42.23. Pipe Instructions
// 3.42.24. Non-Uniform Instructions
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 90b921a06f21..9365fd22e4e7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -197,6 +197,8 @@ void SPIRVInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}
+static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI);
+
// Defined in SPIRVLegalizerInfo.cpp.
extern bool isTypeFoldingSupported(unsigned Opcode);
@@ -335,6 +337,30 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg,
return selectUnOp(ResVReg, ResType, I, SPIRV::OpBitcast);
case TargetOpcode::G_ADDRSPACE_CAST:
return selectAddrSpaceCast(ResVReg, ResType, I);
+ case TargetOpcode::G_PTR_ADD: {
+ // Currently, we get G_PTR_ADD only as a result of translating
+ // global variables, initialized with constant expressions like GV + Const
+ // (see test opencl/basic/progvar_prog_scope_init.ll).
+ // TODO: extend the handler once we have other cases.
+ assert(I.getOperand(1).isReg() && I.getOperand(2).isReg());
+ Register GV = I.getOperand(1).getReg();
+ MachineRegisterInfo::def_instr_iterator II = MRI->def_instr_begin(GV);
+ assert(((*II).getOpcode() == TargetOpcode::G_GLOBAL_VALUE ||
+ (*II).getOpcode() == TargetOpcode::COPY ||
+ (*II).getOpcode() == SPIRV::OpVariable) &&
+ isImm(I.getOperand(2), MRI));
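+ // Fold GV + Const into an OpSpecConstantOp whose operation is
+ // InBoundsPtrAccessChain, passing the global, the zero index built below and
+ // the original constant operand.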
+ Register Idx = buildZerosVal(GR.getOrCreateSPIRVIntegerType(32, I, TII), I);
+ MachineBasicBlock &BB = *I.getParent();
+ auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addImm(static_cast<uint32_t>(
+ SPIRV::Opcode::InBoundsPtrAccessChain))
+ .addUse(GV)
+ .addUse(Idx)
+ .addUse(I.getOperand(2).getReg());
+ return MIB.constrainAllUses(TII, TRI, RBI);
+ }
case TargetOpcode::G_ATOMICRMW_OR:
return selectAtomicRMW(ResVReg, ResType, I, SPIRV::OpAtomicOr);
@@ -387,23 +413,6 @@ bool SPIRVInstructionSelector::selectUnOp(Register ResVReg,
Opcode);
}
-static SPIRV::MemorySemantics getMemSemantics(AtomicOrdering Ord) {
- switch (Ord) {
- case AtomicOrdering::Acquire:
- return SPIRV::MemorySemantics::Acquire;
- case AtomicOrdering::Release:
- return SPIRV::MemorySemantics::Release;
- case AtomicOrdering::AcquireRelease:
- return SPIRV::MemorySemantics::AcquireRelease;
- case AtomicOrdering::SequentiallyConsistent:
- return SPIRV::MemorySemantics::SequentiallyConsistent;
- case AtomicOrdering::Unordered:
- case AtomicOrdering::Monotonic:
- case AtomicOrdering::NotAtomic:
- return SPIRV::MemorySemantics::None;
- }
-}
-
static SPIRV::Scope getScope(SyncScope::ID Ord) {
switch (Ord) {
case SyncScope::SingleThread:
@@ -484,16 +493,15 @@ bool SPIRVInstructionSelector::selectMemOperation(Register ResVReg,
MachineInstr &I) const {
MachineBasicBlock &BB = *I.getParent();
auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCopyMemorySized))
- .addDef(I.getOperand(0).getReg())
+ .addUse(I.getOperand(0).getReg())
.addUse(I.getOperand(1).getReg())
.addUse(I.getOperand(2).getReg());
if (I.getNumMemOperands())
addMemoryOperands(*I.memoperands_begin(), MIB);
bool Result = MIB.constrainAllUses(TII, TRI, RBI);
- if (ResVReg.isValid() && ResVReg != MIB->getOperand(0).getReg()) {
+ if (ResVReg.isValid() && ResVReg != MIB->getOperand(0).getReg())
BuildMI(BB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), ResVReg)
.addUse(MIB->getOperand(0).getReg());
- }
return Result;
}
@@ -541,36 +549,71 @@ bool SPIRVInstructionSelector::selectFence(MachineInstr &I) const {
bool SPIRVInstructionSelector::selectAtomicCmpXchg(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
- assert(I.hasOneMemOperand());
- const MachineMemOperand *MemOp = *I.memoperands_begin();
- uint32_t Scope = static_cast<uint32_t>(getScope(MemOp->getSyncScopeID()));
- Register ScopeReg = buildI32Constant(Scope, I);
-
+ Register ScopeReg;
+ Register MemSemEqReg;
+ Register MemSemNeqReg;
Register Ptr = I.getOperand(2).getReg();
+ if (I.getOpcode() != TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS) {
+ assert(I.hasOneMemOperand());
+ const MachineMemOperand *MemOp = *I.memoperands_begin();
+ unsigned Scope = static_cast<uint32_t>(getScope(MemOp->getSyncScopeID()));
+ ScopeReg = buildI32Constant(Scope, I);
+
+ unsigned ScSem = static_cast<uint32_t>(
+ getMemSemanticsForStorageClass(GR.getPointerStorageClass(Ptr)));
+ AtomicOrdering AO = MemOp->getSuccessOrdering();
+ unsigned MemSemEq = static_cast<uint32_t>(getMemSemantics(AO)) | ScSem;
+ MemSemEqReg = buildI32Constant(MemSemEq, I);
+ AtomicOrdering FO = MemOp->getFailureOrdering();
+ unsigned MemSemNeq = static_cast<uint32_t>(getMemSemantics(FO)) | ScSem;
+ MemSemNeqReg =
+ MemSemEq == MemSemNeq ? MemSemEqReg : buildI32Constant(MemSemNeq, I);
+ } else {
+ ScopeReg = I.getOperand(5).getReg();
+ MemSemEqReg = I.getOperand(6).getReg();
+ MemSemNeqReg = I.getOperand(7).getReg();
+ }
+
Register Cmp = I.getOperand(3).getReg();
Register Val = I.getOperand(4).getReg();
-
SPIRVType *SpvValTy = GR.getSPIRVTypeForVReg(Val);
- SPIRV::StorageClass SC = GR.getPointerStorageClass(Ptr);
- uint32_t ScSem = static_cast<uint32_t>(getMemSemanticsForStorageClass(SC));
- AtomicOrdering AO = MemOp->getSuccessOrdering();
- uint32_t MemSemEq = static_cast<uint32_t>(getMemSemantics(AO)) | ScSem;
- Register MemSemEqReg = buildI32Constant(MemSemEq, I);
- AtomicOrdering FO = MemOp->getFailureOrdering();
- uint32_t MemSemNeq = static_cast<uint32_t>(getMemSemantics(FO)) | ScSem;
- Register MemSemNeqReg =
- MemSemEq == MemSemNeq ? MemSemEqReg : buildI32Constant(MemSemNeq, I);
+ Register ACmpRes = MRI->createVirtualRegister(&SPIRV::IDRegClass);
const DebugLoc &DL = I.getDebugLoc();
- return BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpAtomicCompareExchange))
- .addDef(ResVReg)
- .addUse(GR.getSPIRVTypeID(SpvValTy))
- .addUse(Ptr)
- .addUse(ScopeReg)
- .addUse(MemSemEqReg)
- .addUse(MemSemNeqReg)
- .addUse(Val)
- .addUse(Cmp)
- .constrainAllUses(TII, TRI, RBI);
+ bool Result =
+ BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpAtomicCompareExchange))
+ .addDef(ACmpRes)
+ .addUse(GR.getSPIRVTypeID(SpvValTy))
+ .addUse(Ptr)
+ .addUse(ScopeReg)
+ .addUse(MemSemEqReg)
+ .addUse(MemSemNeqReg)
+ .addUse(Val)
+ .addUse(Cmp)
+ .constrainAllUses(TII, TRI, RBI);
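+ // cmpxchg yields a {value, success} aggregate: derive the success flag by
+ // comparing the returned value with the expected one, then assemble the
+ // result struct with two OpCompositeInserts.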
+ Register CmpSuccReg = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ SPIRVType *BoolTy = GR.getOrCreateSPIRVBoolType(I, TII);
+ Result |= BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpIEqual))
+ .addDef(CmpSuccReg)
+ .addUse(GR.getSPIRVTypeID(BoolTy))
+ .addUse(ACmpRes)
+ .addUse(Cmp)
+ .constrainAllUses(TII, TRI, RBI);
+ Register TmpReg = MRI->createVirtualRegister(&SPIRV::IDRegClass);
+ Result |= BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpCompositeInsert))
+ .addDef(TmpReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(ACmpRes)
+ .addUse(GR.getOrCreateUndef(I, ResType, TII))
+ .addImm(0)
+ .constrainAllUses(TII, TRI, RBI);
+ Result |= BuildMI(*I.getParent(), I, DL, TII.get(SPIRV::OpCompositeInsert))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(CmpSuccReg)
+ .addUse(TmpReg)
+ .addImm(1)
+ .constrainAllUses(TII, TRI, RBI);
+ return Result;
}
static bool isGenericCastablePtr(SPIRV::StorageClass SC) {
@@ -592,6 +635,27 @@ static bool isGenericCastablePtr(SPIRV::StorageClass SC) {
bool SPIRVInstructionSelector::selectAddrSpaceCast(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
+ // If the AddrSpaceCast has a single user and that user is an
+ // OpConstantComposite, an OpVariable or an spv_init_global intrinsic,
+ // select OpSpecConstantOp instead.
+ auto UIs = MRI->use_instructions(ResVReg);
+ if (!UIs.empty() && ++UIs.begin() == UIs.end() &&
+ (UIs.begin()->getOpcode() == SPIRV::OpConstantComposite ||
+ UIs.begin()->getOpcode() == SPIRV::OpVariable ||
+ isSpvIntrinsic(*UIs.begin(), Intrinsic::spv_init_global))) {
+ Register NewReg = I.getOperand(1).getReg();
+ MachineBasicBlock &BB = *I.getParent();
+ SPIRVType *SpvBaseTy = GR.getOrCreateSPIRVIntegerType(8, I, TII);
+ ResType = GR.getOrCreateSPIRVPointerType(SpvBaseTy, I, TII,
+ SPIRV::StorageClass::Generic);
+ bool Result =
+ BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpSpecConstantOp))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addImm(static_cast<uint32_t>(SPIRV::Opcode::PtrCastToGeneric))
+ .addUse(NewReg)
+ .constrainAllUses(TII, TRI, RBI);
+ return Result;
+ }
Register SrcPtr = I.getOperand(1).getReg();
SPIRVType *SrcPtrTy = GR.getSPIRVTypeForVReg(SrcPtr);
SPIRV::StorageClass SrcSC = GR.getPointerStorageClass(SrcPtr);
@@ -842,7 +906,9 @@ bool SPIRVInstructionSelector::selectFCmp(Register ResVReg,
Register SPIRVInstructionSelector::buildZerosVal(const SPIRVType *ResType,
MachineInstr &I) const {
- return buildI32Constant(0, I, ResType);
+ if (ResType->getOpcode() == SPIRV::OpTypeVector)
+ return GR.getOrCreateConsIntVector(0, I, ResType, TII);
+ return GR.getOrCreateConstInt(0, I, ResType, TII);
}
Register SPIRVInstructionSelector::buildOnesVal(bool AllOnes,
@@ -851,20 +917,9 @@ Register SPIRVInstructionSelector::buildOnesVal(bool AllOnes,
unsigned BitWidth = GR.getScalarOrVectorBitWidth(ResType);
APInt One = AllOnes ? APInt::getAllOnesValue(BitWidth)
: APInt::getOneBitSet(BitWidth, 0);
- Register OneReg = buildI32Constant(One.getZExtValue(), I, ResType);
- if (ResType->getOpcode() == SPIRV::OpTypeVector) {
- const unsigned NumEles = ResType->getOperand(2).getImm();
- Register OneVec = MRI->createVirtualRegister(&SPIRV::IDRegClass);
- unsigned Opcode = SPIRV::OpConstantComposite;
- auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode))
- .addDef(OneVec)
- .addUse(GR.getSPIRVTypeID(ResType));
- for (unsigned i = 0; i < NumEles; ++i)
- MIB.addUse(OneReg);
- constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
- return OneVec;
- }
- return OneReg;
+ if (ResType->getOpcode() == SPIRV::OpTypeVector)
+ return GR.getOrCreateConsIntVector(One.getZExtValue(), I, ResType, TII);
+ return GR.getOrCreateConstInt(One.getZExtValue(), I, ResType, TII);
}
bool SPIRVInstructionSelector::selectSelect(Register ResVReg,
@@ -959,13 +1014,23 @@ bool SPIRVInstructionSelector::selectConst(Register ResVReg,
const SPIRVType *ResType,
const APInt &Imm,
MachineInstr &I) const {
- assert(ResType->getOpcode() != SPIRV::OpTypePointer || Imm.isNullValue());
+ unsigned TyOpcode = ResType->getOpcode();
+ assert(TyOpcode != SPIRV::OpTypePointer || Imm.isNullValue());
MachineBasicBlock &BB = *I.getParent();
- if (ResType->getOpcode() == SPIRV::OpTypePointer && Imm.isNullValue()) {
+ if ((TyOpcode == SPIRV::OpTypePointer || TyOpcode == SPIRV::OpTypeEvent) &&
+ Imm.isNullValue())
return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
.addDef(ResVReg)
.addUse(GR.getSPIRVTypeID(ResType))
.constrainAllUses(TII, TRI, RBI);
+ if (TyOpcode == SPIRV::OpTypeInt) {
+ Register Reg = GR.getOrCreateConstInt(Imm.getZExtValue(), I, ResType, TII);
+ if (Reg == ResVReg)
+ return true;
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY))
+ .addDef(ResVReg)
+ .addUse(Reg)
+ .constrainAllUses(TII, TRI, RBI);
}
auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantI))
.addDef(ResVReg)
@@ -1006,29 +1071,29 @@ bool SPIRVInstructionSelector::selectInsertVal(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
MachineBasicBlock &BB = *I.getParent();
- return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeInsert))
- .addDef(ResVReg)
- .addUse(GR.getSPIRVTypeID(ResType))
- // object to insert
- .addUse(I.getOperand(3).getReg())
- // composite to insert into
- .addUse(I.getOperand(2).getReg())
- // TODO: support arbitrary number of indices
- .addImm(foldImm(I.getOperand(4), MRI))
- .constrainAllUses(TII, TRI, RBI);
+ auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeInsert))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ // object to insert
+ .addUse(I.getOperand(3).getReg())
+ // composite to insert into
+ .addUse(I.getOperand(2).getReg());
+ for (unsigned i = 4; i < I.getNumOperands(); i++)
+ MIB.addImm(foldImm(I.getOperand(i), MRI));
+ return MIB.constrainAllUses(TII, TRI, RBI);
}
bool SPIRVInstructionSelector::selectExtractVal(Register ResVReg,
const SPIRVType *ResType,
MachineInstr &I) const {
MachineBasicBlock &BB = *I.getParent();
- return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract))
- .addDef(ResVReg)
- .addUse(GR.getSPIRVTypeID(ResType))
- .addUse(I.getOperand(2).getReg())
- // TODO: support arbitrary number of indices
- .addImm(foldImm(I.getOperand(3), MRI))
- .constrainAllUses(TII, TRI, RBI);
+ auto MIB = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract))
+ .addDef(ResVReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .addUse(I.getOperand(2).getReg());
+ for (unsigned i = 3; i < I.getNumOperands(); i++)
+ MIB.addImm(foldImm(I.getOperand(i), MRI));
+ return MIB.constrainAllUses(TII, TRI, RBI);
}
bool SPIRVInstructionSelector::selectInsertElt(Register ResVReg,
@@ -1154,6 +1219,9 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
}
return MIB.constrainAllUses(TII, TRI, RBI);
} break;
+ case Intrinsic::spv_cmpxchg:
+ return selectAtomicCmpXchg(ResVReg, ResType, I);
+ break;
default:
llvm_unreachable("Intrinsic selection not implemented");
}
@@ -1239,8 +1307,32 @@ bool SPIRVInstructionSelector::selectGlobalValue(
GV->getType(), MIRBuilder, SPIRV::AccessQualifier::ReadWrite, false);
std::string GlobalIdent = GV->getGlobalIdentifier();
- // TODO: suport @llvm.global.annotations.
+ // Functions can appear as operands in tests that use blocks, e.g. in
+ // transcoding/global_block.ll. These operands are not used and should be
+ // replaced with zero constants. Their type is always expected to be
+ // OpTypePointer Function %uchar.
+ if (isa<Function>(GV)) {
+ const Constant *ConstVal = GV;
+ MachineBasicBlock &BB = *I.getParent();
+ Register NewReg = GR.find(ConstVal, GR.CurMF);
+ if (!NewReg.isValid()) {
+ SPIRVType *SpvBaseTy = GR.getOrCreateSPIRVIntegerType(8, I, TII);
+ ResType = GR.getOrCreateSPIRVPointerType(SpvBaseTy, I, TII);
+ Register NewReg = ResVReg;
+ GR.add(ConstVal, GR.CurMF, NewReg);
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpConstantNull))
+ .addDef(NewReg)
+ .addUse(GR.getSPIRVTypeID(ResType))
+ .constrainAllUses(TII, TRI, RBI);
+ }
+ assert(NewReg != ResVReg);
+ return BuildMI(BB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY))
+ .addDef(ResVReg)
+ .addUse(NewReg)
+ .constrainAllUses(TII, TRI, RBI);
+ }
auto GlobalVar = cast<GlobalVariable>(GV);
+ assert(GlobalVar->getName() != "llvm.global.annotations");
bool HasInit = GlobalVar->hasInitializer() &&
!isa<UndefValue>(GlobalVar->getInitializer());
diff --git a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
index 8e4ab973bf07..8aaac50c94d7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVMCInstLower.cpp
@@ -45,7 +45,12 @@ void SPIRVMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI,
break;
}
case MachineOperand::MO_Immediate:
- MCOp = MCOperand::createImm(MO.getImm());
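+ // Operand 2 of OpExtInst selects the extended instruction set; lower it to
+ // the register reserved for that set in ExtInstSetMap rather than to an
+ // immediate.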
+ if (MI->getOpcode() == SPIRV::OpExtInst && i == 2) {
+ Register Reg = MAI->getExtInstSetReg(MO.getImm());
+ MCOp = MCOperand::createReg(Reg);
+ } else {
+ MCOp = MCOperand::createImm(MO.getImm());
+ }
break;
case MachineOperand::MO_FPImmediate:
MCOp = MCOperand::createDFPImm(
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index a39df5234935..143ddf7297dc 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -60,62 +60,50 @@ void SPIRVModuleAnalysis::setBaseInfo(const Module &M) {
MAI.InstrsToDelete.clear();
MAI.FuncNameMap.clear();
MAI.GlobalVarList.clear();
+ MAI.ExtInstSetMap.clear();
// TODO: determine memory model and source language from the configuration.
- MAI.Mem = SPIRV::MemoryModel::OpenCL;
- MAI.SrcLang = SPIRV::SourceLanguage::OpenCL_C;
- unsigned PtrSize = ST->getPointerSize();
- MAI.Addr = PtrSize == 32 ? SPIRV::AddressingModel::Physical32
- : PtrSize == 64 ? SPIRV::AddressingModel::Physical64
- : SPIRV::AddressingModel::Logical;
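+ // Take the addressing and memory model from the "spirv.MemoryModel" named
+ // metadata when present; otherwise default to the OpenCL memory model with
+ // an addressing model derived from the target pointer size.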
+ if (auto MemModel = M.getNamedMetadata("spirv.MemoryModel")) {
+ auto MemMD = MemModel->getOperand(0);
+ MAI.Addr = static_cast<SPIRV::AddressingModel>(getMetadataUInt(MemMD, 0));
+ MAI.Mem = static_cast<SPIRV::MemoryModel>(getMetadataUInt(MemMD, 1));
+ } else {
+ MAI.Mem = SPIRV::MemoryModel::OpenCL;
+ unsigned PtrSize = ST->getPointerSize();
+ MAI.Addr = PtrSize == 32 ? SPIRV::AddressingModel::Physical32
+ : PtrSize == 64 ? SPIRV::AddressingModel::Physical64
+ : SPIRV::AddressingModel::Logical;
+ }
// Get the OpenCL version number from metadata.
// TODO: support other source languages.
- MAI.SrcLangVersion = 0;
if (auto VerNode = M.getNamedMetadata("opencl.ocl.version")) {
- // Construct version literal according to OpenCL 2.2 environment spec.
+ MAI.SrcLang = SPIRV::SourceLanguage::OpenCL_C;
+ // Construct version literal in accordance with SPIRV-LLVM-Translator.
+ // TODO: support multiple OCL version metadata.
+ assert(VerNode->getNumOperands() > 0 && "Invalid SPIR");
auto VersionMD = VerNode->getOperand(0);
unsigned MajorNum = getMetadataUInt(VersionMD, 0, 2);
unsigned MinorNum = getMetadataUInt(VersionMD, 1);
unsigned RevNum = getMetadataUInt(VersionMD, 2);
- MAI.SrcLangVersion = 0 | (MajorNum << 16) | (MinorNum << 8) | RevNum;
+ MAI.SrcLangVersion = (MajorNum * 100 + MinorNum) * 1000 + RevNum;
+ } else {
+ MAI.SrcLang = SPIRV::SourceLanguage::Unknown;
+ MAI.SrcLangVersion = 0;
}
-}
-// True if there is an instruction in the MS list with all the same operands as
-// the given instruction has (after the given starting index).
-// TODO: maybe it needs to check Opcodes too.
-static bool findSameInstrInMS(const MachineInstr &A,
- SPIRV::ModuleSectionType MSType,
- SPIRV::ModuleAnalysisInfo &MAI,
- bool UpdateRegAliases,
- unsigned StartOpIndex = 0) {
- for (const auto *B : MAI.MS[MSType]) {
- const unsigned NumAOps = A.getNumOperands();
- if (NumAOps == B->getNumOperands() && A.getNumDefs() == B->getNumDefs()) {
- bool AllOpsMatch = true;
- for (unsigned i = StartOpIndex; i < NumAOps && AllOpsMatch; ++i) {
- if (A.getOperand(i).isReg() && B->getOperand(i).isReg()) {
- Register RegA = A.getOperand(i).getReg();
- Register RegB = B->getOperand(i).getReg();
- AllOpsMatch = MAI.getRegisterAlias(A.getMF(), RegA) ==
- MAI.getRegisterAlias(B->getMF(), RegB);
- } else {
- AllOpsMatch = A.getOperand(i).isIdenticalTo(B->getOperand(i));
- }
- }
- if (AllOpsMatch) {
- if (UpdateRegAliases) {
- assert(A.getOperand(0).isReg() && B->getOperand(0).isReg());
- Register LocalReg = A.getOperand(0).getReg();
- Register GlobalReg =
- MAI.getRegisterAlias(B->getMF(), B->getOperand(0).getReg());
- MAI.setRegisterAlias(A.getMF(), LocalReg, GlobalReg);
- }
- return true;
- }
+ if (auto ExtNode = M.getNamedMetadata("opencl.used.extensions")) {
+ for (unsigned I = 0, E = ExtNode->getNumOperands(); I != E; ++I) {
+ MDNode *MD = ExtNode->getOperand(I);
+ if (!MD || MD->getNumOperands() == 0)
+ continue;
+ for (unsigned J = 0, N = MD->getNumOperands(); J != N; ++J)
+ MAI.SrcExt.insert(cast<MDString>(MD->getOperand(J))->getString());
}
}
- return false;
+
+ // TODO: check if it's required by default.
+ MAI.ExtInstSetMap[static_cast<unsigned>(SPIRV::InstructionSet::OpenCL_std)] =
+ Register::index2VirtReg(MAI.getNextID());
}
// Collect MI which defines the register in the given machine function.
@@ -135,7 +123,7 @@ void SPIRVModuleAnalysis::collectGlobalEntities(
const std::vector<SPIRV::DTSortableEntry *> &DepsGraph,
SPIRV::ModuleSectionType MSType,
std::function<bool(const SPIRV::DTSortableEntry *)> Pred,
- bool UsePreOrder) {
+ bool UsePreOrder = false) {
DenseSet<const SPIRV::DTSortableEntry *> Visited;
for (const auto *E : DepsGraph) {
std::function<void(const SPIRV::DTSortableEntry *)> RecHoistUtil;
@@ -188,13 +176,41 @@ void SPIRVModuleAnalysis::processDefInstrs(const Module &M) {
collectGlobalEntities(
DepsGraph, SPIRV::MB_TypeConstVars,
- [](const SPIRV::DTSortableEntry *E) { return !E->getIsFunc(); }, false);
+ [](const SPIRV::DTSortableEntry *E) { return !E->getIsFunc(); });
collectGlobalEntities(
DepsGraph, SPIRV::MB_ExtFuncDecls,
[](const SPIRV::DTSortableEntry *E) { return E->getIsFunc(); }, true);
}
+// True if there is an instruction in the MS list with all the same operands as
+// the given instruction has (after the given starting index).
+// TODO: maybe it needs to check Opcodes too.
+static bool findSameInstrInMS(const MachineInstr &A,
+ SPIRV::ModuleSectionType MSType,
+ SPIRV::ModuleAnalysisInfo &MAI,
+ unsigned StartOpIndex = 0) {
+ for (const auto *B : MAI.MS[MSType]) {
+ const unsigned NumAOps = A.getNumOperands();
+ if (NumAOps != B->getNumOperands() || A.getNumDefs() != B->getNumDefs())
+ continue;
+ bool AllOpsMatch = true;
+ for (unsigned i = StartOpIndex; i < NumAOps && AllOpsMatch; ++i) {
+ if (A.getOperand(i).isReg() && B->getOperand(i).isReg()) {
+ Register RegA = A.getOperand(i).getReg();
+ Register RegB = B->getOperand(i).getReg();
+ AllOpsMatch = MAI.getRegisterAlias(A.getMF(), RegA) ==
+ MAI.getRegisterAlias(B->getMF(), RegB);
+ } else {
+ AllOpsMatch = A.getOperand(i).isIdenticalTo(B->getOperand(i));
+ }
+ }
+ if (AllOpsMatch)
+ return true;
+ }
+ return false;
+}
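As an aside, the duplicate test above can be pictured with plain containers; a rough sketch where Op and the alias map are invented stand-ins for MachineOperand and the register alias table:

#include <map>
#include <vector>

struct Op {
  bool IsReg;
  int Value; // register id when IsReg is set, otherwise an immediate
};

// Two operand lists match when, from StartIdx on, register operands compare
// equal after being rewritten through the alias table and all other operands
// compare equal directly.
static bool sameUnderAliases(const std::vector<Op> &A, const std::vector<Op> &B,
                             const std::map<int, int> &Alias,
                             unsigned StartIdx) {
  if (A.size() != B.size())
    return false;
  for (unsigned I = StartIdx; I < A.size(); ++I) {
    if (A[I].IsReg != B[I].IsReg)
      return false;
    int VA = A[I].Value, VB = B[I].Value;
    if (A[I].IsReg) {
      VA = Alias.count(VA) ? Alias.at(VA) : VA;
      VB = Alias.count(VB) ? Alias.at(VB) : VB;
    }
    if (VA != VB)
      return false;
  }
  return true;
}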
+
// Look for IDs declared with Import linkage, and map the imported name string
// to the register defining that variable (which will usually be the result of
// an OpFunction). This lets us call externally imported functions using
@@ -228,12 +244,16 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI,
// numbering has already occurred by this point. We can directly compare reg
// arguments when detecting duplicates.
static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI,
- SPIRV::ModuleSectionType MSType) {
+ SPIRV::ModuleSectionType MSType,
+ bool Append = true) {
MAI.setSkipEmission(&MI);
- if (findSameInstrInMS(MI, MSType, MAI, false))
+ if (findSameInstrInMS(MI, MSType, MAI))
return; // Found a duplicate, so don't add it.
// No duplicates, so add it.
- MAI.MS[MSType].push_back(&MI);
+ if (Append)
+ MAI.MS[MSType].push_back(&MI);
+ else
+ MAI.MS[MSType].insert(MAI.MS[MSType].begin(), &MI);
}
// Some global instructions make reference to function-local ID regs, so cannot
@@ -256,15 +276,22 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) {
} else if (TII->isDecorationInstr(MI)) {
collectOtherInstr(MI, MAI, SPIRV::MB_Annotations);
collectFuncNames(MI, *F);
+ } else if (TII->isConstantInstr(MI)) {
+ // OpSpecConstant*s are not in DT yet,
+ // but they still need to be collected.
+ collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars);
} else if (OpCode == SPIRV::OpFunction) {
collectFuncNames(MI, *F);
+ } else if (OpCode == SPIRV::OpTypeForwardPointer) {
+ collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars, false);
}
}
}
}
// Number registers in all functions globally from 0 onwards and store
-// the result in global register alias table.
+// the result in the global register alias table. Some registers are already
+// numbered in collectGlobalEntities.
void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) {
for (auto F = M.begin(), E = M.end(); F != E; ++F) {
if ((*F).isDeclaration())
@@ -282,11 +309,50 @@ void SPIRVModuleAnalysis::numberRegistersGlobally(const Module &M) {
Register NewReg = Register::index2VirtReg(MAI.getNextID());
MAI.setRegisterAlias(MF, Reg, NewReg);
}
+ if (MI.getOpcode() != SPIRV::OpExtInst)
+ continue;
+ auto Set = MI.getOperand(2).getImm();
+ if (MAI.ExtInstSetMap.find(Set) == MAI.ExtInstSetMap.end())
+ MAI.ExtInstSetMap[Set] = Register::index2VirtReg(MAI.getNextID());
}
}
}
}
+// Find OpIEqual and OpBranchConditional instructions originating from
+// OpSwitches and mark them as skipped for emission. Also mark an MBB as
+// skipped if it contains only such instructions.
+static void processSwitches(const Module &M, SPIRV::ModuleAnalysisInfo &MAI,
+ MachineModuleInfo *MMI) {
+ DenseSet<Register> SwitchRegs;
+ for (auto F = M.begin(), E = M.end(); F != E; ++F) {
+ MachineFunction *MF = MMI->getMachineFunction(*F);
+ if (!MF)
+ continue;
+ for (MachineBasicBlock &MBB : *MF)
+ for (MachineInstr &MI : MBB) {
+ if (MAI.getSkipEmission(&MI))
+ continue;
+ if (MI.getOpcode() == SPIRV::OpSwitch) {
+ assert(MI.getOperand(0).isReg());
+ SwitchRegs.insert(MI.getOperand(0).getReg());
+ }
+ if (MI.getOpcode() != SPIRV::OpIEqual || !MI.getOperand(2).isReg() ||
+ !SwitchRegs.contains(MI.getOperand(2).getReg()))
+ continue;
+ Register CmpReg = MI.getOperand(0).getReg();
+ MachineInstr *CBr = MI.getNextNode();
+ assert(CBr && CBr->getOpcode() == SPIRV::OpBranchConditional &&
+ CBr->getOperand(0).isReg() &&
+ CBr->getOperand(0).getReg() == CmpReg);
+ MAI.setSkipEmission(&MI);
+ MAI.setSkipEmission(CBr);
+ if (&MBB.front() == &MI && &MBB.back() == CBr)
+ MAI.MBBsToSkip.insert(&MBB);
+ }
+ }
+}
+
struct SPIRV::ModuleAnalysisInfo SPIRVModuleAnalysis::MAI;
void SPIRVModuleAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -305,7 +371,9 @@ bool SPIRVModuleAnalysis::runOnModule(Module &M) {
setBaseInfo(M);
- // TODO: Process type/const/global var/func decl instructions, number their
+ processSwitches(M, MAI, MMI);
+
+ // Process type/const/global var/func decl instructions, number their
// destination registers from 0 to N, collect Extensions and Capabilities.
processDefInstrs(M);
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
index 585868909d28..9bcdf6e9ae2a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.h
@@ -52,6 +52,9 @@ struct ModuleAnalysisInfo {
SPIRV::AddressingModel Addr;
SPIRV::SourceLanguage SrcLang;
unsigned SrcLangVersion;
+ StringSet<> SrcExt;
+ // Maps ExtInstSet to corresponding ID register.
+ DenseMap<unsigned, Register> ExtInstSetMap;
// Contains the list of all global OpVariables in the module.
SmallVector<MachineInstr *, 4> GlobalVarList;
// Maps function names to corresponding function ID registers.
@@ -59,6 +62,9 @@ struct ModuleAnalysisInfo {
// The set contains machine instructions which are necessary
// for correct MIR but will not be emitted in function bodies.
DenseSet<MachineInstr *> InstrsToDelete;
+ // The set contains machine basic blocks which are necessary
+ // for correct MIR but will not be emitted.
+ DenseSet<MachineBasicBlock *> MBBsToSkip;
// The table contains global aliases of local registers for each machine
// function. The aliases are used to substitute local registers during
// code emission.
@@ -75,6 +81,7 @@ struct ModuleAnalysisInfo {
assert(FuncReg != FuncNameMap.end() && "Cannot find function Id");
return FuncReg->second;
}
+ Register getExtInstSetReg(unsigned SetNum) { return ExtInstSetMap[SetNum]; }
InstrList &getMSInstrs(unsigned MSType) { return MS[MSType]; }
void setSkipEmission(MachineInstr *MI) { InstrsToDelete.insert(MI); }
bool getSkipEmission(const MachineInstr *MI) {
@@ -123,7 +130,6 @@ public:
private:
void setBaseInfo(const Module &M);
- template <typename T> void collectTypesConstsVars();
void collectGlobalEntities(
const std::vector<SPIRV::DTSortableEntry *> &DepsGraph,
SPIRV::ModuleSectionType MSType,
diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
index 687f84046650..e620226dcc7a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp
@@ -39,11 +39,58 @@ public:
};
} // namespace
-static bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID) {
- if (MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS &&
- MI.getIntrinsicID() == IntrinsicID)
- return true;
- return false;
+static void addConstantsToTrack(MachineFunction &MF, SPIRVGlobalRegistry *GR) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ DenseMap<MachineInstr *, Register> RegsAlreadyAddedToDT;
+ SmallVector<MachineInstr *, 10> ToErase, ToEraseComposites;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (!isSpvIntrinsic(MI, Intrinsic::spv_track_constant))
+ continue;
+ ToErase.push_back(&MI);
+ auto *Const =
+ cast<Constant>(cast<ConstantAsMetadata>(
+ MI.getOperand(3).getMetadata()->getOperand(0))
+ ->getValue());
+ if (auto *GV = dyn_cast<GlobalValue>(Const)) {
+ Register Reg = GR->find(GV, &MF);
+ if (!Reg.isValid())
+ GR->add(GV, &MF, MI.getOperand(2).getReg());
+ else
+ RegsAlreadyAddedToDT[&MI] = Reg;
+ } else {
+ Register Reg = GR->find(Const, &MF);
+ if (!Reg.isValid()) {
+ if (auto *ConstVec = dyn_cast<ConstantDataVector>(Const)) {
+ auto *BuildVec = MRI.getVRegDef(MI.getOperand(2).getReg());
+ assert(BuildVec &&
+ BuildVec->getOpcode() == TargetOpcode::G_BUILD_VECTOR);
+ for (unsigned i = 0; i < ConstVec->getNumElements(); ++i)
+ GR->add(ConstVec->getElementAsConstant(i), &MF,
+ BuildVec->getOperand(1 + i).getReg());
+ }
+ GR->add(Const, &MF, MI.getOperand(2).getReg());
+ } else {
+ RegsAlreadyAddedToDT[&MI] = Reg;
+ // This MI is unused and will be removed. If its source comes from a
+ // spv_const_composite intrinsic, that is unused too and is also removed.
+ assert(MI.getOperand(2).isReg() && "Reg operand is expected");
+ MachineInstr *SrcMI = MRI.getVRegDef(MI.getOperand(2).getReg());
+ if (SrcMI && isSpvIntrinsic(*SrcMI, Intrinsic::spv_const_composite))
+ ToEraseComposites.push_back(SrcMI);
+ }
+ }
+ }
+ }
+ for (MachineInstr *MI : ToErase) {
+ Register Reg = MI->getOperand(2).getReg();
+ if (RegsAlreadyAddedToDT.find(MI) != RegsAlreadyAddedToDT.end())
+ Reg = RegsAlreadyAddedToDT[MI];
+ MRI.replaceRegWith(MI->getOperand(0).getReg(), Reg);
+ MI->eraseFromParent();
+ }
+ for (MachineInstr *MI : ToEraseComposites)
+ MI->eraseFromParent();
}
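The bookkeeping above reduces to a find-or-record pattern; a simplified sketch with a plain std::map standing in for SPIRVGlobalRegistry (the names here are invented for illustration):

#include <map>

using Register = unsigned;

struct ConstantTracker {
  std::map<const void *, Register> Regs; // stand-in for the global registry

  // Returns the register every user of Const should end up with: the first
  // register recorded for it, or DefReg if Const has not been seen yet.
  Register track(const void *Const, Register DefReg) {
    auto It = Regs.find(Const);
    if (It == Regs.end()) {
      Regs.emplace(Const, DefReg); // first definition wins
      return DefReg;
    }
    return It->second; // duplicate: callers rewrite their uses to this one
  }
};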
static void foldConstantsIntoIntrinsics(MachineFunction &MF) {
@@ -120,6 +167,7 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR,
}
case TargetOpcode::G_TRUNC:
case TargetOpcode::G_ADDRSPACE_CAST:
+ case TargetOpcode::G_PTR_ADD:
case TargetOpcode::COPY: {
MachineOperand &Op = MI->getOperand(1);
MachineInstr *Def = Op.isReg() ? MRI.getVRegDef(Op.getReg()) : nullptr;
@@ -308,6 +356,22 @@ static void processInstrsWithTypeFolding(MachineFunction &MF,
processInstr(MI, MIB, MRI, GR);
}
}
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ // Rewrite dst types for ASSIGN_TYPE instrs so that tblgen'erated selection
+ // can handle them; this cannot be done in the Legalizer, which operates on
+ // gMIR only.
+ if (MI.getOpcode() != SPIRV::ASSIGN_TYPE)
+ continue;
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!isTypeFoldingSupported(MRI.getVRegDef(SrcReg)->getOpcode()))
+ continue;
+ Register DstReg = MI.getOperand(0).getReg();
+ if (MRI.getType(DstReg).isVector())
+ MRI.setRegClass(DstReg, &SPIRV::IDRegClass);
+ MRI.setType(DstReg, LLT::scalar(32));
+ }
+ }
}
static void processSwitches(MachineFunction &MF, SPIRVGlobalRegistry *GR,
@@ -421,6 +485,7 @@ bool SPIRVPreLegalizer::runOnMachineFunction(MachineFunction &MF) {
SPIRVGlobalRegistry *GR = ST.getSPIRVGlobalRegistry();
GR->setCurrentFunc(MF);
MachineIRBuilder MIB(MF);
+ addConstantsToTrack(MF, GR);
foldConstantsIntoIntrinsics(MF);
insertBitcasts(MF, GR, MIB);
generateAssignInstrs(MF, GR, MIB);
diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
new file mode 100644
index 000000000000..13c3c12c1b41
--- /dev/null
+++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp
@@ -0,0 +1,288 @@
+//===-- SPIRVPrepareFunctions.cpp - modify function signatures --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass modifies function signatures that contain aggregate arguments
+// and/or return values. It also replaces some LLVM intrinsic calls with
+// calls to functions it generates, mirroring what the translator does.
+//
+// NOTE: this is a module pass because it needs to modify
+// GVs/functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPIRV.h"
+#include "SPIRVTargetMachine.h"
+#include "SPIRVUtils.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeSPIRVPrepareFunctionsPass(PassRegistry &);
+}
+
+namespace {
+
+class SPIRVPrepareFunctions : public ModulePass {
+ Function *processFunctionSignature(Function *F);
+
+public:
+ static char ID;
+ SPIRVPrepareFunctions() : ModulePass(ID) {
+ initializeSPIRVPrepareFunctionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ StringRef getPassName() const override { return "SPIRV prepare functions"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ ModulePass::getAnalysisUsage(AU);
+ }
+};
+
+} // namespace
+
+char SPIRVPrepareFunctions::ID = 0;
+
+INITIALIZE_PASS(SPIRVPrepareFunctions, "prepare-functions",
+ "SPIRV prepare functions", false, false)
+
+Function *SPIRVPrepareFunctions::processFunctionSignature(Function *F) {
+ IRBuilder<> B(F->getContext());
+
+ bool IsRetAggr = F->getReturnType()->isAggregateType();
+ bool HasAggrArg =
+ std::any_of(F->arg_begin(), F->arg_end(), [](Argument &Arg) {
+ return Arg.getType()->isAggregateType();
+ });
+ bool DoClone = IsRetAggr || HasAggrArg;
+ if (!DoClone)
+ return F;
+ SmallVector<std::pair<int, Type *>, 4> ChangedTypes;
+ Type *RetType = IsRetAggr ? B.getInt32Ty() : F->getReturnType();
+ if (IsRetAggr)
+ ChangedTypes.push_back(std::pair<int, Type *>(-1, F->getReturnType()));
+ SmallVector<Type *, 4> ArgTypes;
+ for (const auto &Arg : F->args()) {
+ if (Arg.getType()->isAggregateType()) {
+ ArgTypes.push_back(B.getInt32Ty());
+ ChangedTypes.push_back(
+ std::pair<int, Type *>(Arg.getArgNo(), Arg.getType()));
+ } else
+ ArgTypes.push_back(Arg.getType());
+ }
+ FunctionType *NewFTy =
+ FunctionType::get(RetType, ArgTypes, F->getFunctionType()->isVarArg());
+ Function *NewF =
+ Function::Create(NewFTy, F->getLinkage(), F->getName(), *F->getParent());
+
+ ValueToValueMapTy VMap;
+ auto NewFArgIt = NewF->arg_begin();
+ for (auto &Arg : F->args()) {
+ StringRef ArgName = Arg.getName();
+ NewFArgIt->setName(ArgName);
+ VMap[&Arg] = &(*NewFArgIt++);
+ }
+ SmallVector<ReturnInst *, 8> Returns;
+
+ CloneFunctionInto(NewF, F, VMap, CloneFunctionChangeType::LocalChangesOnly,
+ Returns);
+ NewF->takeName(F);
+
+ NamedMDNode *FuncMD =
+ F->getParent()->getOrInsertNamedMetadata("spv.cloned_funcs");
+ SmallVector<Metadata *, 2> MDArgs;
+ MDArgs.push_back(MDString::get(B.getContext(), NewF->getName()));
+ for (auto &ChangedTyP : ChangedTypes)
+ MDArgs.push_back(MDNode::get(
+ B.getContext(),
+ {ConstantAsMetadata::get(B.getInt32(ChangedTyP.first)),
+ ValueAsMetadata::get(Constant::getNullValue(ChangedTyP.second))}));
+ MDNode *ThisFuncMD = MDNode::get(B.getContext(), MDArgs);
+ FuncMD->addOperand(ThisFuncMD);
+
+ for (auto *U : make_early_inc_range(F->users())) {
+ if (auto *CI = dyn_cast<CallInst>(U))
+ CI->mutateFunctionType(NewF->getFunctionType());
+ U->replaceUsesOfWith(F, NewF);
+ }
+ return NewF;
+}
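A simplified sketch of the rewrite recorded in spv.cloned_funcs above: aggregate positions are swapped for i32 and remembered as (index, original type) pairs, with -1 standing for the return type. Type names are plain strings here purely for illustration:

#include <string>
#include <utility>
#include <vector>

using TypeName = std::string;

static std::vector<std::pair<int, TypeName>>
rewriteSignature(TypeName &Ret, std::vector<TypeName> &Args,
                 bool (*IsAggregate)(const TypeName &)) {
  std::vector<std::pair<int, TypeName>> Changed;
  if (IsAggregate(Ret)) {
    Changed.emplace_back(-1, Ret); // -1 marks the return type
    Ret = "i32";
  }
  for (int I = 0, E = static_cast<int>(Args.size()); I != E; ++I)
    if (IsAggregate(Args[I])) {
      Changed.emplace_back(I, Args[I]);
      Args[I] = "i32";
    }
  return Changed; // what ends up encoded in the spv.cloned_funcs metadata
}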
+
+std::string lowerLLVMIntrinsicName(IntrinsicInst *II) {
+ Function *IntrinsicFunc = II->getCalledFunction();
+ assert(IntrinsicFunc && "Missing function");
+ std::string FuncName = IntrinsicFunc->getName().str();
+ std::replace(FuncName.begin(), FuncName.end(), '.', '_');
+ FuncName = "spirv." + FuncName;
+ return FuncName;
+}
+
+static Function *getOrCreateFunction(Module *M, Type *RetTy,
+ ArrayRef<Type *> ArgTypes,
+ StringRef Name) {
+ FunctionType *FT = FunctionType::get(RetTy, ArgTypes, false);
+ Function *F = M->getFunction(Name);
+ if (F && F->getFunctionType() == FT)
+ return F;
+ Function *NewF = Function::Create(FT, GlobalValue::ExternalLinkage, Name, M);
+ if (F)
+ NewF->setDSOLocal(F->isDSOLocal());
+ NewF->setCallingConv(CallingConv::SPIR_FUNC);
+ return NewF;
+}
+
+static void lowerFunnelShifts(Module *M, IntrinsicInst *FSHIntrinsic) {
+ // Get a separate function - otherwise, we'd have to rework the CFG of the
+ // current one. Then simply replace the intrinsic uses with a call to the new
+ // function.
+ // Generate LLVM IR for i* @spirv.llvm_fsh?_i* (i* %a, i* %b, i* %c)
+ FunctionType *FSHFuncTy = FSHIntrinsic->getFunctionType();
+ Type *FSHRetTy = FSHFuncTy->getReturnType();
+ const std::string FuncName = lowerLLVMIntrinsicName(FSHIntrinsic);
+ Function *FSHFunc =
+ getOrCreateFunction(M, FSHRetTy, FSHFuncTy->params(), FuncName);
+
+ if (!FSHFunc->empty()) {
+ FSHIntrinsic->setCalledFunction(FSHFunc);
+ return;
+ }
+ BasicBlock *RotateBB = BasicBlock::Create(M->getContext(), "rotate", FSHFunc);
+ IRBuilder<> IRB(RotateBB);
+ Type *Ty = FSHFunc->getReturnType();
+ // Build the actual funnel shift rotate logic.
+ // In the comments, "int" is used interchangeably with "vector of int
+ // elements".
+ FixedVectorType *VectorTy = dyn_cast<FixedVectorType>(Ty);
+ Type *IntTy = VectorTy ? VectorTy->getElementType() : Ty;
+ unsigned BitWidth = IntTy->getIntegerBitWidth();
+ ConstantInt *BitWidthConstant = IRB.getInt({BitWidth, BitWidth});
+ Value *BitWidthForInsts =
+ VectorTy
+ ? IRB.CreateVectorSplat(VectorTy->getNumElements(), BitWidthConstant)
+ : BitWidthConstant;
+ Value *RotateModVal =
+ IRB.CreateURem(/*Rotate*/ FSHFunc->getArg(2), BitWidthForInsts);
+ Value *FirstShift = nullptr, *SecShift = nullptr;
+ if (FSHIntrinsic->getIntrinsicID() == Intrinsic::fshr) {
+ // Shift the less significant number right, the "rotate" number of bits
+ // will be 0-filled on the left as a result of this regular shift.
+ FirstShift = IRB.CreateLShr(FSHFunc->getArg(1), RotateModVal);
+ } else {
+ // Shift the more significant number left, the "rotate" number of bits
+ // will be 0-filled on the right as a result of this regular shift.
+ FirstShift = IRB.CreateShl(FSHFunc->getArg(0), RotateModVal);
+ }
+ // We want the "rotate" number of the more significant int's LSBs (MSBs) to
+ // occupy the leftmost (rightmost) "0 space" left by the previous operation.
+ // Therefore, subtract the "rotate" number from the integer bitsize...
+ Value *SubRotateVal = IRB.CreateSub(BitWidthForInsts, RotateModVal);
+ if (FSHIntrinsic->getIntrinsicID() == Intrinsic::fshr) {
+ // ...and left-shift the more significant int by this number, zero-filling
+ // the LSBs.
+ SecShift = IRB.CreateShl(FSHFunc->getArg(0), SubRotateVal);
+ } else {
+ // ...and right-shift the less significant int by this number, zero-filling
+ // the MSBs.
+ SecShift = IRB.CreateLShr(FSHFunc->getArg(1), SubRotateVal);
+ }
+ // A simple binary addition of the shifted ints yields the final result.
+ IRB.CreateRet(IRB.CreateOr(FirstShift, SecShift));
+
+ FSHIntrinsic->setCalledFunction(FSHFunc);
+}
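The rotate math the generated @spirv.llvm_fsh* body performs can be checked in isolation; a sketch of the fshl case for i32, with a zero-rotate guard added so the C++ shifts stay well defined (the guard is not part of the generated IR):

#include <cassert>
#include <cstdint>

static uint32_t fshl32(uint32_t A, uint32_t B, uint32_t C) {
  uint32_t Rot = C % 32;         // RotateModVal
  if (Rot == 0)                  // avoid shifting by the full bit width
    return A;
  uint32_t Hi = A << Rot;        // FirstShift: A moves left, LSBs zero-filled
  uint32_t Lo = B >> (32 - Rot); // SecShift: top Rot bits of B become the LSBs
  return Hi | Lo;
}

int main() {
  assert(fshl32(0x12345678u, 0x9ABCDEF0u, 8) == 0x3456789Au);
  return 0;
}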
+
+static void buildUMulWithOverflowFunc(Module *M, Function *UMulFunc) {
+ // The function body is already created.
+ if (!UMulFunc->empty())
+ return;
+
+ BasicBlock *EntryBB = BasicBlock::Create(M->getContext(), "entry", UMulFunc);
+ IRBuilder<> IRB(EntryBB);
+ // Build the actual unsigned multiplication logic with the overflow
+ // indication. Do unsigned multiplication Mul = A * B. Then check
+ // if unsigned division Div = Mul / A is not equal to B. If so,
+ // then overflow has happened.
+ Value *Mul = IRB.CreateNUWMul(UMulFunc->getArg(0), UMulFunc->getArg(1));
+ Value *Div = IRB.CreateUDiv(Mul, UMulFunc->getArg(0));
+ Value *Overflow = IRB.CreateICmpNE(UMulFunc->getArg(1), Div);
+
+ // umul.with.overflow intrinsic return a structure, where the first element
+ // is the multiplication result, and the second is an overflow bit.
+ Type *StructTy = UMulFunc->getReturnType();
+ Value *Agg = IRB.CreateInsertValue(UndefValue::get(StructTy), Mul, {0});
+ Value *Res = IRB.CreateInsertValue(Agg, Overflow, {1});
+ IRB.CreateRet(Res);
+}
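The division-based overflow test described in the comment can be tried out standalone; a sketch using 8-bit values so overflow is easy to trigger (the zero check avoids a division by zero, which the sketch simply treats as no overflow):

#include <cassert>
#include <cstdint>

static bool umulOverflows(uint8_t A, uint8_t B) {
  uint8_t Mul = static_cast<uint8_t>(A * B); // low 8 bits of the product
  return A != 0 && Mul / A != B;             // Div != B means the product wrapped
}

int main() {
  assert(!umulOverflows(10, 12)); // 120 fits in 8 bits
  assert(umulOverflows(20, 20));  // 400 wraps to 144; 144 / 20 == 7 != 20
  return 0;
}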
+
+static void lowerUMulWithOverflow(Module *M, IntrinsicInst *UMulIntrinsic) {
+ // Get a separate function - otherwise, we'd have to rework the CFG of the
+ // current one. Then simply replace the intrinsic uses with a call to the new
+ // function.
+ FunctionType *UMulFuncTy = UMulIntrinsic->getFunctionType();
+ Type *FSHLRetTy = UMulFuncTy->getReturnType();
+ const std::string FuncName = lowerLLVMIntrinsicName(UMulIntrinsic);
+ Function *UMulFunc =
+ getOrCreateFunction(M, FSHLRetTy, UMulFuncTy->params(), FuncName);
+ buildUMulWithOverflowFunc(M, UMulFunc);
+ UMulIntrinsic->setCalledFunction(UMulFunc);
+}
+
+static void substituteIntrinsicCalls(Module *M, Function *F) {
+ for (BasicBlock &BB : *F) {
+ for (Instruction &I : BB) {
+ auto Call = dyn_cast<CallInst>(&I);
+ if (!Call)
+ continue;
+ Call->setTailCall(false);
+ Function *CF = Call->getCalledFunction();
+ if (!CF || !CF->isIntrinsic())
+ continue;
+ auto *II = cast<IntrinsicInst>(Call);
+ if (II->getIntrinsicID() == Intrinsic::fshl ||
+ II->getIntrinsicID() == Intrinsic::fshr)
+ lowerFunnelShifts(M, II);
+ else if (II->getIntrinsicID() == Intrinsic::umul_with_overflow)
+ lowerUMulWithOverflow(M, II);
+ }
+ }
+}
+
+bool SPIRVPrepareFunctions::runOnModule(Module &M) {
+ for (Function &F : M)
+ substituteIntrinsicCalls(&M, &F);
+
+ std::vector<Function *> FuncsWorklist;
+ bool Changed = false;
+ for (auto &F : M)
+ FuncsWorklist.push_back(&F);
+
+ for (auto *Func : FuncsWorklist) {
+ Function *F = processFunctionSignature(Func);
+
+ bool CreatedNewF = F != Func;
+
+ if (Func->isDeclaration()) {
+ Changed |= CreatedNewF;
+ continue;
+ }
+
+ if (CreatedNewF)
+ Func->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+ModulePass *llvm::createSPIRVPrepareFunctionsPass() {
+ return new SPIRVPrepareFunctions();
+}
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index cdf3a160f373..00549c7b5768 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -46,8 +46,7 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU,
PointerSize(computePointerSize(TT)), SPIRVVersion(0), InstrInfo(),
FrameLowering(initSubtargetDependencies(CPU, FS)), TLInfo(TM, *this) {
GR = std::make_unique<SPIRVGlobalRegistry>(PointerSize);
- CallLoweringInfo =
- std::make_unique<SPIRVCallLowering>(TLInfo, *this, GR.get());
+ CallLoweringInfo = std::make_unique<SPIRVCallLowering>(TLInfo, GR.get());
Legalizer = std::make_unique<SPIRVLegalizerInfo>(*this);
RegBankInfo = std::make_unique<SPIRVRegisterBankInfo>();
InstSelector.reset(
diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
index f7c88a5c6d4a..7f5f14dc3ce8 100644
--- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp
@@ -140,7 +140,10 @@ TargetPassConfig *SPIRVTargetMachine::createPassConfig(PassManagerBase &PM) {
return new SPIRVPassConfig(*this, PM);
}
-void SPIRVPassConfig::addIRPasses() { TargetPassConfig::addIRPasses(); }
+void SPIRVPassConfig::addIRPasses() {
+ TargetPassConfig::addIRPasses();
+ addPass(createSPIRVPrepareFunctionsPass());
+}
void SPIRVPassConfig::addISelPrepare() {
addPass(createSPIRVEmitIntrinsicsPass(&getTM<SPIRVTargetMachine>()));
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
index b92dc12735f8..15671ef3e512 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp
@@ -45,6 +45,14 @@ static size_t getPaddedLen(const StringRef &Str) {
return (Len % 4 == 0) ? Len : Len + (4 - (Len % 4));
}
+void addStringImm(const StringRef &Str, MCInst &Inst) {
+ const size_t PaddedLen = getPaddedLen(Str);
+ for (unsigned i = 0; i < PaddedLen; i += 4) {
+ // Add an operand for the 32-bits of chars or padding.
+ Inst.addOperand(MCOperand::createImm(convertCharsToWord(Str, i)));
+ }
+}
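A standalone sketch of the word packing these addStringImm overloads emit (convertCharsToWord and the exact padded-length rule are not shown in this hunk, so the null-terminator handling below is an assumption):

#include <cstdint>
#include <string>
#include <vector>

// Packs Str into 32-bit little-endian words, zero-padded to a multiple of 4
// bytes including the terminating null.
static std::vector<uint32_t> packString(const std::string &Str) {
  size_t Len = Str.size() + 1;
  size_t PaddedLen = (Len % 4 == 0) ? Len : Len + (4 - Len % 4);
  std::vector<uint32_t> Words;
  for (size_t I = 0; I < PaddedLen; I += 4) {
    uint32_t Word = 0;
    for (size_t J = 0; J < 4 && I + J < Str.size(); ++J)
      Word |= static_cast<uint32_t>(static_cast<unsigned char>(Str[I + J]))
              << (8 * J);
    Words.push_back(Word);
  }
  return Words; // e.g. packString("abc") yields the single word 0x00636261
}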
+
void addStringImm(const StringRef &Str, MachineInstrBuilder &MIB) {
const size_t PaddedLen = getPaddedLen(Str);
for (unsigned i = 0; i < PaddedLen; i += 4) {
@@ -182,6 +190,24 @@ SPIRV::MemorySemantics getMemSemanticsForStorageClass(SPIRV::StorageClass SC) {
}
}
+SPIRV::MemorySemantics getMemSemantics(AtomicOrdering Ord) {
+ switch (Ord) {
+ case AtomicOrdering::Acquire:
+ return SPIRV::MemorySemantics::Acquire;
+ case AtomicOrdering::Release:
+ return SPIRV::MemorySemantics::Release;
+ case AtomicOrdering::AcquireRelease:
+ return SPIRV::MemorySemantics::AcquireRelease;
+ case AtomicOrdering::SequentiallyConsistent:
+ return SPIRV::MemorySemantics::SequentiallyConsistent;
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::NotAtomic:
+ default:
+ return SPIRV::MemorySemantics::None;
+ }
+}
+
MachineInstr *getDefInstrMaybeConstant(Register &ConstReg,
const MachineRegisterInfo *MRI) {
MachineInstr *ConstInstr = MRI->getVRegDef(ConstReg);
@@ -202,6 +228,11 @@ uint64_t getIConstVal(Register ConstReg, const MachineRegisterInfo *MRI) {
return MI->getOperand(1).getCImm()->getValue().getZExtValue();
}
+bool isSpvIntrinsic(MachineInstr &MI, Intrinsic::ID IntrinsicID) {
+ return MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS &&
+ MI.getIntrinsicID() == IntrinsicID;
+}
+
Type *getMDOperandAsType(const MDNode *N, unsigned I) {
return cast<ValueAsMetadata>(N->getOperand(I))->getType();
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index ffa82c9c1fe4..35e24b076570 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -32,6 +32,7 @@ class SPIRVInstrInfo;
// Add the given string as a series of integer operands, inserting null
// terminators and padding so that the operands form complete 32-bit
// little-endian words.
+void addStringImm(const llvm::StringRef &Str, llvm::MCInst &Inst);
void addStringImm(const llvm::StringRef &Str, llvm::MachineInstrBuilder &MIB);
void addStringImm(const llvm::StringRef &Str, llvm::IRBuilder<> &B,
std::vector<llvm::Value *> &Args);
@@ -67,6 +68,8 @@ llvm::SPIRV::StorageClass addressSpaceToStorageClass(unsigned AddrSpace);
llvm::SPIRV::MemorySemantics
getMemSemanticsForStorageClass(llvm::SPIRV::StorageClass SC);
+llvm::SPIRV::MemorySemantics getMemSemantics(llvm::AtomicOrdering Ord);
+
// Find the def instruction for the given ConstReg, walking through
// spv_track_constant and ASSIGN_TYPE instructions. Updates ConstReg to the
// def of the OpConstant instruction.
@@ -78,6 +81,9 @@ getDefInstrMaybeConstant(llvm::Register &ConstReg,
uint64_t getIConstVal(llvm::Register ConstReg,
const llvm::MachineRegisterInfo *MRI);
+// Check if MI is a SPIR-V specific intrinsic call.
+bool isSpvIntrinsic(llvm::MachineInstr &MI, llvm::Intrinsic::ID IntrinsicID);
+
// Get type of i-th operand of the metadata node.
llvm::Type *getMDOperandAsType(const llvm::MDNode *N, unsigned I);
#endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
diff --git a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
index 1138788ac7fa..1f8837eb0194 100644
--- a/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
+++ b/llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -24,10 +24,10 @@ Target &llvm::getTheSparcelTarget() {
}
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSparcTargetInfo() {
- RegisterTarget<Triple::sparc, /*HasJIT=*/true> X(getTheSparcTarget(), "sparc",
- "Sparc", "Sparc");
- RegisterTarget<Triple::sparcv9, /*HasJIT=*/true> Y(
+ RegisterTarget<Triple::sparc, /*HasJIT=*/false> X(getTheSparcTarget(),
+ "sparc", "Sparc", "Sparc");
+ RegisterTarget<Triple::sparcv9, /*HasJIT=*/false> Y(
getTheSparcV9Target(), "sparcv9", "Sparc V9", "Sparc");
- RegisterTarget<Triple::sparcel, /*HasJIT=*/true> Z(
+ RegisterTarget<Triple::sparcel, /*HasJIT=*/false> Z(
getTheSparcelTarget(), "sparcel", "Sparc LE", "Sparc");
}
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp b/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
index 9c73757d7f5c..86eb8365d527 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.cpp
@@ -28,7 +28,3 @@ const MCPhysReg SystemZ::XPLINK64ArgGPRs[SystemZ::XPLINK64NumArgGPRs] = {
const MCPhysReg SystemZ::XPLINK64ArgFPRs[SystemZ::XPLINK64NumArgFPRs] = {
SystemZ::F0D, SystemZ::F2D, SystemZ::F4D, SystemZ::F6D
};
-
-const MCPhysReg SystemZ::XPLINK64ArgVRs[SystemZ::XPLINK64NumArgVRs] = {
- SystemZ::V24, SystemZ::V25, SystemZ::V26, SystemZ::V27,
- SystemZ::V28, SystemZ::V29, SystemZ::V30, SystemZ::V31};
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
index f82c61c0f344..387411942aba 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -27,9 +27,6 @@ namespace SystemZ {
const unsigned XPLINK64NumArgFPRs = 4;
extern const MCPhysReg XPLINK64ArgFPRs[XPLINK64NumArgFPRs];
-
- const unsigned XPLINK64NumArgVRs = 8;
- extern const MCPhysReg XPLINK64ArgVRs[XPLINK64NumArgVRs];
} // end namespace SystemZ
class SystemZCCState : public CCState {
@@ -205,41 +202,6 @@ inline bool CC_XPLINK64_Allocate128BitVararg(unsigned &ValNo, MVT &ValVT,
return false;
}
-inline bool CC_XPLINK64_Shadow_Stack(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
- CCValAssign::LocInfo &LocInfo,
- ISD::ArgFlagsTy &ArgFlags,
- CCState &State) {
- ArrayRef<MCPhysReg> RegList;
-
- switch (LocVT.SimpleTy) {
- case MVT::i64:
- RegList = SystemZ::XPLINK64ArgGPRs;
- break;
- case MVT::v16i8:
- case MVT::v8i16:
- case MVT::v4i32:
- case MVT::v2i64:
- case MVT::v4f32:
- case MVT::v2f64:
- RegList = SystemZ::XPLINK64ArgVRs;
- break;
- case MVT::f32:
- case MVT::f64:
- case MVT::f128:
- RegList = SystemZ::XPLINK64ArgFPRs;
- break;
- default:
- return false;
- }
-
- unsigned UnallocatedRegisterIndex = State.getFirstUnallocated(RegList);
- // Every time we can allocate a register, allocate on the stack.
- if (UnallocatedRegisterIndex < RegList.size())
- State.AllocateStack(LocVT.getSizeInBits() / 8, Align(8));
-
- return false;
-}
-
inline bool RetCC_SystemZ_Error(unsigned &, MVT &, MVT &,
CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
CCState &) {
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index fdd82a01f211..29b4a26736b2 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -221,9 +221,10 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
// XPLINK64 ABI compliant code widens integral types smaller than i64
// to i64 before placing the parameters either on the stack or in registers.
CCIfType<[i32], CCIfExtend<CCPromoteToType<i64>>>,
- // Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRS.
- CCIfType<[f32], CCIfNotFixed<CCPromoteToType<f64>>>,
- CCIfType<[f64], CCIfNotFixed<CCBitConvertToType<i64>>>,
+ // Promote f32 to f64 and bitcast to i64, if it needs to be passed in GPRs.
+ // Although we assign the f32 vararg to be bitcast, it will first be promoted
+ // to an f64 within convertValVTToLocVT().
+ CCIfType<[f32, f64], CCIfNotFixed<CCBitConvertToType<i64>>>,
// long double, can only be passed in GPR2 and GPR3, if available,
// hence R2Q
CCIfType<[f128], CCIfNotFixed<CCCustom<"CC_XPLINK64_Allocate128BitVararg">>>,
@@ -246,34 +247,29 @@ def CC_SystemZ_XPLINK64 : CallingConv<[
// The first 3 integer arguments are passed in registers R1D-R3D.
// The rest will be passed in the user area. The address offset of the user
// area can be found in register R4D.
- CCIfType<[i64], CCCustom<"CC_XPLINK64_Shadow_Stack">>,
- CCIfType<[i64], CCAssignToReg<[R1D, R2D, R3D]>>,
+ CCIfType<[i64], CCAssignToRegAndStack<[R1D, R2D, R3D], 8, 8>>,
- // The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
+ // The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors
// are passed in the same way, but they're widened to one of these types
// during type legalization.
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
- CCIfSubtarget<"hasVector()",
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>>,
+ CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>>,
CCIfSubtarget<"hasVector()",
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCIfFixed<CCAssignToReg<[V24, V25, V26, V27,
- V28, V29, V30, V31]>>>>,
+ CCIfFixed<CCAssignToRegAndStack<[V24, V25, V26, V27,
+ V28, V29, V30, V31], 16, 8>>>>,
- // The first 4 named float and double arguments are passed in registers FPR0-FPR6.
- // The rest will be passed in the user area.
+ // The first 4 named float and double arguments are passed in registers
+ // FPR0-FPR6. The rest will be passed in the user area.
CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
- CCIfType<[f32, f64], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>,
- CCIfType<[f32], CCIfFixed<CCAssignToReg<[F0S, F2S, F4S, F6S]>>>,
- CCIfType<[f64], CCIfFixed<CCAssignToReg<[F0D, F2D, F4D, F6D]>>>,
+ CCIfType<[f32], CCIfFixed<CCAssignToRegAndStack<[F0S, F2S, F4S, F6S], 4, 8>>>,
+ CCIfType<[f64], CCIfFixed<CCAssignToRegAndStack<[F0D, F2D, F4D, F6D], 8, 8>>>,
+
// The first 2 long double arguments are passed in register FPR0/FPR2
// and FPR4/FPR6. The rest will be passed in the user area.
CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Reg">>>,
- CCIfType<[f128], CCIfFixed<CCCustom<"CC_XPLINK64_Shadow_Stack">>>,
- CCIfType<[f128], CCIfFixed<CCAssignToReg<[F0Q, F4Q]>>>,
+ CCIfType<[f128], CCIfFixed<CCAssignToRegAndStack<[F0Q, F4Q], 16, 8>>>,
// Other arguments are passed in 8-byte-aligned 8-byte stack slots.
CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 975eb8862e82..d943507b4112 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -911,6 +911,54 @@ SystemZXPLINKFrameLowering::SystemZXPLINKFrameLowering()
XPLINKSpillOffsetTable[I].Offset;
}
+// Checks if the function is a potential candidate for being an XPLeaf routine.
+static bool isXPLeafCandidate(const MachineFunction &MF) {
+ const MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+ auto *Regs =
+ static_cast<SystemZXPLINK64Registers *>(Subtarget.getSpecialRegisters());
+
+ // If the function calls other functions, including alloca, then it is not
+ // an XPLeaf routine.
+ if (MFFrame.hasCalls())
+ return false;
+
+ // If the function has variable-sized objects, it is not an XPLeaf routine.
+ if (MFFrame.hasVarSizedObjects())
+ return false;
+
+ // If the function adjusts the stack, then it is not an XPLeaf routine.
+ if (MFFrame.adjustsStack())
+ return false;
+
+ // If the function modifies the stack pointer register, then it is not an
+ // XPLeaf routine.
+ if (MRI.isPhysRegModified(Regs->getStackPointerRegister()))
+ return false;
+
+ // If the function modifies the ADA register, it is not an XPLeaf routine.
+ if (MRI.isPhysRegModified(Regs->getAddressOfCalleeRegister()))
+ return false;
+
+ // If the function modifies the return address register, then it is not an
+ // XPLeaf routine.
+ if (MRI.isPhysRegModified(Regs->getReturnFunctionAddressRegister()))
+ return false;
+
+ // If the backchain pointer should be stored, it is not an XPLeaf routine.
+ if (MF.getFunction().hasFnAttribute("backchain"))
+ return false;
+
+ // If the function acquires its own stack frame, then it is not an XPLeaf
+ // routine. At the time this function is called, only slots for local
+ // variables have been allocated, so this is a very rough estimate.
+ if (MFFrame.estimateStackSize(MF) > 0)
+ return false;
+
+ return true;
+}
+
bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots(
MachineFunction &MF, const TargetRegisterInfo *TRI,
std::vector<CalleeSavedInfo> &CSI) const {
@@ -920,6 +968,18 @@ bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots(
auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
auto &GRRegClass = SystemZ::GR64BitRegClass;
+ // At this point, the result of isXPLeafCandidate() is not accurate because
+ // the size of the save area has not yet been determined. If
+ // isXPLeafCandidate() indicates a potential leaf function and there are no
+ // callee-saved registers, then it is indeed a leaf function, and we can
+ // exit early.
+ // TODO: It is possible for leaf functions to use callee-saved registers.
+ // They can use the 0-2k range between R4 and the caller's stack frame
+ // without acquiring their own stack frame.
+ bool IsLeaf = CSI.empty() && isXPLeafCandidate(MF);
+ if (IsLeaf)
+ return true;
+
// For non-leaf functions:
// - the address of callee (entry point) register R6 must be saved
CSI.push_back(CalleeSavedInfo(Regs.getAddressOfCalleeRegister()));
@@ -1137,16 +1197,16 @@ void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF,
auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
MachineFrameInfo &MFFrame = MF.getFrameInfo();
MachineInstr *StoreInstr = nullptr;
+
+ determineFrameLayout(MF);
+
bool HasFP = hasFP(MF);
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
DebugLoc DL;
uint64_t Offset = 0;
- // TODO: Support leaf functions; only add size of save+reserved area when
- // function is non-leaf.
- MFFrame.setStackSize(MFFrame.getStackSize() + Regs.getCallFrameSize());
- uint64_t StackSize = MFFrame.getStackSize();
+ const uint64_t StackSize = MFFrame.getStackSize();
if (ZFI->getSpillGPRRegs().LowGPR) {
// Skip over the GPR saves.
@@ -1213,8 +1273,8 @@ void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF,
// Mark the FramePtr as live at the beginning of every block except
// the entry block. (We'll have marked R8 as live on entry when
// saving the GPRs.)
- for (auto I = std::next(MF.begin()), E = MF.end(); I != E; ++I)
- I->addLiveIn(Regs.getFramePointerRegister());
+ for (MachineBasicBlock &B : llvm::drop_begin(MF))
+ B.addLiveIn(Regs.getFramePointerRegister());
}
}
@@ -1321,3 +1381,32 @@ void SystemZXPLINKFrameLowering::processFunctionBeforeFrameFinalized(
// Setup stack frame offset
MFFrame.setOffsetAdjustment(Regs.getStackPointerBias());
}
+
+// Determines the size of the frame, and creates the deferred spill objects.
+void SystemZXPLINKFrameLowering::determineFrameLayout(
+ MachineFunction &MF) const {
+ MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+ auto *Regs =
+ static_cast<SystemZXPLINK64Registers *>(Subtarget.getSpecialRegisters());
+
+ uint64_t StackSize = MFFrame.getStackSize();
+ if (StackSize == 0)
+ return;
+
+ // Add the size of the register save area and the reserved area to the size.
+ StackSize += Regs->getCallFrameSize();
+ MFFrame.setStackSize(StackSize);
+
+ // We now know the stack size. Create the fixed spill stack objects for the
+ // register save area now. This has no impact on the stack frame layout,
+ // which is already computed. However, it makes sure that all callee-saved
+ // registers have a valid frame index assigned.
+ const unsigned RegSize = MF.getDataLayout().getPointerSize();
+ for (auto &CS : MFFrame.getCalleeSavedInfo()) {
+ int Offset = RegSpillOffsets[CS.getReg()];
+ if (Offset >= 0)
+ CS.setFrameIdx(
+ MFFrame.CreateFixedSpillStackObject(RegSize, Offset - StackSize));
+ }
+}
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
index bec83a9457e0..95f30e3c0d99 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -134,6 +134,8 @@ public:
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
+
+ void determineFrameLayout(MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 42c1c77f14e4..ac4531262187 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1404,8 +1404,12 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value);
case CCValAssign::BCvt: {
assert(VA.getLocVT() == MVT::i64 || VA.getLocVT() == MVT::i128);
- assert(VA.getValVT().isVector() || VA.getValVT() == MVT::f64 ||
- VA.getValVT() == MVT::f128);
+ assert(VA.getValVT().isVector() || VA.getValVT() == MVT::f32 ||
+ VA.getValVT() == MVT::f64 || VA.getValVT() == MVT::f128);
+ // For an f32 vararg we need to first promote it to an f64 and then
+ // bitcast it to an i64.
+ if (VA.getValVT() == MVT::f32 && VA.getLocVT() == MVT::i64)
+ Value = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f64, Value);
MVT BitCastToType = VA.getValVT().isVector() && VA.getLocVT() == MVT::i64
? MVT::v2i64
: VA.getLocVT();
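A standalone illustration of what the f32 vararg path above does to a value on its way into an XPLINK64 GPR slot: promote to f64, then reuse the same bits as an i64 (the helper name is made up for the example):

#include <cassert>
#include <cstdint>
#include <cstring>

static uint64_t f32VarArgToGPR(float F) {
  double D = static_cast<double>(F); // FP_EXTEND
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits)); // BCvt: same bits in an i64 location
  return Bits;
}

int main() {
  assert(f32VarArgToGPR(1.0f) == 0x3FF0000000000000ULL);
  return 0;
}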
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
index 94ebb59c4c77..46bb85606a62 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -418,7 +418,9 @@ unsigned VEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
if (MI.getOpcode() == VE::LDrii || // I64
MI.getOpcode() == VE::LDLSXrii || // I32
MI.getOpcode() == VE::LDUrii || // F32
- MI.getOpcode() == VE::LDQrii // F128 (pseudo)
+ MI.getOpcode() == VE::LDQrii || // F128 (pseudo)
+ MI.getOpcode() == VE::LDVMrii || // VM (pseudo)
+ MI.getOpcode() == VE::LDVM512rii // VM512 (pseudo)
) {
if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
MI.getOperand(2).getImm() == 0 && MI.getOperand(3).isImm() &&
@@ -437,10 +439,12 @@ unsigned VEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
/// any side effects other than storing to the stack slot.
unsigned VEInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
- if (MI.getOpcode() == VE::STrii || // I64
- MI.getOpcode() == VE::STLrii || // I32
- MI.getOpcode() == VE::STUrii || // F32
- MI.getOpcode() == VE::STQrii // F128 (pseudo)
+ if (MI.getOpcode() == VE::STrii || // I64
+ MI.getOpcode() == VE::STLrii || // I32
+ MI.getOpcode() == VE::STUrii || // F32
+ MI.getOpcode() == VE::STQrii || // F128 (pseudo)
+ MI.getOpcode() == VE::STVMrii || // VM (pseudo)
+ MI.getOpcode() == VE::STVM512rii // VM512 (pseudo)
) {
if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
MI.getOperand(1).getImm() == 0 && MI.getOperand(2).isImm() &&
@@ -496,6 +500,20 @@ void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
.addImm(0)
.addReg(SrcReg, getKillRegState(isKill))
.addMemOperand(MMO);
+ } else if (RC == &VE::VMRegClass) {
+ BuildMI(MBB, I, DL, get(VE::STVMrii))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO);
+ } else if (VE::VM512RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(VE::STVM512rii))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO);
} else
report_fatal_error("Can't store this register to stack slot");
}
@@ -539,6 +557,18 @@ void VEInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
.addImm(0)
.addImm(0)
.addMemOperand(MMO);
+ } else if (RC == &VE::VMRegClass) {
+ BuildMI(MBB, I, DL, get(VE::LDVMrii), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addMemOperand(MMO);
+ } else if (VE::VM512RegClass.hasSubClassEq(RC)) {
+ BuildMI(MBB, I, DL, get(VE::LDVM512rii), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addMemOperand(MMO);
} else
report_fatal_error("Can't load this register from stack slot");
}
diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td
index 71199717a3a2..0b2f5039e3f3 100644
--- a/llvm/lib/Target/VE/VEInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td
@@ -25,6 +25,20 @@ def: Pat<(i64 (repl_i32 i32:$val)),
(zero_f32 (i2l $val)),
(SLLri (i2l $val), 32))>;
+///// Mask Load & Store /////
+
+// Loads and stores of v256i1 and v512i1 are implemented in two ways. The
+// STVM/STVM512 pseudos are used for frame-index-related load/store
+// instructions; custom lowering is used for all other load/store instructions.
+
+def : Pat<(v256i1 (load ADDRrii:$addr)),
+ (LDVMrii ADDRrii:$addr)>;
+def : Pat<(v512i1 (load ADDRrii:$addr)),
+ (LDVM512rii ADDRrii:$addr)>;
+def : Pat<(store v256i1:$vx, ADDRrii:$addr),
+ (STVMrii ADDRrii:$addr, $vx)>;
+def : Pat<(store v512i1:$vx, ADDRrii:$addr),
+ (STVM512rii ADDRrii:$addr, $vx)>;
multiclass vbrd_elem32<ValueType v32, ValueType s32, SDPatternOperator ImmOp,
SDNodeXForm ImmCast, OutPatFrag SuperRegCast> {
diff --git a/llvm/lib/Target/VE/VEInstrVec.td b/llvm/lib/Target/VE/VEInstrVec.td
index 4a8476f7288a..327ad9ceacc5 100644
--- a/llvm/lib/Target/VE/VEInstrVec.td
+++ b/llvm/lib/Target/VE/VEInstrVec.td
@@ -2,6 +2,33 @@
// Vector Instructions
//===----------------------------------------------------------------------===//
+// Pseudo instructions for VM/VM512 spill/restore
+//
+// These pseudo instructions are used only for spill/restore, since
+// InlineSpiller assumes that storeRegToStackSlot/loadRegFromStackSlot
+// emit a single instruction. Those functions therefore emit either a
+// single real store/load instruction or one of these pseudo store/load
+// instructions.
+//
+// Specifies hasSideEffects = 0 to disable UnmodeledSideEffects.
+
+let mayLoad = 1, hasSideEffects = 0 in {
+def LDVMrii : Pseudo<
+ (outs VM:$vmx), (ins MEMrii:$addr),
+ "# pseudo ldvm $vmx, $addr", []>;
+def LDVM512rii : Pseudo<
+ (outs VM512:$vmx), (ins MEMrii:$addr),
+ "# pseudo ldvm512 $vmx, $addr", []>;
+}
+let mayStore = 1, hasSideEffects = 0 in {
+def STVMrii : Pseudo<
+ (outs), (ins MEMrii:$addr, VM:$vmx),
+ "# pseudo stvm $addr, $vmx", []>;
+def STVM512rii : Pseudo<
+ (outs), (ins MEMrii:$addr, VM512:$vmx),
+ "# pseudo stvm512 $addr, $vmx", []>;
+}
+
//===----------------------------------------------------------------------===//
// Pseudo instructions for VM512 modifications
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp
index f334af128162..397ea09c9a02 100644
--- a/llvm/lib/Target/VE/VERegisterInfo.cpp
+++ b/llvm/lib/Target/VE/VERegisterInfo.cpp
@@ -180,6 +180,16 @@ class EliminateFrameIndex {
int FIOperandNum);
void processLDQ(MachineInstr &MI, Register FrameReg, int64_t Offset,
int FIOperandNum);
+ // Expand and eliminate Frame Index of pseudo STVMrii and LDVMrii.
+ void processSTVM(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+ void processLDVM(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+ // Expand and eliminate Frame Index of pseudo STVM512rii and LDVM512rii.
+ void processSTVM512(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
+ void processLDVM512(MachineInstr &MI, Register FrameReg, int64_t Offset,
+ int FIOperandNum);
public:
EliminateFrameIndex(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI,
@@ -271,6 +281,185 @@ void EliminateFrameIndex::processLDQ(MachineInstr &MI, Register FrameReg,
replaceFI(MI, FrameReg, Offset, FIOperandNum);
}
+void EliminateFrameIndex::processSTVM(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(MI.getOpcode() == VE::STVMrii);
+ LLVM_DEBUG(dbgs() << "processSTVM: "; MI.dump());
+
+ // Original MI is:
+ // STVMrii frame-index, 0, offset, reg (, memory operand)
+ // Convert it to:
+ // SVMi tmp-reg, reg, 0
+ // STrii frame-reg, 0, offset, tmp-reg
+ // SVMi tmp-reg, reg, 1
+ // STrii frame-reg, 0, offset+8, tmp-reg
+ // SVMi tmp-reg, reg, 2
+ // STrii frame-reg, 0, offset+16, tmp-reg
+ // SVMi tmp-reg, reg, 3
+ // STrii frame-reg, 0, offset+24, tmp-reg
+
+ prepareReplaceFI(MI, FrameReg, Offset, 24);
+
+ Register SrcReg = MI.getOperand(3).getReg();
+ bool isKill = MI.getOperand(3).isKill();
+ // FIXME: it would be better to scavenge a register here instead of
+ // reserving SX16 all of the time.
+ Register TmpReg = VE::SX16;
+ for (int i = 0; i < 3; ++i) {
+ build(VE::SVMmr, TmpReg).addReg(SrcReg).addImm(i);
+ MachineInstr *StMI =
+ build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(
+ TmpReg, getKillRegState(true));
+ replaceFI(*StMI, FrameReg, Offset, 0);
+ Offset += 8;
+ }
+ build(VE::SVMmr, TmpReg).addReg(SrcReg, getKillRegState(isKill)).addImm(3);
+ MI.setDesc(get(VE::STrii));
+ MI.getOperand(3).ChangeToRegister(TmpReg, false, false, true);
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processLDVM(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(MI.getOpcode() == VE::LDVMrii);
+ LLVM_DEBUG(dbgs() << "processLDVM: "; MI.dump());
+
+ // Original MI is:
+ // LDVMrii reg, frame-index, 0, offset (, memory operand)
+ // Convert it to:
+ // LDrii tmp-reg, frame-reg, 0, offset
+ // LVMir vm, 0, tmp-reg
+ // LDrii tmp-reg, frame-reg, 0, offset+8
+ // LVMir_m vm, 1, tmp-reg, vm
+ // LDrii tmp-reg, frame-reg, 0, offset+16
+ // LVMir_m vm, 2, tmp-reg, vm
+ // LDrii tmp-reg, frame-reg, 0, offset+24
+ // LVMir_m vm, 3, tmp-reg, vm
+
+ prepareReplaceFI(MI, FrameReg, Offset, 24);
+
+ Register DestReg = MI.getOperand(0).getReg();
+ // FIXME: it would be better to scavenge a register here instead of
+ // reserving SX16 all of the time.
+ unsigned TmpReg = VE::SX16;
+ for (int i = 0; i < 4; ++i) {
+ if (i != 3) {
+ MachineInstr *StMI =
+ build(VE::LDrii, TmpReg).addReg(FrameReg).addImm(0).addImm(0);
+ replaceFI(*StMI, FrameReg, Offset, 1);
+ Offset += 8;
+ } else {
+ // The last LDrii replaces the target instruction.
+ MI.setDesc(get(VE::LDrii));
+ MI.getOperand(0).ChangeToRegister(TmpReg, true);
+ }
+ // The first LVM is LVMir; the others are LVMir_m. The last LVM is placed
+ // right after the target instruction.
+ if (i == 0)
+ build(VE::LVMir, DestReg).addImm(i).addReg(TmpReg, getKillRegState(true));
+ else if (i != 3)
+ build(VE::LVMir_m, DestReg)
+ .addImm(i)
+ .addReg(TmpReg, getKillRegState(true))
+ .addReg(DestReg);
+ else
+ BuildMI(*MI.getParent(), std::next(II), DL, get(VE::LVMir_m), DestReg)
+ .addImm(3)
+ .addReg(TmpReg, getKillRegState(true))
+ .addReg(DestReg);
+ }
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processSTVM512(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(MI.getOpcode() == VE::STVM512rii);
+ LLVM_DEBUG(dbgs() << "processSTVM512: "; MI.dump());
+
+ prepareReplaceFI(MI, FrameReg, Offset, 56);
+
+ Register SrcReg = MI.getOperand(3).getReg();
+ Register SrcLoReg = getSubReg(SrcReg, VE::sub_vm_odd);
+ Register SrcHiReg = getSubReg(SrcReg, VE::sub_vm_even);
+ bool isKill = MI.getOperand(3).isKill();
+ // FIXME: it would be better to scavenge a register here instead of
+ // reserving SX16 all of the time.
+ Register TmpReg = VE::SX16;
+ // store low part of VMP
+ MachineInstr *LastMI = nullptr;
+ for (int i = 0; i < 4; ++i) {
+ LastMI = build(VE::SVMmr, TmpReg).addReg(SrcLoReg).addImm(i);
+ MachineInstr *StMI =
+ build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(
+ TmpReg, getKillRegState(true));
+ replaceFI(*StMI, FrameReg, Offset, 0);
+ Offset += 8;
+ }
+ if (isKill)
+ LastMI->addRegisterKilled(SrcLoReg, &TRI, true);
+ // store high part of VMP
+ for (int i = 0; i < 3; ++i) {
+ build(VE::SVMmr, TmpReg).addReg(SrcHiReg).addImm(i);
+ MachineInstr *StMI =
+ build(VE::STrii).addReg(FrameReg).addImm(0).addImm(0).addReg(
+ TmpReg, getKillRegState(true));
+ replaceFI(*StMI, FrameReg, Offset, 0);
+ Offset += 8;
+ }
+ LastMI = build(VE::SVMmr, TmpReg).addReg(SrcHiReg).addImm(3);
+ if (isKill) {
+ LastMI->addRegisterKilled(SrcHiReg, &TRI, true);
+ // Add implicit super-register kills to the particular MI.
+ LastMI->addRegisterKilled(SrcReg, &TRI, true);
+ }
+ MI.setDesc(get(VE::STrii));
+ MI.getOperand(3).ChangeToRegister(TmpReg, false, false, true);
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
+void EliminateFrameIndex::processLDVM512(MachineInstr &MI, Register FrameReg,
+ int64_t Offset, int FIOperandNum) {
+ assert(MI.getOpcode() == VE::LDVM512rii);
+ LLVM_DEBUG(dbgs() << "processLDVM512: "; MI.dump());
+
+ prepareReplaceFI(MI, FrameReg, Offset, 56);
+
+ Register DestReg = MI.getOperand(0).getReg();
+ Register DestLoReg = getSubReg(DestReg, VE::sub_vm_odd);
+ Register DestHiReg = getSubReg(DestReg, VE::sub_vm_even);
+ // FIXME: it would be better to scavenge a register here instead of
+ // reserving SX16 all of the time.
+ Register TmpReg = VE::SX16;
+ build(VE::IMPLICIT_DEF, DestReg);
+ for (int i = 0; i < 4; ++i) {
+ MachineInstr *LdMI =
+ build(VE::LDrii, TmpReg).addReg(FrameReg).addImm(0).addImm(0);
+ replaceFI(*LdMI, FrameReg, Offset, 1);
+ build(VE::LVMir_m, DestLoReg)
+ .addImm(i)
+ .addReg(TmpReg, getKillRegState(true))
+ .addReg(DestLoReg);
+ Offset += 8;
+ }
+ for (int i = 0; i < 3; ++i) {
+ MachineInstr *LdMI =
+ build(VE::LDrii, TmpReg).addReg(FrameReg).addImm(0).addImm(0);
+ replaceFI(*LdMI, FrameReg, Offset, 1);
+ build(VE::LVMir_m, DestHiReg)
+ .addImm(i)
+ .addReg(TmpReg, getKillRegState(true))
+ .addReg(DestHiReg);
+ Offset += 8;
+ }
+ MI.setDesc(get(VE::LDrii));
+ MI.getOperand(0).ChangeToRegister(TmpReg, true);
+ BuildMI(*MI.getParent(), std::next(II), DL, get(VE::LVMir_m), DestHiReg)
+ .addImm(3)
+ .addReg(TmpReg, getKillRegState(true))
+ .addReg(DestHiReg);
+ replaceFI(MI, FrameReg, Offset, FIOperandNum);
+}
+
void EliminateFrameIndex::processMI(MachineInstr &MI, Register FrameReg,
int64_t Offset, int FIOperandNum) {
switch (MI.getOpcode()) {
@@ -280,6 +469,18 @@ void EliminateFrameIndex::processMI(MachineInstr &MI, Register FrameReg,
case VE::LDQrii:
processLDQ(MI, FrameReg, Offset, FIOperandNum);
return;
+ case VE::STVMrii:
+ processSTVM(MI, FrameReg, Offset, FIOperandNum);
+ return;
+ case VE::LDVMrii:
+ processLDVM(MI, FrameReg, Offset, FIOperandNum);
+ return;
+ case VE::STVM512rii:
+ processSTVM512(MI, FrameReg, Offset, FIOperandNum);
+ return;
+ case VE::LDVM512rii:
+ processLDVM512(MI, FrameReg, Offset, FIOperandNum);
+ return;
}
prepareReplaceFI(MI, FrameReg, Offset);
replaceFI(MI, FrameReg, Offset, FIOperandNum);
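(As an aside on the spill layout implemented above: a 512-bit mask pair is written out as eight 64-bit words at consecutive 8-byte offsets from the frame slot, and reloaded the same way, which is why prepareReplaceFI is told the extra range of 56 bytes. The standalone C++ sketch below models only that offset arithmetic; the Store and spillVM512 names are hypothetical and are not part of the VE backend.)

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical model of the spill layout used by processSTVM512 above:
// a 512-bit mask pair is treated as eight 64-bit words, each stored at
// BaseOffset, BaseOffset + 8, ..., BaseOffset + 56 from the frame slot.
struct Store {
  int64_t Offset; // byte offset from the frame register
  uint64_t Word;  // the 64-bit chunk written at that offset
};

static std::vector<Store> spillVM512(const uint64_t (&Words)[8],
                                     int64_t BaseOffset) {
  std::vector<Store> Stores;
  for (int I = 0; I < 8; ++I) {
    Stores.push_back({BaseOffset, Words[I]});
    BaseOffset += 8; // matches the "Offset += 8" stepping in the code above
  }
  return Stores;
}

int main() {
  uint64_t Words[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  for (const Store &S : spillVM512(Words, 0))
    std::printf("store word %llu at frame+%lld\n",
                (unsigned long long)S.Word, (long long)S.Offset);
  return 0;
}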
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 9316826e3d92..d7720604d6dc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -40,7 +40,7 @@ WebAssemblyInstrInfo::WebAssemblyInstrInfo(const WebAssemblySubtarget &STI)
RI(STI.getTargetTriple()) {}
bool WebAssemblyInstrInfo::isReallyTriviallyReMaterializable(
- const MachineInstr &MI, AAResults *AA) const {
+ const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case WebAssembly::CONST_I32:
case WebAssembly::CONST_I64:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
index f45a3792467a..29d700bdf83f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h
@@ -43,8 +43,7 @@ public:
const WebAssemblyRegisterInfo &getRegisterInfo() const { return RI; }
- bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index d3ad47147ac8..f9ef45bfb41c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -49,7 +49,6 @@ class WebAssemblyRegStackify final : public MachineFunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<MachineDominatorTree>();
AU.addRequired<LiveIntervals>();
AU.addPreserved<MachineBlockFrequencyInfo>();
@@ -164,15 +163,15 @@ static void queryCallee(const MachineInstr &MI, bool &Read, bool &Write,
// Determine whether MI reads memory, writes memory, has side effects,
// and/or uses the stack pointer value.
-static void query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
- bool &Write, bool &Effects, bool &StackPointer) {
+static void query(const MachineInstr &MI, bool &Read, bool &Write,
+ bool &Effects, bool &StackPointer) {
assert(!MI.isTerminator());
if (MI.isDebugInstr() || MI.isPosition())
return;
// Check for loads.
- if (MI.mayLoad() && !MI.isDereferenceableInvariantLoad(&AA))
+ if (MI.mayLoad() && !MI.isDereferenceableInvariantLoad())
Read = true;
// Check for stores.
@@ -255,9 +254,9 @@ static void query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
}
// Test whether Def is safe and profitable to rematerialize.
-static bool shouldRematerialize(const MachineInstr &Def, AliasAnalysis &AA,
+static bool shouldRematerialize(const MachineInstr &Def,
const WebAssemblyInstrInfo *TII) {
- return Def.isAsCheapAsAMove() && TII->isTriviallyReMaterializable(Def, &AA);
+ return Def.isAsCheapAsAMove() && TII->isTriviallyReMaterializable(Def);
}
// Identify the definition for this register at this point. This is a
@@ -311,7 +310,7 @@ static bool hasOneUse(unsigned Reg, MachineInstr *Def, MachineRegisterInfo &MRI,
// TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
// more precise.
static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
- const MachineInstr *Insert, AliasAnalysis &AA,
+ const MachineInstr *Insert,
const WebAssemblyFunctionInfo &MFI,
const MachineRegisterInfo &MRI) {
const MachineInstr *DefI = Def->getParent();
@@ -391,7 +390,7 @@ static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
}
bool Read = false, Write = false, Effects = false, StackPointer = false;
- query(*DefI, AA, Read, Write, Effects, StackPointer);
+ query(*DefI, Read, Write, Effects, StackPointer);
// If the instruction does not access memory and has no side effects, it has
// no additional dependencies.
@@ -406,7 +405,7 @@ static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
bool InterveningWrite = false;
bool InterveningEffects = false;
bool InterveningStackPointer = false;
- query(*I, AA, InterveningRead, InterveningWrite, InterveningEffects,
+ query(*I, InterveningRead, InterveningWrite, InterveningEffects,
InterveningStackPointer);
if (Effects && InterveningEffects)
return false;
@@ -808,7 +807,6 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
auto &MDT = getAnalysis<MachineDominatorTree>();
auto &LIS = getAnalysis<LiveIntervals>();
@@ -872,8 +870,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// supports intra-block moves) and it's MachineSink's job to catch all
// the sinking opportunities anyway.
bool SameBlock = DefI->getParent() == &MBB;
- bool CanMove = SameBlock &&
- isSafeToMove(Def, &Use, Insert, AA, MFI, MRI) &&
+ bool CanMove = SameBlock && isSafeToMove(Def, &Use, Insert, MFI, MRI) &&
!TreeWalker.isOnStack(Reg);
if (CanMove && hasOneUse(Reg, DefI, MRI, MDT, LIS)) {
Insert = moveForSingleUse(Reg, Use, DefI, MBB, Insert, LIS, MFI, MRI);
@@ -883,7 +880,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// TODO: Encode this properly as a stackified value.
if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg)
MFI.clearFrameBaseVreg();
- } else if (shouldRematerialize(*DefI, AA, TII)) {
+ } else if (shouldRematerialize(*DefI, TII)) {
Insert =
rematerializeCheapDef(Reg, Use, *DefI, MBB, Insert->getIterator(),
LIS, MFI, MRI, TII, TRI);
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index a859176220c7..fa0a6bd415dc 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1277,7 +1277,7 @@ class ProcModel<string Name, SchedMachineModel Model,
// enabled. It has no effect on code generation.
// NOTE: As a default tuning, "generic" aims to produce code optimized for the
// most common X86 processors. The tunings might be changed over time. It is
-// recommended to use "x86-64" in lit tests for consistency.
+// recommended to use the "tune-cpu"="x86-64" function attribute for consistency.
def : ProcModel<"generic", SandyBridgeModel,
[FeatureX87, FeatureCX8, FeatureX86_64],
[TuningSlow3OpsLEA,
diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index 16bff201dd03..db6923416177 100644
--- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -393,12 +393,12 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
switch (MI->getOpcode()) {
case X86::MOV8rm:
- // Only replace 8 bit loads with the zero extending versions if
- // in an inner most loop and not optimizing for size. This takes
- // an extra byte to encode, and provides limited performance upside.
- if (MachineLoop *ML = MLI->getLoopFor(&MBB))
- if (ML->begin() == ML->end() && !OptForSize)
- return tryReplaceLoad(X86::MOVZX32rm8, MI);
+    // Replace 8-bit loads with the zero-extending version when not optimizing
+    // for size. The extending op is cheaper across a wide range of
+    // microarchitectures and avoids a potentially expensive partial-register
+    // stall. It takes an extra byte to encode, however, so don't do this when
+    // optimizing for size.
+ if (!OptForSize)
+ return tryReplaceLoad(X86::MOVZX32rm8, MI);
break;
case X86::MOV16rm:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 12af6087cb47..5a4533c4bac4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -555,6 +555,39 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
+ auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
+ setOperationAction(ISD::FABS, VT, Action);
+ setOperationAction(ISD::FNEG, VT, Action);
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::FREM, VT, Action);
+ setOperationAction(ISD::FMA, VT, Action);
+ setOperationAction(ISD::FMINNUM, VT, Action);
+ setOperationAction(ISD::FMAXNUM, VT, Action);
+ setOperationAction(ISD::FMINIMUM, VT, Action);
+ setOperationAction(ISD::FMAXIMUM, VT, Action);
+ setOperationAction(ISD::FSIN, VT, Action);
+ setOperationAction(ISD::FCOS, VT, Action);
+ setOperationAction(ISD::FSINCOS, VT, Action);
+ setOperationAction(ISD::FSQRT, VT, Action);
+ setOperationAction(ISD::FPOW, VT, Action);
+ setOperationAction(ISD::FLOG, VT, Action);
+ setOperationAction(ISD::FLOG2, VT, Action);
+ setOperationAction(ISD::FLOG10, VT, Action);
+ setOperationAction(ISD::FEXP, VT, Action);
+ setOperationAction(ISD::FEXP2, VT, Action);
+ setOperationAction(ISD::FCEIL, VT, Action);
+ setOperationAction(ISD::FFLOOR, VT, Action);
+ setOperationAction(ISD::FNEARBYINT, VT, Action);
+ setOperationAction(ISD::FRINT, VT, Action);
+ setOperationAction(ISD::BR_CC, VT, Action);
+ setOperationAction(ISD::SETCC, VT, Action);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SELECT_CC, VT, Action);
+ setOperationAction(ISD::FROUND, VT, Action);
+ setOperationAction(ISD::FROUNDEVEN, VT, Action);
+ setOperationAction(ISD::FTRUNC, VT, Action);
+ };
+
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
// f16, f32 and f64 use SSE.
// Set up the FP register classes.
@@ -592,40 +625,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Half type will be promoted by default.
- setOperationAction(ISD::FABS, MVT::f16, Promote);
- setOperationAction(ISD::FNEG, MVT::f16, Promote);
- setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+ setF16Action(MVT::f16, Promote);
setOperationAction(ISD::FADD, MVT::f16, Promote);
setOperationAction(ISD::FSUB, MVT::f16, Promote);
setOperationAction(ISD::FMUL, MVT::f16, Promote);
setOperationAction(ISD::FDIV, MVT::f16, Promote);
- setOperationAction(ISD::FREM, MVT::f16, Promote);
- setOperationAction(ISD::FMA, MVT::f16, Promote);
- setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
- setOperationAction(ISD::FMINIMUM, MVT::f16, Promote);
- setOperationAction(ISD::FMAXIMUM, MVT::f16, Promote);
- setOperationAction(ISD::FSIN, MVT::f16, Promote);
- setOperationAction(ISD::FCOS, MVT::f16, Promote);
- setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
- setOperationAction(ISD::FSQRT, MVT::f16, Promote);
- setOperationAction(ISD::FPOW, MVT::f16, Promote);
- setOperationAction(ISD::FLOG, MVT::f16, Promote);
- setOperationAction(ISD::FLOG2, MVT::f16, Promote);
- setOperationAction(ISD::FLOG10, MVT::f16, Promote);
- setOperationAction(ISD::FEXP, MVT::f16, Promote);
- setOperationAction(ISD::FEXP2, MVT::f16, Promote);
- setOperationAction(ISD::FCEIL, MVT::f16, Promote);
- setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
- setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
- setOperationAction(ISD::FRINT, MVT::f16, Promote);
- setOperationAction(ISD::BR_CC, MVT::f16, Promote);
- setOperationAction(ISD::SETCC, MVT::f16, Promote);
- setOperationAction(ISD::SELECT, MVT::f16, Custom);
- setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
- setOperationAction(ISD::FROUND, MVT::f16, Promote);
- setOperationAction(ISD::FROUNDEVEN, MVT::f16, Promote);
- setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::FP_ROUND, MVT::f16, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f32, LibCall);
setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
@@ -1003,6 +1007,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
: &X86::VR128RegClass);
addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
+ addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
+ : &X86::VR128RegClass);
addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
: &X86::VR128RegClass);
addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
@@ -1084,7 +1090,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
- for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
+ for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
@@ -1095,19 +1101,25 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
}
+ setF16Action(MVT::v8f16, Expand);
+ setOperationAction(ISD::FADD, MVT::v8f16, Expand);
+ setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
+ setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
+ setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
// Custom lower v2i64 and v2f64 selects.
setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
@@ -1118,8 +1130,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
}
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
@@ -1304,6 +1316,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
: &X86::VR256RegClass);
addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
+ addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
+ : &X86::VR256RegClass);
addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
: &X86::VR256RegClass);
addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
@@ -1340,12 +1354,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
@@ -1356,7 +1372,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
@@ -1386,6 +1401,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
+ setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
@@ -1507,7 +1523,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Custom lower several nodes for 256-bit types.
for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
- MVT::v8f32, MVT::v4f64 }) {
+ MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::VSELECT, VT, Custom);
@@ -1518,6 +1534,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
}
+ setF16Action(MVT::v16f16, Expand);
+ setOperationAction(ISD::FADD, MVT::v16f16, Expand);
+ setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
+ setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
+ setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
if (HasInt256) {
setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
@@ -1532,11 +1553,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
- if (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) {
- setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
+ if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
+ Subtarget.hasF16C()) {
+ for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
+ setOperationAction(ISD::FP_ROUND, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
+ }
+ for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
+ setOperationAction(ISD::FP_EXTEND, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
+ }
+ for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
+ setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
+ setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
+ }
+
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
}
// This block controls legalization of the mask vector sizes that are
@@ -1619,6 +1652,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
@@ -1645,14 +1679,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
}
- setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
@@ -1664,7 +1700,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
- setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
@@ -1799,15 +1834,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
if (Subtarget.hasDQI()) {
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
-
+ for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
+ ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
+ ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
+ setOperationAction(Opc, MVT::v8i64, Custom);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
@@ -1831,7 +1861,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
- MVT::v16f32, MVT::v8f64 }) {
+ MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
@@ -1842,6 +1872,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
}
+ setF16Action(MVT::v32f16, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
+ for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
+ setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
+ setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
+ }
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::MLOAD, VT, Legal);
@@ -1881,23 +1920,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
- Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
- Subtarget.hasVLX() ? Legal : Custom);
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
@@ -1934,25 +1959,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MSCATTER, VT, Custom);
if (Subtarget.hasDQI()) {
- for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
- setOperationAction(ISD::SINT_TO_FP, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::UINT_TO_FP, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::FP_TO_SINT, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::FP_TO_UINT, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
- Subtarget.hasVLX() ? Legal : Custom);
- setOperationAction(ISD::MUL, VT, Legal);
+ for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
+ ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
+ ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
+ setOperationAction(Opc, MVT::v2i64, Custom);
+ setOperationAction(Opc, MVT::v4i64, Custom);
}
+ setOperationAction(ISD::MUL, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i64, Legal);
}
if (Subtarget.hasCDI()) {
@@ -2052,7 +2066,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// AVX512_FP16 scalar operations
setGroup(MVT::f16);
- addRegisterClass(MVT::f16, &X86::FR16XRegClass);
setOperationAction(ISD::FREM, MVT::f16, Promote);
setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
@@ -2066,6 +2079,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
+ setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
@@ -2073,14 +2087,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.useAVX512Regs()) {
setGroup(MVT::v32f16);
- addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
@@ -2112,8 +2129,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
if (Subtarget.hasVLX()) {
- addRegisterClass(MVT::v8f16, &X86::VR128XRegClass);
- addRegisterClass(MVT::v16f16, &X86::VR256XRegClass);
setGroup(MVT::v8f16);
setGroup(MVT::v16f16);
@@ -2132,8 +2147,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
// INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
@@ -2347,7 +2366,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
ISD::FP16_TO_FP,
ISD::FP_EXTEND,
ISD::STRICT_FP_EXTEND,
- ISD::FP_ROUND});
+ ISD::FP_ROUND,
+ ISD::STRICT_FP_ROUND});
computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -2404,6 +2424,10 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
return TypeSplitVector;
if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
+ !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
+ return TypeSplitVector;
+
+ if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
VT.getVectorElementType() != MVT::i1)
return TypeWidenVector;
@@ -2447,22 +2471,21 @@ handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512()) {
- unsigned NumElts = VT.getVectorNumElements();
+ if (VT.isVector()) {
+ if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
- MVT RegisterVT;
- unsigned NumRegisters;
- std::tie(RegisterVT, NumRegisters) =
- handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
- if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
- return RegisterVT;
- }
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return RegisterVT;
+ }
- // v3f16 will be widen to v4f16. But we don't assign register class for v4f16.
- // So its default register type is f16. We override the type to v8f16 here.
- if (VT == MVT::v3f16 && Subtarget.hasFP16())
- return MVT::v8f16;
+ if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
+ return MVT::v8f16;
+ }
// We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
@@ -2475,22 +2498,21 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512()) {
- unsigned NumElts = VT.getVectorNumElements();
+ if (VT.isVector()) {
+ if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
- MVT RegisterVT;
- unsigned NumRegisters;
- std::tie(RegisterVT, NumRegisters) =
- handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
- if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
- return NumRegisters;
- }
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return NumRegisters;
+ }
- // v3f16 will be widen to v4f16. But we don't assign register class for v4f16.
- // So its default register number is 3. We override the number to 1 here.
- if (VT == MVT::v3f16 && Subtarget.hasFP16())
- return 1;
+ if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
+ return 1;
+ }
// We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
// x87 is disabled.
@@ -9646,13 +9668,13 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
EVT CVT = Ld.getValueType();
assert(!CVT.isVector() && "Must not broadcast a vector type");
- // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
+ // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
// For size optimization, also splat v2f64 and v2i64, and for size opt
// with AVX2, also splat i8 and i16.
// With pattern matching, the VBROADCAST node may become a VMOVDDUP.
if (ScalarSize == 32 ||
(ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
- (ScalarSize == 16 && Subtarget.hasFP16() && CVT.isFloatingPoint()) ||
+ CVT == MVT::f16 ||
(OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
const Constant *C = nullptr;
if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
@@ -14129,6 +14151,16 @@ static bool isShuffleFoldableLoad(SDValue V) {
ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
}
+template<typename T>
+static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
+ return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
+}
+
+template<typename T>
+bool X86TargetLowering::isSoftFP16(T VT) const {
+ return ::isSoftFP16(VT, Subtarget);
+}
+
/// Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern that we have especially efficient patterns to lower
@@ -14140,6 +14172,9 @@ static SDValue lowerShuffleAsElementInsertion(
MVT ExtVT = VT;
MVT EltVT = VT.getVectorElementType();
+ if (isSoftFP16(EltVT, Subtarget))
+ return SDValue();
+
int V2Index =
find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
Mask.begin();
@@ -19444,6 +19479,15 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+ if (isSoftFP16(VT)) {
+ MVT NVT = VT.changeVectorElementTypeToInteger();
+ return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
+ DAG.getBitcast(NVT, LHS),
+ DAG.getBitcast(NVT, RHS)));
+ }
+
// A vselect where all conditions and data are constants can be optimized into
// a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
@@ -19467,8 +19511,6 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget.hasSSE41())
return SDValue();
- SDLoc dl(Op);
- MVT VT = Op.getSimpleValueType();
unsigned EltSize = VT.getScalarSizeInBits();
unsigned NumElts = VT.getVectorNumElements();
@@ -20856,16 +20898,6 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
return Cvt;
}
-template<typename T>
-static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
- return VT == MVT::f16 && !Subtarget.hasFP16();
-}
-
-template<typename T>
-bool X86TargetLowering::isSoftFP16(T VT) const {
- return ::isSoftFP16(VT, Subtarget);
-}
-
static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
bool IsStrict = Op->isStrictFPOpcode();
SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
@@ -20885,6 +20917,26 @@ static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
}
+static bool isLegalConversion(MVT VT, bool IsSigned,
+ const X86Subtarget &Subtarget) {
+ if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
+ return true;
+ if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
+ return true;
+ if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
+ return true;
+ if (Subtarget.useAVX512Regs()) {
+ if (VT == MVT::v16i32)
+ return true;
+ if (VT == MVT::v8i64 && Subtarget.hasDQI())
+ return true;
+ }
+ if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
+ (VT == MVT::v2i64 || VT == MVT::v4i64))
+ return true;
+ return false;
+}
+
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
@@ -20897,6 +20949,8 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
if (isSoftFP16(VT))
return promoteXINT_TO_FP(Op, DAG);
+ else if (isLegalConversion(SrcVT, true, Subtarget))
+ return Op;
if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
return LowerWin64_INT128_TO_FP(Op, DAG);
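(For reference, the feature gating encoded by isLegalConversion above can be summarized as a small predicate over subtarget flags. The sketch below is a standalone model for illustration only; the Features struct, the VT enum, and the checks in main() are hypothetical stand-ins that merely mirror the conditions in the function added in this hunk.)

#include <cassert>

// Hypothetical stand-in for the X86 subtarget feature queries used above.
struct Features {
  bool SSE2, AVX, VLX, AVX512Regs, DQI;
};

enum class VT { v4i32, v8i32, v16i32, v2i64, v4i64, v8i64, Other };

// Mirrors isLegalConversion(): which int<->fp conversions can be returned
// unchanged instead of falling through to the custom lowering paths.
static bool isLegalConversion(VT T, bool IsSigned, const Features &F) {
  if (T == VT::v4i32 && F.SSE2 && IsSigned)
    return true;
  if (T == VT::v8i32 && F.AVX && IsSigned)
    return true;
  if (F.VLX && (T == VT::v4i32 || T == VT::v8i32))
    return true;
  if (F.AVX512Regs) {
    if (T == VT::v16i32)
      return true;
    if (T == VT::v8i64 && F.DQI)
      return true;
  }
  if (F.DQI && F.VLX && (T == VT::v2i64 || T == VT::v4i64))
    return true;
  return false;
}

int main() {
  Features SSE2Only{true, false, false, false, false};
  assert(isLegalConversion(VT::v4i32, /*IsSigned=*/true, SSE2Only));
  assert(!isLegalConversion(VT::v4i32, /*IsSigned=*/false, SSE2Only));

  Features Avx512WithDQI{true, true, true, true, true};
  assert(isLegalConversion(VT::v8i64, /*IsSigned=*/false, Avx512WithDQI));
  return 0;
}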
@@ -21400,6 +21454,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (isSoftFP16(DstVT))
return promoteXINT_TO_FP(Op, DAG);
+ else if (isLegalConversion(SrcVT, false, Subtarget))
+ return Op;
if (DstVT.isVector())
return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
@@ -22229,6 +22285,8 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
{NVT, MVT::Other}, {Chain, Src})});
return DAG.getNode(Op.getOpcode(), dl, VT,
DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
+ } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
+ return Op;
}
if (VT.isVector()) {
@@ -22826,7 +22884,7 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
return Op;
if (SVT.getVectorElementType() == MVT::f16) {
- assert(Subtarget.hasFP16() && Subtarget.hasVLX() && "Unexpected features!");
+ assert(Subtarget.hasF16C() && "Unexpected features!");
if (SVT == MVT::v2f16)
In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
DAG.getUNDEF(MVT::v2f16));
@@ -22836,6 +22894,8 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
{Op->getOperand(0), Res});
return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
+ } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
+ return Op;
}
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
@@ -22854,34 +22914,19 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
SDValue In = Op.getOperand(IsStrict ? 1 : 0);
- SDValue Op2 = Op.getOperand(IsStrict ? 2 : 1);
MVT VT = Op.getSimpleValueType();
MVT SVT = In.getSimpleValueType();
if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
return SDValue();
- if (VT == MVT::f16) {
- if (Subtarget.hasFP16())
- return Op;
-
- if (SVT != MVT::f32) {
- if (IsStrict)
- return DAG.getNode(
- ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
- {Chain,
- DAG.getNode(ISD::STRICT_FP_ROUND, DL, {MVT::f32, MVT::Other},
- {Chain, In, Op2}),
- Op2});
-
- return DAG.getNode(ISD::FP_ROUND, DL, VT,
- DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, In, Op2),
- Op2);
- }
-
- if (!Subtarget.hasF16C())
+ if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
+ if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
return SDValue();
+ if (VT.isVector())
+ return Op;
+
SDValue Res;
SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
MVT::i32);
@@ -24176,10 +24221,10 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
if (isFP) {
-#ifndef NDEBUG
MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
-#endif
+ if (isSoftFP16(EltVT, Subtarget))
+ return SDValue();
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
@@ -24741,6 +24786,9 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC =
cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
+ if (isSoftFP16(Op0.getValueType()))
+ return SDValue();
+
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets handled by emitFlagsForSetcc.
if (Op0.getValueType() == MVT::f128) {
@@ -24931,10 +24979,12 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op1.getSimpleValueType();
SDValue CC;
- if (isSoftFP16(VT))
- return DAG.getBitcast(MVT::f16, DAG.getNode(ISD::SELECT, DL, MVT::i16, Cond,
- DAG.getBitcast(MVT::i16, Op1),
- DAG.getBitcast(MVT::i16, Op2)));
+ if (isSoftFP16(VT)) {
+ MVT NVT = VT.changeTypeToInteger();
+ return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
+ DAG.getBitcast(NVT, Op1),
+ DAG.getBitcast(NVT, Op2)));
+ }
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
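(The soft-FP16 path above lowers SELECT on half values by reinterpreting them as same-sized integers, selecting those, and bitcasting back, so no FP16 arithmetic is required. A minimal standalone analogue, using uint16_t to stand in for the f16 bit pattern, is sketched below; it illustrates the bitcast trick only, not the SelectionDAG API.)

#include <cstdint>
#include <cstdio>

// Stand-in for an IEEE half value: only its 16-bit pattern is ever touched.
using HalfBits = uint16_t;

// Select between two half values without any f16 operations: the select is
// performed on the raw 16-bit patterns, mirroring the bitcast-to-integer
// lowering used for soft FP16 above.
static HalfBits selectHalf(bool Cond, HalfBits A, HalfBits B) {
  return Cond ? A : B;
}

int main() {
  HalfBits One = 0x3C00; // 1.0 in IEEE half
  HalfBits Two = 0x4000; // 2.0 in IEEE half
  std::printf("selected bits: 0x%04X\n", selectHalf(true, One, Two));
  return 0;
}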
@@ -27268,27 +27318,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
}
- case Intrinsic::swift_async_context_addr: {
- auto &MF = DAG.getMachineFunction();
- auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
- if (Subtarget.is64Bit()) {
- MF.getFrameInfo().setFrameAddressIsTaken(true);
- X86FI->setHasSwiftAsyncContext(true);
- return SDValue(
- DAG.getMachineNode(
- X86::SUB64ri8, dl, MVT::i64,
- DAG.getCopyFromReg(DAG.getEntryNode(), dl, X86::RBP, MVT::i64),
- DAG.getTargetConstant(8, dl, MVT::i32)),
- 0);
- } else {
- // 32-bit so no special extended frame, create or reuse an existing stack
- // slot.
- if (!X86FI->getSwiftAsyncContextFrameIdx())
- X86FI->setSwiftAsyncContextFrameIdx(
- MF.getFrameInfo().CreateStackObject(4, Align(4), false));
- return DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
- }
- }
case Intrinsic::x86_avx512_vp2intersect_q_512:
case Intrinsic::x86_avx512_vp2intersect_q_256:
case Intrinsic::x86_avx512_vp2intersect_q_128:
@@ -27668,6 +27697,37 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
switch (IntNo) {
+
+ case Intrinsic::swift_async_context_addr: {
+ SDLoc dl(Op);
+ auto &MF = DAG.getMachineFunction();
+ auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
+ if (Subtarget.is64Bit()) {
+ MF.getFrameInfo().setFrameAddressIsTaken(true);
+ X86FI->setHasSwiftAsyncContext(true);
+ SDValue Chain = Op->getOperand(0);
+ SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
+ SDValue Result =
+ SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
+ DAG.getTargetConstant(8, dl, MVT::i32)),
+ 0);
+ // Return { result, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
+ CopyRBP.getValue(1));
+ } else {
+ // 32-bit so no special extended frame, create or reuse an existing
+ // stack slot.
+ if (!X86FI->getSwiftAsyncContextFrameIdx())
+ X86FI->setSwiftAsyncContextFrameIdx(
+ MF.getFrameInfo().CreateStackObject(4, Align(4), false));
+ SDValue Result =
+ DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
+ // Return { result, chain }.
+ return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
+ Op->getOperand(0));
+ }
+ }
+
case llvm::Intrinsic::x86_seh_ehregnode:
return MarkEHRegistrationNode(Op, DAG);
case llvm::Intrinsic::x86_seh_ehguard:
@@ -32901,20 +32961,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: {
bool IsStrict = N->isStrictFPOpcode();
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
+ EVT SrcVT = Src.getValueType();
EVT VT = N->getValueType(0);
- EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
+ SDValue V;
if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
: DAG.getUNDEF(MVT::v2f32);
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
}
+ if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
+ assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
+ if (SrcVT.getVectorElementType() != MVT::f32)
+ return;
+
+ if (IsStrict)
+ V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
+ {Chain, Src, Rnd});
+ else
+ V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
+
+ Results.push_back(DAG.getBitcast(MVT::v8f16, V));
+ if (IsStrict)
+ Results.push_back(V.getValue(1));
+ return;
+ }
if (!isTypeLegal(Src.getValueType()))
return;
- SDValue V;
+ EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
if (IsStrict)
V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
- {N->getOperand(0), Src});
+ {Chain, Src});
else
V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
Results.push_back(V);
@@ -37342,6 +37421,7 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
bool IsUnary) {
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
+ unsigned SizeInBits = MaskVT.getSizeInBits();
if (MaskVT.is128BitVector()) {
if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
@@ -37409,7 +37489,10 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
// Attempt to match against a OR if we're performing a blend shuffle and the
// non-blended source element is zero in each case.
- if ((EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
+  // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
+ if (SizeInBits == V1.getValueSizeInBits() &&
+ SizeInBits == V2.getValueSizeInBits() &&
+ (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
(EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
bool IsBlend = true;
unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
@@ -39652,11 +39735,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
- // FIXME: Remove this after we support vector FP16
- if (isSoftFP16(peekThroughBitcasts(N.getOperand(0)).getSimpleValueType(),
- Subtarget))
- return SDValue();
-
if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
return R;
@@ -40947,12 +41025,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
EltBits)) {
OpBits.clearAllBits();
OpElts.clearAllBits();
- for (int I = 0; I != NumElts; ++I)
- if (DemandedElts[I] && ((Invert && !EltBits[I].isAllOnes()) ||
- (!Invert && !EltBits[I].isZero()))) {
+ for (int I = 0; I != NumElts; ++I) {
+ if (!DemandedElts[I])
+ continue;
+ if (UndefElts[I]) {
+ // We can't assume an undef src element gives an undef dst - the
+ // other src might be zero.
+ OpBits.setAllBits();
+ OpElts.setBit(I);
+ } else if ((Invert && !EltBits[I].isAllOnes()) ||
+ (!Invert && !EltBits[I].isZero())) {
OpBits |= Invert ? ~EltBits[I] : EltBits[I];
OpElts.setBit(I);
}
+ }
}
return std::make_pair(OpBits, OpElts);
};
@@ -44715,7 +44801,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
// Early exit check
- if (!TLI.isTypeLegal(VT))
+ if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
return SDValue();
if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
@@ -47798,11 +47884,17 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
EltBits)) {
DemandedBits.clearAllBits();
DemandedElts.clearAllBits();
- for (int I = 0; I != NumElts; ++I)
- if (!EltBits[I].isZero()) {
+ for (int I = 0; I != NumElts; ++I) {
+ if (UndefElts[I]) {
+ // We can't assume an undef src element gives an undef dst - the
+ // other src might be zero.
+ DemandedBits.setAllBits();
+ DemandedElts.setBit(I);
+ } else if (!EltBits[I].isZero()) {
DemandedBits |= EltBits[I];
DemandedElts.setBit(I);
}
+ }
}
return std::make_pair(DemandedBits, DemandedElts);
};
@@ -51042,6 +51134,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
MVT VT = N->getSimpleValueType(0);
+ int NumElts = VT.getVectorNumElements();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
// ANDNP(undef, x) -> 0
// ANDNP(x, undef) -> 0
@@ -51060,6 +51154,19 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
if (SDValue Not = IsNOT(N0, DAG))
return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
+ // Constant Folding
+ APInt Undefs0, Undefs1;
+ SmallVector<APInt> EltBits0, EltBits1;
+ if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0) &&
+ getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
+ SDLoc DL(N);
+ SmallVector<APInt> ResultBits;
+ for (int I = 0; I != NumElts; ++I)
+ ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
+ APInt ResultUndefs = APInt::getZero(NumElts);
+ return getConstVector(ResultBits, ResultUndefs, VT, DAG, DL);
+ }
+
// TODO: Constant fold NOT(N0) to allow us to use AND.
// TODO: Do this in IsNOT with suitable oneuse checks?
@@ -51074,20 +51181,24 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
APInt UndefElts;
SmallVector<APInt> EltBits;
- int NumElts = VT.getVectorNumElements();
- int EltSizeInBits = VT.getScalarSizeInBits();
APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
APInt DemandedElts = APInt::getAllOnes(NumElts);
if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
EltBits)) {
DemandedBits.clearAllBits();
DemandedElts.clearAllBits();
- for (int I = 0; I != NumElts; ++I)
- if ((Invert && !EltBits[I].isAllOnes()) ||
- (!Invert && !EltBits[I].isZero())) {
+ for (int I = 0; I != NumElts; ++I) {
+ if (UndefElts[I]) {
+ // We can't assume an undef src element gives an undef dst - the
+ // other src might be zero.
+ DemandedBits.setAllBits();
+ DemandedElts.setBit(I);
+ } else if ((Invert && !EltBits[I].isAllOnes()) ||
+ (!Invert && !EltBits[I].isZero())) {
DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
DemandedElts.setBit(I);
}
+ }
}
return std::make_pair(DemandedBits, DemandedElts);
};
@@ -54714,8 +54825,9 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasFP16())
return SDValue();
+ bool IsStrict = N->isStrictFPOpcode();
EVT VT = N->getValueType(0);
- SDValue Src = N->getOperand(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
@@ -54736,8 +54848,15 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
// Destination is v8i16 with at least 8 elements.
EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
std::max(8U, NumElts));
- SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
- DAG.getTargetConstant(4, dl, MVT::i32));
+ SDValue Cvt, Chain;
+ SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
+ if (IsStrict) {
+ Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
+ {N->getOperand(0), Src, Rnd});
+ Chain = Cvt.getValue(1);
+ } else {
+ Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
+ }
// Extract down to real number of elements.
if (NumElts < 8) {
@@ -54746,7 +54865,12 @@ static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, dl));
}
- return DAG.getBitcast(VT, Cvt);
+ Cvt = DAG.getBitcast(VT, Cvt);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Cvt, Chain}, dl);
+
+ return Cvt;
}
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
@@ -54954,6 +55078,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
case ISD::STRICT_FP_EXTEND:
case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
+ case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
case X86ISD::VBROADCAST_LOAD:
case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 48da7b3ac882..c105bde78ad1 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3769,12 +3769,16 @@ let Predicates = [HasAVX512] in {
(VMOVDQA64Zrm addr:$src)>;
def : Pat<(alignedloadv32i16 addr:$src),
(VMOVDQA64Zrm addr:$src)>;
+ def : Pat<(alignedloadv32f16 addr:$src),
+ (VMOVAPSZrm addr:$src)>;
def : Pat<(alignedloadv64i8 addr:$src),
(VMOVDQA64Zrm addr:$src)>;
def : Pat<(loadv16i32 addr:$src),
(VMOVDQU64Zrm addr:$src)>;
def : Pat<(loadv32i16 addr:$src),
(VMOVDQU64Zrm addr:$src)>;
+ def : Pat<(loadv32f16 addr:$src),
+ (VMOVUPSZrm addr:$src)>;
def : Pat<(loadv64i8 addr:$src),
(VMOVDQU64Zrm addr:$src)>;
@@ -3783,12 +3787,16 @@ let Predicates = [HasAVX512] in {
(VMOVDQA64Zmr addr:$dst, VR512:$src)>;
def : Pat<(alignedstore (v32i16 VR512:$src), addr:$dst),
(VMOVDQA64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst),
+ (VMOVAPSZmr addr:$dst, VR512:$src)>;
def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst),
(VMOVDQA64Zmr addr:$dst, VR512:$src)>;
def : Pat<(store (v16i32 VR512:$src), addr:$dst),
(VMOVDQU64Zmr addr:$dst, VR512:$src)>;
def : Pat<(store (v32i16 VR512:$src), addr:$dst),
(VMOVDQU64Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(store (v32f16 VR512:$src), addr:$dst),
+ (VMOVUPSZmr addr:$dst, VR512:$src)>;
def : Pat<(store (v64i8 VR512:$src), addr:$dst),
(VMOVDQU64Zmr addr:$dst, VR512:$src)>;
}
@@ -3799,12 +3807,16 @@ let Predicates = [HasVLX] in {
(VMOVDQA64Z128rm addr:$src)>;
def : Pat<(alignedloadv8i16 addr:$src),
(VMOVDQA64Z128rm addr:$src)>;
+ def : Pat<(alignedloadv8f16 addr:$src),
+ (VMOVAPSZ128rm addr:$src)>;
def : Pat<(alignedloadv16i8 addr:$src),
(VMOVDQA64Z128rm addr:$src)>;
def : Pat<(loadv4i32 addr:$src),
(VMOVDQU64Z128rm addr:$src)>;
def : Pat<(loadv8i16 addr:$src),
(VMOVDQU64Z128rm addr:$src)>;
+ def : Pat<(loadv8f16 addr:$src),
+ (VMOVUPSZ128rm addr:$src)>;
def : Pat<(loadv16i8 addr:$src),
(VMOVDQU64Z128rm addr:$src)>;
@@ -3813,12 +3825,16 @@ let Predicates = [HasVLX] in {
(VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
(VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst),
+ (VMOVAPSZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
(VMOVDQA64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v4i32 VR128X:$src), addr:$dst),
(VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v8i16 VR128X:$src), addr:$dst),
(VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
+ def : Pat<(store (v8f16 VR128X:$src), addr:$dst),
+ (VMOVUPSZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
(VMOVDQU64Z128mr addr:$dst, VR128X:$src)>;
@@ -3827,12 +3843,16 @@ let Predicates = [HasVLX] in {
(VMOVDQA64Z256rm addr:$src)>;
def : Pat<(alignedloadv16i16 addr:$src),
(VMOVDQA64Z256rm addr:$src)>;
+ def : Pat<(alignedloadv16f16 addr:$src),
+ (VMOVAPSZ256rm addr:$src)>;
def : Pat<(alignedloadv32i8 addr:$src),
(VMOVDQA64Z256rm addr:$src)>;
def : Pat<(loadv8i32 addr:$src),
(VMOVDQU64Z256rm addr:$src)>;
def : Pat<(loadv16i16 addr:$src),
(VMOVDQU64Z256rm addr:$src)>;
+ def : Pat<(loadv16f16 addr:$src),
+ (VMOVUPSZ256rm addr:$src)>;
def : Pat<(loadv32i8 addr:$src),
(VMOVDQU64Z256rm addr:$src)>;
@@ -3841,12 +3861,16 @@ let Predicates = [HasVLX] in {
(VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignedstore (v16i16 VR256X:$src), addr:$dst),
(VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst),
+ (VMOVAPSZ256mr addr:$dst, VR256X:$src)>;
def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst),
(VMOVDQA64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v8i32 VR256X:$src), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v16i16 VR256X:$src), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
+ def : Pat<(store (v16f16 VR256X:$src), addr:$dst),
+ (VMOVUPSZ256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v32i8 VR256X:$src), addr:$dst),
(VMOVDQU64Z256mr addr:$dst, VR256X:$src)>;
}
@@ -3855,16 +3879,12 @@ let Predicates = [HasBWI] in {
(VMOVDQU16Zrrk VR512:$src0, VK32WM:$mask, VR512:$src1)>;
def : Pat<(v32f16 (vselect VK32WM:$mask, (v32f16 VR512:$src1), v32f16_info.ImmAllZerosV)),
(VMOVDQU16Zrrkz VK32WM:$mask, VR512:$src1)>;
- def : Pat<(v32f16 (alignedloadv32f16 addr:$src)),
- (VMOVAPSZrm addr:$src)>;
def : Pat<(v32f16 (vselect VK32WM:$mask,
(v32f16 (alignedloadv32f16 addr:$src)), (v32f16 VR512:$src0))),
(VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>;
def : Pat<(v32f16 (vselect VK32WM:$mask,
(v32f16 (alignedloadv32f16 addr:$src)), v32f16_info.ImmAllZerosV)),
(VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>;
- def : Pat<(v32f16 (loadv32f16 addr:$src)),
- (VMOVUPSZrm addr:$src)>;
def : Pat<(v32f16 (vselect VK32WM:$mask,
(v32f16 (loadv32f16 addr:$src)), (v32f16 VR512:$src0))),
(VMOVDQU16Zrmk VR512:$src0, VK32WM:$mask, addr:$src)>;
@@ -3878,10 +3898,6 @@ let Predicates = [HasBWI] in {
def : Pat<(v32f16 (masked_load addr:$src, VK32WM:$mask, v32f16_info.ImmAllZerosV)),
(VMOVDQU16Zrmkz VK32WM:$mask, addr:$src)>;
- def : Pat<(alignedstore (v32f16 VR512:$src), addr:$dst),
- (VMOVAPSZmr addr:$dst, VR512:$src)>;
- def : Pat<(store (v32f16 VR512:$src), addr:$dst),
- (VMOVUPSZmr addr:$dst, VR512:$src)>;
def : Pat<(masked_store (v32f16 VR512:$src), addr:$dst, VK32WM:$mask),
(VMOVDQU16Zmrk addr:$dst, VK32WM:$mask, VR512:$src)>;
}
@@ -3890,16 +3906,12 @@ let Predicates = [HasBWI, HasVLX] in {
(VMOVDQU16Z256rrk VR256X:$src0, VK16WM:$mask, VR256X:$src1)>;
def : Pat<(v16f16 (vselect VK16WM:$mask, (v16f16 VR256X:$src1), v16f16x_info.ImmAllZerosV)),
(VMOVDQU16Z256rrkz VK16WM:$mask, VR256X:$src1)>;
- def : Pat<(v16f16 (alignedloadv16f16 addr:$src)),
- (VMOVAPSZ256rm addr:$src)>;
def : Pat<(v16f16 (vselect VK16WM:$mask,
(v16f16 (alignedloadv16f16 addr:$src)), (v16f16 VR256X:$src0))),
(VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>;
def : Pat<(v16f16 (vselect VK16WM:$mask,
(v16f16 (alignedloadv16f16 addr:$src)), v16f16x_info.ImmAllZerosV)),
(VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>;
- def : Pat<(v16f16 (loadv16f16 addr:$src)),
- (VMOVUPSZ256rm addr:$src)>;
def : Pat<(v16f16 (vselect VK16WM:$mask,
(v16f16 (loadv16f16 addr:$src)), (v16f16 VR256X:$src0))),
(VMOVDQU16Z256rmk VR256X:$src0, VK16WM:$mask, addr:$src)>;
@@ -3913,10 +3925,6 @@ let Predicates = [HasBWI, HasVLX] in {
def : Pat<(v16f16 (masked_load addr:$src, VK16WM:$mask, v16f16x_info.ImmAllZerosV)),
(VMOVDQU16Z256rmkz VK16WM:$mask, addr:$src)>;
- def : Pat<(alignedstore (v16f16 VR256X:$src), addr:$dst),
- (VMOVAPSZ256mr addr:$dst, VR256X:$src)>;
- def : Pat<(store (v16f16 VR256X:$src), addr:$dst),
- (VMOVUPSZ256mr addr:$dst, VR256X:$src)>;
def : Pat<(masked_store (v16f16 VR256X:$src), addr:$dst, VK16WM:$mask),
(VMOVDQU16Z256mrk addr:$dst, VK16WM:$mask, VR256X:$src)>;
@@ -3924,16 +3932,12 @@ let Predicates = [HasBWI, HasVLX] in {
(VMOVDQU16Z128rrk VR128X:$src0, VK8WM:$mask, VR128X:$src1)>;
def : Pat<(v8f16 (vselect VK8WM:$mask, (v8f16 VR128X:$src1), v8f16x_info.ImmAllZerosV)),
(VMOVDQU16Z128rrkz VK8WM:$mask, VR128X:$src1)>;
- def : Pat<(v8f16 (alignedloadv8f16 addr:$src)),
- (VMOVAPSZ128rm addr:$src)>;
def : Pat<(v8f16 (vselect VK8WM:$mask,
(v8f16 (alignedloadv8f16 addr:$src)), (v8f16 VR128X:$src0))),
(VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>;
def : Pat<(v8f16 (vselect VK8WM:$mask,
(v8f16 (alignedloadv8f16 addr:$src)), v8f16x_info.ImmAllZerosV)),
(VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>;
- def : Pat<(v8f16 (loadv8f16 addr:$src)),
- (VMOVUPSZ128rm addr:$src)>;
def : Pat<(v8f16 (vselect VK8WM:$mask,
(v8f16 (loadv8f16 addr:$src)), (v8f16 VR128X:$src0))),
(VMOVDQU16Z128rmk VR128X:$src0, VK8WM:$mask, addr:$src)>;
@@ -3947,10 +3951,6 @@ let Predicates = [HasBWI, HasVLX] in {
def : Pat<(v8f16 (masked_load addr:$src, VK8WM:$mask, v8f16x_info.ImmAllZerosV)),
(VMOVDQU16Z128rmkz VK8WM:$mask, addr:$src)>;
- def : Pat<(alignedstore (v8f16 VR128X:$src), addr:$dst),
- (VMOVAPSZ128mr addr:$dst, VR128X:$src)>;
- def : Pat<(store (v8f16 VR128X:$src), addr:$dst),
- (VMOVUPSZ128mr addr:$dst, VR128X:$src)>;
def : Pat<(masked_store (v8f16 VR128X:$src), addr:$dst, VK8WM:$mask),
(VMOVDQU16Z128mrk addr:$dst, VK8WM:$mask, VR128X:$src)>;
}
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index ec32ac2acad1..74ef831e1658 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -742,8 +742,8 @@ static bool regIsPICBase(Register BaseReg, const MachineRegisterInfo &MRI) {
return isPICBase;
}
-bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const {
+bool X86InstrInfo::isReallyTriviallyReMaterializable(
+ const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
// This function should only be called for opcodes with the ReMaterializable
@@ -869,7 +869,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
- MI.isDereferenceableInvariantLoad(AA)) {
+ MI.isDereferenceableInvariantLoad()) {
Register BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0 || BaseReg == X86::RIP)
return true;
@@ -3892,6 +3892,10 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ assert(MFI.getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
+ "Load size exceeds stack slot");
if (RC->getID() == X86::TILERegClassID) {
unsigned Opc = X86::TILELOADD;
// tileloadd (%sp, %idx), %tmm
@@ -3913,8 +3917,6 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
FrameIdx);
} else {
- const MachineFunction &MF = *MBB.getParent();
- const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 4943d2152fd2..98da00c39bdb 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -240,8 +240,7 @@ public:
unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const override;
- bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AAResults *AA) const override;
+ bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
Register DestReg, unsigned SubIdx,
const MachineInstr &Orig,
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 06cb280e860a..c5557bd5df4e 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -140,6 +140,7 @@ def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
let Predicates = [NoAVX512] in {
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+def : Pat<(v8f16 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
@@ -159,6 +160,7 @@ def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
let Predicates = [NoAVX512] in {
def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+def : Pat<(v16f16 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
def : Pat<(v8f32 immAllZerosV), (AVX_SET0)>;
def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
@@ -572,6 +574,23 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v32i8 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+ def : Pat<(alignedloadv8f16 addr:$src),
+ (VMOVAPSrm addr:$src)>;
+ def : Pat<(loadv8f16 addr:$src),
+ (VMOVUPSrm addr:$src)>;
+ def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
+ (VMOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8f16 VR128:$src), addr:$dst),
+ (VMOVUPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedloadv16f16 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(loadv16f16 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(alignedstore (v16f16 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v16f16 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
// Use movaps / movups for SSE integer load / store (one byte shorter).
@@ -613,6 +632,17 @@ let Predicates = [UseSSE1] in {
(MOVUPSmr addr:$dst, VR128:$src)>;
}
+let Predicates = [UseSSE2] in {
+ def : Pat<(alignedloadv8f16 addr:$src),
+ (MOVAPSrm addr:$src)>;
+ def : Pat<(loadv8f16 addr:$src),
+ (MOVUPSrm addr:$src)>;
+ def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
+ (MOVAPSmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8f16 VR128:$src), addr:$dst),
+ (MOVUPSmr addr:$dst, VR128:$src)>;
+}
+
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//
@@ -3136,6 +3166,8 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVNTDQYmr addr:$dst, VR256:$src)>;
def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
(VMOVNTDQYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignednontemporalstore (v16f16 VR256:$src), addr:$dst),
+ (VMOVNTDQYmr addr:$dst, VR256:$src)>;
def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
(VMOVNTDQYmr addr:$dst, VR256:$src)>;
@@ -3143,6 +3175,8 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
(VMOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
+ (VMOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
(VMOVNTDQmr addr:$dst, VR128:$src)>;
}
@@ -3152,6 +3186,8 @@ let Predicates = [UseSSE2] in {
(MOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
(MOVNTDQmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignednontemporalstore (v8f16 VR128:$src), addr:$dst),
+ (MOVNTDQmr addr:$dst, VR128:$src)>;
def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
(MOVNTDQmr addr:$dst, VR128:$src)>;
}
@@ -3374,12 +3410,16 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVDQArm addr:$src)>;
def : Pat<(alignedloadv8i16 addr:$src),
(VMOVDQArm addr:$src)>;
+ def : Pat<(alignedloadv8f16 addr:$src),
+ (VMOVDQArm addr:$src)>;
def : Pat<(alignedloadv16i8 addr:$src),
(VMOVDQArm addr:$src)>;
def : Pat<(loadv4i32 addr:$src),
(VMOVDQUrm addr:$src)>;
def : Pat<(loadv8i16 addr:$src),
(VMOVDQUrm addr:$src)>;
+ def : Pat<(loadv8f16 addr:$src),
+ (VMOVDQUrm addr:$src)>;
def : Pat<(loadv16i8 addr:$src),
(VMOVDQUrm addr:$src)>;
@@ -3387,12 +3427,16 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVDQAmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
(VMOVDQAmr addr:$dst, VR128:$src)>;
+ def : Pat<(alignedstore (v8f16 VR128:$src), addr:$dst),
+ (VMOVDQAmr addr:$dst, VR128:$src)>;
def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
(VMOVDQAmr addr:$dst, VR128:$src)>;
def : Pat<(store (v4i32 VR128:$src), addr:$dst),
(VMOVDQUmr addr:$dst, VR128:$src)>;
def : Pat<(store (v8i16 VR128:$src), addr:$dst),
(VMOVDQUmr addr:$dst, VR128:$src)>;
+ def : Pat<(store (v8f16 VR128:$src), addr:$dst),
+ (VMOVDQUmr addr:$dst, VR128:$src)>;
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
(VMOVDQUmr addr:$dst, VR128:$src)>;
}
@@ -6431,6 +6475,8 @@ let Predicates = [HasAVX2, NoVLX] in {
(VMOVNTDQAYrm addr:$src)>;
def : Pat<(v16i16 (alignednontemporalload addr:$src)),
(VMOVNTDQAYrm addr:$src)>;
+ def : Pat<(v16f16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQAYrm addr:$src)>;
def : Pat<(v32i8 (alignednontemporalload addr:$src)),
(VMOVNTDQAYrm addr:$src)>;
}
@@ -6446,6 +6492,8 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVNTDQArm addr:$src)>;
def : Pat<(v8i16 (alignednontemporalload addr:$src)),
(VMOVNTDQArm addr:$src)>;
+ def : Pat<(v8f16 (alignednontemporalload addr:$src)),
+ (VMOVNTDQArm addr:$src)>;
def : Pat<(v16i8 (alignednontemporalload addr:$src)),
(VMOVNTDQArm addr:$src)>;
}
@@ -6461,6 +6509,8 @@ let Predicates = [UseSSE41] in {
(MOVNTDQArm addr:$src)>;
def : Pat<(v8i16 (alignednontemporalload addr:$src)),
(MOVNTDQArm addr:$src)>;
+ def : Pat<(v8f16 (alignednontemporalload addr:$src)),
+ (MOVNTDQArm addr:$src)>;
def : Pat<(v16i8 (alignednontemporalload addr:$src)),
(MOVNTDQArm addr:$src)>;
}
@@ -7050,6 +7100,8 @@ def : Pat<(v8i32 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
+def : Pat<(v16f16 (X86SubVBroadcastld128 addr:$src)),
+ (VBROADCASTF128 addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcastld128 addr:$src)),
(VBROADCASTF128 addr:$src)>;
}
@@ -7095,6 +7147,7 @@ let Predicates = [HasAVX1Only] in {
defm : vperm2x128_lowering<"VPERM2F128", v4i64, loadv4i64>;
defm : vperm2x128_lowering<"VPERM2F128", v8i32, loadv8i32>;
defm : vperm2x128_lowering<"VPERM2F128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2F128", v16f16, loadv16f16>;
defm : vperm2x128_lowering<"VPERM2F128", v32i8, loadv32i8>;
}
@@ -7150,6 +7203,8 @@ let Predicates = [HasAVX1Only] in {
defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v2i64, v4i64, loadv2i64, loadv4i64>;
defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v4i32, v8i32, loadv4i32, loadv8i32>;
defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8i16, v16i16, loadv8i16, loadv16i16>;
+ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v8f16, v16f16, loadv8f16, loadv16f16>;
+ defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
defm : vinsert_lowering<"VINSERTF128", "VPERM2F128", v16i8, v32i8, loadv16i8, loadv32i8>;
}
@@ -7189,6 +7244,8 @@ let Predicates = [HasAVX1Only] in {
defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTF128", v16f16, v8f16>;
+ defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
}
@@ -7503,6 +7560,10 @@ def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0))
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16f16 VR256:$src1), (v8f16 VR128:$src2), (iPTR 0)),
+ (VBLENDPSYrri VR256:$src1,
+ (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src2, sub_xmm), 0xf)>;
def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
(VBLENDPSYrri VR256:$src1,
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
@@ -7517,6 +7578,9 @@ def : Pat<(insert_subvector (loadv4i64 addr:$src2), (v2i64 VR128:$src1), (iPTR 0
def : Pat<(insert_subvector (loadv16i16 addr:$src2), (v8i16 VR128:$src1), (iPTR 0)),
(VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
+def : Pat<(insert_subvector (loadv16f16 addr:$src2), (v8f16 VR128:$src1), (iPTR 0)),
+ (VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+ VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
def : Pat<(insert_subvector (loadv32i8 addr:$src2), (v16i8 VR128:$src1), (iPTR 0)),
(VBLENDPSYrmi (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
VR128:$src1, sub_xmm), addr:$src2, 0xf0)>;
@@ -7759,6 +7823,8 @@ let Predicates = [HasAVX2] in {
defm : vperm2x128_lowering<"VPERM2I128", v4i64, loadv4i64>;
defm : vperm2x128_lowering<"VPERM2I128", v8i32, loadv8i32>;
defm : vperm2x128_lowering<"VPERM2I128", v16i16, loadv16i16>;
+ defm : vperm2x128_lowering<"VPERM2I128", v16f16, loadv16f16>;
+ defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
defm : vperm2x128_lowering<"VPERM2I128", v32i8, loadv32i8>;
}
@@ -7781,6 +7847,8 @@ let Predicates = [HasAVX2, NoVLX] in {
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v2i64, v4i64, loadv2i64, loadv4i64>;
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v4i32, v8i32, loadv4i32, loadv8i32>;
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8i16, v16i16, loadv8i16, loadv16i16>;
+ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v8f16, v16f16, loadv8f16, loadv16f16>;
+ defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
defm : vinsert_lowering<"VINSERTI128", "VPERM2I128", v16i8, v32i8, loadv16i8, loadv32i8>;
}
@@ -7801,6 +7869,8 @@ let Predicates = [HasAVX2, NoVLX] in {
defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
+ defm : vextract_lowering<"VEXTRACTI128", v16f16, v8f16>;
+ defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
}
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index f4e25e4194db..1de2a1725954 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -254,8 +254,12 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
StringRef CPU =
CPUAttr.isValid() ? CPUAttr.getValueAsString() : (StringRef)TargetCPU;
- StringRef TuneCPU =
- TuneAttr.isValid() ? TuneAttr.getValueAsString() : (StringRef)CPU;
+ // "x86-64" is a default target setting for many front ends. In these cases,
+ // they actually request for "generic" tuning unless the "tune-cpu" was
+ // specified.
+ StringRef TuneCPU = TuneAttr.isValid() ? TuneAttr.getValueAsString()
+ : CPU == "x86-64" ? "generic"
+ : (StringRef)CPU;
StringRef FS =
FSAttr.isValid() ? FSAttr.getValueAsString() : (StringRef)TargetFS;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index b36f8a3d06d0..b27aac9c4e93 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1297,29 +1297,6 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
LT.first = NumOfDests * NumOfShufflesPerDest;
}
- static const CostTblEntry AVX512FP16ShuffleTbl[] = {
- {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
- {TTI::SK_Broadcast, MVT::v8f16, 1}, // vpbroadcastw
-
- {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
- {TTI::SK_Reverse, MVT::v16f16, 2}, // vpermw
- {TTI::SK_Reverse, MVT::v8f16, 1}, // vpshufb
-
- {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
- {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
- {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // vpshufb
-
- {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v16f16, 2}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v8f16, 2} // vpermt2w
- };
-
- if (!ST->useSoftFloat() && ST->hasFP16())
- if (const auto *Entry =
- CostTableLookup(AVX512FP16ShuffleTbl, Kind, LT.second))
- return LT.first * Entry->Cost;
-
static const CostTblEntry AVX512VBMIShuffleTbl[] = {
{TTI::SK_Reverse, MVT::v64i8, 1}, // vpermb
{TTI::SK_Reverse, MVT::v32i8, 1}, // vpermb
@@ -1339,17 +1316,22 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
static const CostTblEntry AVX512BWShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_Reverse, MVT::v32f16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
{TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v32f16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v16f16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
{TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v32f16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
@@ -1369,6 +1351,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
{TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v32f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
@@ -1376,6 +1359,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Reverse, MVT::v8i64, 1}, // vpermq
{TTI::SK_Reverse, MVT::v16i32, 1}, // vpermd
{TTI::SK_Reverse, MVT::v32i16, 7}, // per mca
+ {TTI::SK_Reverse, MVT::v32f16, 7}, // per mca
{TTI::SK_Reverse, MVT::v64i8, 7}, // per mca
{TTI::SK_PermuteSingleSrc, MVT::v8f64, 1}, // vpermpd
@@ -1408,11 +1392,14 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// FIXME: This just applies the type legalization cost rules above
// assuming these completely split.
{TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
+ {TTI::SK_PermuteSingleSrc, MVT::v32f16, 14},
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
{TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
+ {TTI::SK_PermuteTwoSrc, MVT::v32f16, 42},
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
{TTI::SK_Select, MVT::v32i16, 1}, // vpternlogq
+ {TTI::SK_Select, MVT::v32f16, 1}, // vpternlogq
{TTI::SK_Select, MVT::v64i8, 1}, // vpternlogq
{TTI::SK_Select, MVT::v8f64, 1}, // vblendmpd
{TTI::SK_Select, MVT::v16f32, 1}, // vblendmps
@@ -1430,6 +1417,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v4i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v8i32, 1}, // vpbroadcastd
{TTI::SK_Broadcast, MVT::v16i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v16f16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v32i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v4f64, 1}, // vpermpd
@@ -1437,9 +1425,11 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Reverse, MVT::v4i64, 1}, // vpermq
{TTI::SK_Reverse, MVT::v8i32, 1}, // vpermd
{TTI::SK_Reverse, MVT::v16i16, 2}, // vperm2i128 + pshufb
+ {TTI::SK_Reverse, MVT::v16f16, 2}, // vperm2i128 + pshufb
{TTI::SK_Reverse, MVT::v32i8, 2}, // vperm2i128 + pshufb
{TTI::SK_Select, MVT::v16i16, 1}, // vpblendvb
+ {TTI::SK_Select, MVT::v16f16, 1}, // vpblendvb
{TTI::SK_Select, MVT::v32i8, 1}, // vpblendvb
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 1}, // vpermpd
@@ -1448,6 +1438,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 1}, // vpermd
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb
+ {TTI::SK_PermuteSingleSrc, MVT::v16f16, 4}, // vperm2i128 + 2*vpshufb
+ // + vpblendvb
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 4}, // vperm2i128 + 2*vpshufb
// + vpblendvb
@@ -1457,6 +1449,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteTwoSrc, MVT::v8i32, 3}, // 2*vpermd + vpblendd
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 7}, // 2*vperm2i128 + 4*vpshufb
// + vpblendvb
+ {TTI::SK_PermuteTwoSrc, MVT::v16f16, 7}, // 2*vperm2i128 + 4*vpshufb
+ // + vpblendvb
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 7}, // 2*vperm2i128 + 4*vpshufb
// + vpblendvb
};
@@ -1493,6 +1487,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v4i64, 2}, // vperm2f128 + vpermilpd
{TTI::SK_Broadcast, MVT::v8i32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Broadcast, MVT::v16i16, 3}, // vpshuflw + vpshufd + vinsertf128
+ {TTI::SK_Broadcast, MVT::v16f16, 3}, // vpshuflw + vpshufd + vinsertf128
{TTI::SK_Broadcast, MVT::v32i8, 2}, // vpshufb + vinsertf128
{TTI::SK_Reverse, MVT::v4f64, 2}, // vperm2f128 + vpermilpd
@@ -1501,6 +1496,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Reverse, MVT::v8i32, 2}, // vperm2f128 + vpermilps
{TTI::SK_Reverse, MVT::v16i16, 4}, // vextractf128 + 2*pshufb
// + vinsertf128
+ {TTI::SK_Reverse, MVT::v16f16, 4}, // vextractf128 + 2*pshufb
+ // + vinsertf128
{TTI::SK_Reverse, MVT::v32i8, 4}, // vextractf128 + 2*pshufb
// + vinsertf128
@@ -1509,6 +1506,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Select, MVT::v8i32, 1}, // vblendps
{TTI::SK_Select, MVT::v8f32, 1}, // vblendps
{TTI::SK_Select, MVT::v16i16, 3}, // vpand + vpandn + vpor
+ {TTI::SK_Select, MVT::v16f16, 3}, // vpand + vpandn + vpor
{TTI::SK_Select, MVT::v32i8, 3}, // vpand + vpandn + vpor
{TTI::SK_PermuteSingleSrc, MVT::v4f64, 2}, // vperm2f128 + vshufpd
@@ -1517,6 +1515,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteSingleSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteSingleSrc, MVT::v16i16, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
+ {TTI::SK_PermuteSingleSrc, MVT::v16f16, 8}, // vextractf128 + 4*pshufb
+ // + 2*por + vinsertf128
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 8}, // vextractf128 + 4*pshufb
// + 2*por + vinsertf128
@@ -1526,6 +1526,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteTwoSrc, MVT::v8i32, 4}, // 2*vperm2f128 + 2*vshufps
{TTI::SK_PermuteTwoSrc, MVT::v16i16, 15}, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
+ {TTI::SK_PermuteTwoSrc, MVT::v16f16, 15}, // 2*vextractf128 + 8*pshufb
+ // + 4*por + vinsertf128
{TTI::SK_PermuteTwoSrc, MVT::v32i8, 15}, // 2*vextractf128 + 8*pshufb
// + 4*por + vinsertf128
};
@@ -1540,6 +1542,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Select, MVT::v4i32, 1}, // pblendw
{TTI::SK_Select, MVT::v4f32, 1}, // blendps
{TTI::SK_Select, MVT::v8i16, 1}, // pblendw
+ {TTI::SK_Select, MVT::v8f16, 1}, // pblendw
{TTI::SK_Select, MVT::v16i8, 1} // pblendvb
};
@@ -1549,18 +1552,23 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
static const CostTblEntry SSSE3ShuffleTbl[] = {
{TTI::SK_Broadcast, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_Broadcast, MVT::v8f16, 1}, // pshufb
{TTI::SK_Broadcast, MVT::v16i8, 1}, // pshufb
{TTI::SK_Reverse, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_Reverse, MVT::v8f16, 1}, // pshufb
{TTI::SK_Reverse, MVT::v16i8, 1}, // pshufb
{TTI::SK_Select, MVT::v8i16, 3}, // 2*pshufb + por
+ {TTI::SK_Select, MVT::v8f16, 3}, // 2*pshufb + por
{TTI::SK_Select, MVT::v16i8, 3}, // 2*pshufb + por
{TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // pshufb
+ {TTI::SK_PermuteSingleSrc, MVT::v8f16, 1}, // pshufb
{TTI::SK_PermuteSingleSrc, MVT::v16i8, 1}, // pshufb
{TTI::SK_PermuteTwoSrc, MVT::v8i16, 3}, // 2*pshufb + por
+ {TTI::SK_PermuteTwoSrc, MVT::v8f16, 3}, // 2*pshufb + por
{TTI::SK_PermuteTwoSrc, MVT::v16i8, 3}, // 2*pshufb + por
};
@@ -1573,12 +1581,14 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Broadcast, MVT::v2i64, 1}, // pshufd
{TTI::SK_Broadcast, MVT::v4i32, 1}, // pshufd
{TTI::SK_Broadcast, MVT::v8i16, 2}, // pshuflw + pshufd
+ {TTI::SK_Broadcast, MVT::v8f16, 2}, // pshuflw + pshufd
{TTI::SK_Broadcast, MVT::v16i8, 3}, // unpck + pshuflw + pshufd
{TTI::SK_Reverse, MVT::v2f64, 1}, // shufpd
{TTI::SK_Reverse, MVT::v2i64, 1}, // pshufd
{TTI::SK_Reverse, MVT::v4i32, 1}, // pshufd
{TTI::SK_Reverse, MVT::v8i16, 3}, // pshuflw + pshufhw + pshufd
+ {TTI::SK_Reverse, MVT::v8f16, 3}, // pshuflw + pshufhw + pshufd
{TTI::SK_Reverse, MVT::v16i8, 9}, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus
@@ -1586,6 +1596,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_Select, MVT::v2f64, 1}, // movsd
{TTI::SK_Select, MVT::v4i32, 2}, // 2*shufps
{TTI::SK_Select, MVT::v8i16, 3}, // pand + pandn + por
+ {TTI::SK_Select, MVT::v8f16, 3}, // pand + pandn + por
{TTI::SK_Select, MVT::v16i8, 3}, // pand + pandn + por
{TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // shufpd
@@ -1593,6 +1604,8 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{TTI::SK_PermuteSingleSrc, MVT::v4i32, 1}, // pshufd
{TTI::SK_PermuteSingleSrc, MVT::v8i16, 5}, // 2*pshuflw + 2*pshufhw
// + pshufd/unpck
+ {TTI::SK_PermuteSingleSrc, MVT::v8f16, 5}, // 2*pshuflw + 2*pshufhw
+ // + pshufd/unpck
{ TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + 2*packus
@@ -1600,6 +1613,7 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
{ TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
{ TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
{ TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
+ { TTI::SK_PermuteTwoSrc, MVT::v8f16, 8 }, // blend+permute
{ TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
};
@@ -5219,7 +5233,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
return true;
- if (ScalarTy->isHalfTy() && ST->hasBWI() && ST->hasFP16())
+ if (ScalarTy->isHalfTy() && ST->hasBWI())
return true;
if (!ScalarTy->isIntegerTy())
@@ -5674,8 +5688,7 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCost(
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
EltTy->isIntegerTy(32) || EltTy->isPointerTy())
return true;
- if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) ||
- (!ST->useSoftFloat() && ST->hasFP16() && EltTy->isHalfTy()))
+ if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8) || EltTy->isHalfTy())
return HasBW;
return false;
};
diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
index 19ebcb3ea3e8..2fb06e29bf3b 100644
--- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -27,7 +27,7 @@
#include "llvm/IR/Function.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetOptions.h"
-#include <algorithm> // std::sort
+#include <algorithm>
using namespace llvm;
diff --git a/llvm/lib/ToolDrivers/llvm-lib/Options.td b/llvm/lib/ToolDrivers/llvm-lib/Options.td
index 9d969b040ef2..4af250e8ad73 100644
--- a/llvm/lib/ToolDrivers/llvm-lib/Options.td
+++ b/llvm/lib/ToolDrivers/llvm-lib/Options.td
@@ -48,3 +48,4 @@ def nodefaultlib: P<"nodefaultlib", "">;
def nodefaultlib_all: F<"nodefaultlib">;
def nologo : F<"nologo">;
def subsystem : P<"subsystem", "">;
+def verbose : F<"verbose">;
diff --git a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
index f7bbdcffd2ec..81b43a2ab2c2 100644
--- a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
@@ -112,8 +112,8 @@ static bool declaresCoroCleanupIntrinsics(const Module &M) {
return coro::declaresIntrinsics(
M, {"llvm.coro.alloc", "llvm.coro.begin", "llvm.coro.subfn.addr",
"llvm.coro.free", "llvm.coro.id", "llvm.coro.id.retcon",
- "llvm.coro.id.retcon.once", "llvm.coro.async.size.replace",
- "llvm.coro.async.resume"});
+ "llvm.coro.id.async", "llvm.coro.id.retcon.once",
+ "llvm.coro.async.size.replace", "llvm.coro.async.resume"});
}
PreservedAnalyses CoroCleanupPass::run(Module &M,
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 5557370c82ba..af35b45c2eaf 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -17,8 +17,6 @@
namespace llvm {
class CallGraph;
-class CallGraphSCC;
-class PassRegistry;
namespace coro {
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 9c1b247cdb39..722a1c6ec0ce 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -1961,6 +1961,13 @@ static coro::Shape splitCoroutine(Function &F,
return Shape;
}
+/// Remove calls to llvm.coro.end in the original function.
+static void removeCoroEnds(const coro::Shape &Shape) {
+ for (auto End : Shape.CoroEnds) {
+ replaceCoroEnd(End, Shape, Shape.FramePtr, /*in resume*/ false, nullptr);
+ }
+}
+
static void updateCallGraphAfterCoroutineSplit(
LazyCallGraph::Node &N, const coro::Shape &Shape,
const SmallVectorImpl<Function *> &Clones, LazyCallGraph::SCC &C,
@@ -1969,10 +1976,14 @@ static void updateCallGraphAfterCoroutineSplit(
if (!Shape.CoroBegin)
return;
- for (llvm::AnyCoroEndInst *End : Shape.CoroEnds) {
- auto &Context = End->getContext();
- End->replaceAllUsesWith(ConstantInt::getFalse(Context));
- End->eraseFromParent();
+ if (Shape.ABI != coro::ABI::Switch)
+ removeCoroEnds(Shape);
+ else {
+ for (llvm::AnyCoroEndInst *End : Shape.CoroEnds) {
+ auto &Context = End->getContext();
+ End->replaceAllUsesWith(ConstantInt::getFalse(Context));
+ End->eraseFromParent();
+ }
}
if (!Clones.empty()) {
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index e5ff98e4f73f..37c773bd47d6 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -326,7 +326,7 @@ static bool getPotentialCopiesOfMemoryValue(
<< " (only exact: " << OnlyExact << ")\n";);
Value &Ptr = *I.getPointerOperand();
- SmallVector<Value *, 8> Objects;
+ SmallSetVector<Value *, 8> Objects;
if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, QueryingAA, &I,
UsedAssumedInformation)) {
LLVM_DEBUG(
@@ -343,6 +343,7 @@ static bool getPotentialCopiesOfMemoryValue(
const auto *TLI =
A.getInfoCache().getTargetLibraryInfoForFunction(*I.getFunction());
+ LLVM_DEBUG(dbgs() << "Visit " << Objects.size() << " objects:\n");
for (Value *Obj : Objects) {
LLVM_DEBUG(dbgs() << "Visit underlying object " << *Obj << "\n");
if (isa<UndefValue>(Obj))
@@ -352,8 +353,8 @@ static bool getPotentialCopiesOfMemoryValue(
// be OK. We do not try to optimize the latter.
if (!NullPointerIsDefined(I.getFunction(),
Ptr.getType()->getPointerAddressSpace()) &&
- A.getAssumedSimplified(Ptr, QueryingAA, UsedAssumedInformation) ==
- Obj)
+ A.getAssumedSimplified(Ptr, QueryingAA, UsedAssumedInformation,
+ AA::Interprocedural) == Obj)
continue;
LLVM_DEBUG(
dbgs() << "Underlying object is a valid nullptr, giving up.\n";);
@@ -375,25 +376,37 @@ static bool getPotentialCopiesOfMemoryValue(
return false;
}
- if (IsLoad) {
- Value *InitialValue = AA::getInitialValueForObj(*Obj, *I.getType(), TLI);
- if (!InitialValue)
- return false;
- NewCopies.push_back(InitialValue);
- NewCopyOrigins.push_back(nullptr);
- }
+ bool NullOnly = true;
+ bool NullRequired = false;
+ auto CheckForNullOnlyAndUndef = [&](Optional<Value *> V, bool IsExact) {
+ if (!V || *V == nullptr)
+ NullOnly = false;
+ else if (isa<UndefValue>(*V))
+ /* No op */;
+ else if (isa<Constant>(*V) && cast<Constant>(*V)->isNullValue())
+ NullRequired = !IsExact;
+ else
+ NullOnly = false;
+ };
auto CheckAccess = [&](const AAPointerInfo::Access &Acc, bool IsExact) {
if ((IsLoad && !Acc.isWrite()) || (!IsLoad && !Acc.isRead()))
return true;
if (IsLoad && Acc.isWrittenValueYetUndetermined())
return true;
- if (OnlyExact && !IsExact &&
+ CheckForNullOnlyAndUndef(Acc.getContent(), IsExact);
+ if (OnlyExact && !IsExact && !NullOnly &&
!isa_and_nonnull<UndefValue>(Acc.getWrittenValue())) {
LLVM_DEBUG(dbgs() << "Non exact access " << *Acc.getRemoteInst()
<< ", abort!\n");
return false;
}
+ if (NullRequired && !NullOnly) {
+ LLVM_DEBUG(dbgs() << "Required all `null` accesses due to non exact "
+ "one, however found non-null one: "
+ << *Acc.getRemoteInst() << ", abort!\n");
+ return false;
+ }
if (IsLoad) {
assert(isa<LoadInst>(I) && "Expected load or store instruction only!");
if (!Acc.isWrittenValueUnknown()) {
@@ -424,15 +437,36 @@ static bool getPotentialCopiesOfMemoryValue(
return true;
};
+ // If the value has been written to we don't need the initial value of the
+ // object.
+ bool HasBeenWrittenTo = false;
+
auto &PI = A.getAAFor<AAPointerInfo>(QueryingAA, IRPosition::value(*Obj),
DepClassTy::NONE);
- if (!PI.forallInterferingAccesses(A, QueryingAA, I, CheckAccess)) {
+ if (!PI.forallInterferingAccesses(A, QueryingAA, I, CheckAccess,
+ HasBeenWrittenTo)) {
LLVM_DEBUG(
dbgs()
<< "Failed to verify all interfering accesses for underlying object: "
<< *Obj << "\n");
return false;
}
+
+ if (IsLoad && !HasBeenWrittenTo) {
+ Value *InitialValue = AA::getInitialValueForObj(*Obj, *I.getType(), TLI);
+ if (!InitialValue)
+ return false;
+ CheckForNullOnlyAndUndef(InitialValue, /* IsExact */ true);
+ if (NullRequired && !NullOnly) {
+ LLVM_DEBUG(dbgs() << "Non exact access but initial value that is not "
+ "null or undef, abort!\n");
+ return false;
+ }
+
+ NewCopies.push_back(InitialValue);
+ NewCopyOrigins.push_back(nullptr);
+ }
+
PIs.push_back(&PI);
}
@@ -520,12 +554,21 @@ isPotentiallyReachable(Attributor &A, const Instruction &FromI,
<< " from " << FromI << " [GBCB: " << bool(GoBackwardsCB)
<< "]\n");
+ // TODO: If we can go arbitrarily backwards we will eventually reach an
+ // entry point that can reach ToI. Only once this takes a set of blocks
+ // through which we cannot go, or once we track internal functions not
+ // accessible from the outside, it makes sense to perform backwards analysis
+ // in the absence of a GoBackwardsCB.
+ if (!GoBackwardsCB) {
+ LLVM_DEBUG(dbgs() << "[AA] check @" << ToFn.getName() << " from " << FromI
+ << " is not checked backwards, abort\n");
+ return true;
+ }
+
SmallPtrSet<const Instruction *, 8> Visited;
SmallVector<const Instruction *> Worklist;
Worklist.push_back(&FromI);
- const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(
- QueryingAA, IRPosition::function(ToFn), DepClassTy::OPTIONAL);
while (!Worklist.empty()) {
const Instruction *CurFromI = Worklist.pop_back_val();
if (!Visited.insert(CurFromI).second)
@@ -545,26 +588,13 @@ isPotentiallyReachable(Attributor &A, const Instruction &FromI,
<< *ToI << " [Intra]\n");
if (Result)
return true;
- if (NoRecurseAA.isAssumedNoRecurse())
- continue;
- }
-
- // TODO: If we can go arbitrarily backwards we will eventually reach an
- // entry point that can reach ToI. Only once this takes a set of blocks
- // through which we cannot go, or once we track internal functions not
- // accessible from the outside, it makes sense to perform backwards analysis
- // in the absence of a GoBackwardsCB.
- if (!GoBackwardsCB) {
- LLVM_DEBUG(dbgs() << "[AA] check @" << ToFn.getName() << " from "
- << *CurFromI << " is not checked backwards, abort\n");
- return true;
}
// Check if the current instruction is already known to reach the ToFn.
const auto &FnReachabilityAA = A.getAAFor<AAFunctionReachability>(
QueryingAA, IRPosition::function(*FromFn), DepClassTy::OPTIONAL);
bool Result = FnReachabilityAA.instructionCanReach(
- A, *CurFromI, ToFn, /* UseBackwards */ false);
+ A, *CurFromI, ToFn);
LLVM_DEBUG(dbgs() << "[AA] " << *CurFromI << " in @" << FromFn->getName()
<< " " << (Result ? "can potentially " : "cannot ")
<< "reach @" << ToFn.getName() << " [FromFn]\n");
@@ -1038,60 +1068,74 @@ Attributor::getAssumedConstant(const IRPosition &IRP,
}
if (auto *C = dyn_cast<Constant>(&IRP.getAssociatedValue()))
return C;
- const auto &ValueSimplifyAA =
- getAAFor<AAValueSimplify>(AA, IRP, DepClassTy::NONE);
- Optional<Value *> SimplifiedV =
- ValueSimplifyAA.getAssumedSimplifiedValue(*this);
- bool IsKnown = ValueSimplifyAA.isAtFixpoint();
- UsedAssumedInformation |= !IsKnown;
- if (!SimplifiedV) {
- recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
- return llvm::None;
- }
- if (isa_and_nonnull<UndefValue>(SimplifiedV.value())) {
- recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
- return UndefValue::get(IRP.getAssociatedType());
+ SmallVector<AA::ValueAndContext> Values;
+ if (getAssumedSimplifiedValues(IRP, &AA, Values,
+ AA::ValueScope::Interprocedural,
+ UsedAssumedInformation)) {
+ if (Values.empty())
+ return llvm::None;
+ if (auto *C = dyn_cast_or_null<Constant>(
+ AAPotentialValues::getSingleValue(*this, AA, IRP, Values)))
+ return C;
}
- Constant *CI = dyn_cast_or_null<Constant>(SimplifiedV.value());
- if (CI)
- CI = dyn_cast_or_null<Constant>(
- AA::getWithType(*CI, *IRP.getAssociatedType()));
- if (CI)
- recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
- return CI;
+ return nullptr;
}
-Optional<Value *>
-Attributor::getAssumedSimplified(const IRPosition &IRP,
- const AbstractAttribute *AA,
- bool &UsedAssumedInformation) {
+Optional<Value *> Attributor::getAssumedSimplified(const IRPosition &IRP,
+ const AbstractAttribute *AA,
+ bool &UsedAssumedInformation,
+ AA::ValueScope S) {
// First check all callbacks provided by outside AAs. If any of them returns
// a non-null value that is different from the associated value, or None, we
// assume it's simplified.
for (auto &CB : SimplificationCallbacks.lookup(IRP))
return CB(IRP, AA, UsedAssumedInformation);
- // If no high-level/outside simplification occurred, use AAValueSimplify.
- const auto &ValueSimplifyAA =
- getOrCreateAAFor<AAValueSimplify>(IRP, AA, DepClassTy::NONE);
- Optional<Value *> SimplifiedV =
- ValueSimplifyAA.getAssumedSimplifiedValue(*this);
- bool IsKnown = ValueSimplifyAA.isAtFixpoint();
- UsedAssumedInformation |= !IsKnown;
- if (!SimplifiedV) {
- if (AA)
- recordDependence(ValueSimplifyAA, *AA, DepClassTy::OPTIONAL);
+ SmallVector<AA::ValueAndContext> Values;
+ if (!getAssumedSimplifiedValues(IRP, AA, Values, S, UsedAssumedInformation))
+ return &IRP.getAssociatedValue();
+ if (Values.empty())
return llvm::None;
+ if (AA)
+ if (Value *V = AAPotentialValues::getSingleValue(*this, *AA, IRP, Values))
+ return V;
+ if (IRP.getPositionKind() == IRPosition::IRP_RETURNED ||
+ IRP.getPositionKind() == IRPosition::IRP_CALL_SITE_RETURNED)
+ return nullptr;
+ return &IRP.getAssociatedValue();
+}
+
+bool Attributor::getAssumedSimplifiedValues(
+ const IRPosition &IRP, const AbstractAttribute *AA,
+ SmallVectorImpl<AA::ValueAndContext> &Values, AA::ValueScope S,
+ bool &UsedAssumedInformation) {
+ // First check all callbacks provided by outside AAs. If any of them returns
+ // a non-null value that is different from the associated value, or None, we
+ // assume it's simplified.
+ const auto &SimplificationCBs = SimplificationCallbacks.lookup(IRP);
+ for (auto &CB : SimplificationCBs) {
+ Optional<Value *> CBResult = CB(IRP, AA, UsedAssumedInformation);
+ if (!CBResult.has_value())
+ continue;
+ Value *V = CBResult.value();
+ if (!V)
+ return false;
+ if ((S & AA::ValueScope::Interprocedural) ||
+ AA::isValidInScope(*V, IRP.getAnchorScope()))
+ Values.push_back(AA::ValueAndContext{*V, nullptr});
+ else
+ return false;
}
- if (*SimplifiedV == nullptr)
- return const_cast<Value *>(&IRP.getAssociatedValue());
- if (Value *SimpleV =
- AA::getWithType(**SimplifiedV, *IRP.getAssociatedType())) {
- if (AA)
- recordDependence(ValueSimplifyAA, *AA, DepClassTy::OPTIONAL);
- return SimpleV;
- }
- return const_cast<Value *>(&IRP.getAssociatedValue());
+ if (!SimplificationCBs.empty())
+ return true;
+
+ // If no high-level/outside simplification occurred, use AAPotentialValues.
+ const auto &PotentialValuesAA =
+ getOrCreateAAFor<AAPotentialValues>(IRP, AA, DepClassTy::OPTIONAL);
+ if (!PotentialValuesAA.getAssumedSimplifiedValues(*this, Values, S))
+ return false;
+ UsedAssumedInformation |= !PotentialValuesAA.isAtFixpoint();
+ return true;
}
Optional<Value *> Attributor::translateArgumentToCallSiteContent(
@@ -1106,7 +1150,7 @@ Optional<Value *> Attributor::translateArgumentToCallSiteContent(
if (!Arg->hasPointeeInMemoryValueAttr())
return getAssumedSimplified(
IRPosition::callsite_argument(CB, Arg->getArgNo()), AA,
- UsedAssumedInformation);
+ UsedAssumedInformation, AA::Intraprocedural);
return nullptr;
}
@@ -1295,8 +1339,21 @@ bool Attributor::checkForAllUses(
SmallVector<const Use *, 16> Worklist;
SmallPtrSet<const Use *, 16> Visited;
- for (const Use &U : V.uses())
- Worklist.push_back(&U);
+ auto AddUsers = [&](const Value &V, const Use *OldUse) {
+ for (const Use &UU : V.uses()) {
+ if (OldUse && EquivalentUseCB && !EquivalentUseCB(*OldUse, UU)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Potential copy was "
+ "rejected by the equivalence call back: "
+ << *UU << "!\n");
+ return false;
+ }
+
+ Worklist.push_back(&UU);
+ }
+ return true;
+ };
+
+ AddUsers(V, /* OldUse */ nullptr);
LLVM_DEBUG(dbgs() << "[Attributor] Got " << Worklist.size()
<< " initial uses to check\n");
@@ -1342,15 +1399,8 @@ bool Attributor::checkForAllUses(
<< PotentialCopies.size()
<< " potential copies instead!\n");
for (Value *PotentialCopy : PotentialCopies)
- for (const Use &CopyUse : PotentialCopy->uses()) {
- if (EquivalentUseCB && !EquivalentUseCB(*U, CopyUse)) {
- LLVM_DEBUG(dbgs() << "[Attributor] Potential copy was "
- "rejected by the equivalence call back: "
- << *CopyUse << "!\n");
- return false;
- }
- Worklist.push_back(&CopyUse);
- }
+ if (!AddUsers(*PotentialCopy, U))
+ return false;
continue;
}
}
@@ -1361,8 +1411,25 @@ bool Attributor::checkForAllUses(
return false;
if (!Follow)
continue;
- for (const Use &UU : U->getUser()->uses())
- Worklist.push_back(&UU);
+
+ User &Usr = *U->getUser();
+ AddUsers(Usr, /* OldUse */ nullptr);
+
+ auto *RI = dyn_cast<ReturnInst>(&Usr);
+ if (!RI)
+ continue;
+
+ Function &F = *RI->getFunction();
+ auto CallSitePred = [&](AbstractCallSite ACS) {
+ return AddUsers(*ACS.getInstruction(), U);
+ };
+ if (!checkForAllCallSites(CallSitePred, F, /* RequireAllCallSites */ true,
+ &QueryingAA, UsedAssumedInformation)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Could not follow return instruction "
+ "to all call sites: "
+ << *RI << "\n");
+ return false;
+ }
}
return true;
@@ -1918,7 +1985,8 @@ ChangeStatus Attributor::cleanupIR() {
<< ToBeDeletedInsts.size() << " instructions and "
<< ToBeChangedValues.size() << " values and "
<< ToBeChangedUses.size() << " uses. To insert "
- << ToBeChangedToUnreachableInsts.size() << " unreachables."
+ << ToBeChangedToUnreachableInsts.size()
+ << " unreachables.\n"
<< "Preserve manifest added " << ManifestAddedBlocks.size()
<< " blocks\n");
@@ -2046,6 +2114,8 @@ ChangeStatus Attributor::cleanupIR() {
}
for (auto &V : ToBeChangedToUnreachableInsts)
if (Instruction *I = dyn_cast_or_null<Instruction>(V)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Change to unreachable: " << *I
+ << "\n");
assert(isRunOn(*I->getFunction()) &&
"Cannot replace an instruction outside the current SCC!");
CGModifiedFunctions.insert(I->getFunction());
@@ -2877,7 +2947,8 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
// Every function might be simplified.
bool UsedAssumedInformation = false;
- getAssumedSimplified(RetPos, nullptr, UsedAssumedInformation);
+ getAssumedSimplified(RetPos, nullptr, UsedAssumedInformation,
+ AA::Intraprocedural);
// Every returned value might be marked noundef.
getOrCreateAAFor<AANoUndef>(RetPos);
@@ -2906,7 +2977,8 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
// interface though as outside AAs can register custom simplification
// callbacks.
bool UsedAssumedInformation = false;
- getAssumedSimplified(ArgPos, /* AA */ nullptr, UsedAssumedInformation);
+ getAssumedSimplified(ArgPos, /* AA */ nullptr, UsedAssumedInformation,
+ AA::Intraprocedural);
// Every argument might be dead.
getOrCreateAAFor<AAIsDead>(ArgPos);
@@ -2970,7 +3042,8 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
IRPosition CBRetPos = IRPosition::callsite_returned(CB);
bool UsedAssumedInformation = false;
- getAssumedSimplified(CBRetPos, nullptr, UsedAssumedInformation);
+ getAssumedSimplified(CBRetPos, nullptr, UsedAssumedInformation,
+ AA::Intraprocedural);
}
for (int I = 0, E = CB.arg_size(); I < E; ++I) {
@@ -2984,7 +3057,8 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
// Attributor interface though as outside AAs can register custom
// simplification callbacks.
bool UsedAssumedInformation = false;
- getAssumedSimplified(CBArgPos, /* AA */ nullptr, UsedAssumedInformation);
+ getAssumedSimplified(CBArgPos, /* AA */ nullptr, UsedAssumedInformation,
+ AA::Intraprocedural);
// Every call site argument might be marked "noundef".
getOrCreateAAFor<AANoUndef>(CBArgPos);
@@ -3034,12 +3108,12 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
IRPosition::value(*cast<LoadInst>(I).getPointerOperand()));
if (SimplifyAllLoads)
getAssumedSimplified(IRPosition::value(I), nullptr,
- UsedAssumedInformation);
+ UsedAssumedInformation, AA::Intraprocedural);
} else {
auto &SI = cast<StoreInst>(I);
getOrCreateAAFor<AAIsDead>(IRPosition::inst(I));
getAssumedSimplified(IRPosition::value(*SI.getValueOperand()), nullptr,
- UsedAssumedInformation);
+ UsedAssumedInformation, AA::Intraprocedural);
getOrCreateAAFor<AAAlign>(IRPosition::value(*SI.getPointerOperand()));
}
return true;
@@ -3126,6 +3200,26 @@ raw_ostream &llvm::operator<<(raw_ostream &OS,
return OS;
}
+raw_ostream &llvm::operator<<(raw_ostream &OS,
+ const PotentialLLVMValuesState &S) {
+ OS << "set-state(< {";
+ if (!S.isValidState())
+ OS << "full-set";
+ else {
+ for (auto &It : S.getAssumedSet()) {
+ if (auto *F = dyn_cast<Function>(It.first.getValue()))
+ OS << "@" << F->getName() << "[" << int(It.second) << "], ";
+ else
+ OS << *It.first.getValue() << "[" << int(It.second) << "], ";
+ }
+ if (S.undefIsContained())
+ OS << "undef ";
+ }
+ OS << "} >)";
+
+ return OS;
+}
+
void AbstractAttribute::print(raw_ostream &OS) const {
OS << "[";
OS << getName();
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 1ff54b78e27e..660ff3ee9563 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -14,12 +14,14 @@
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumeBundleQueries.h"
@@ -35,11 +37,13 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/Assumptions.h"
#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -72,6 +76,8 @@ static cl::opt<int> MaxHeapToStackSize("max-heap-to-stack-size", cl::init(128),
template <>
unsigned llvm::PotentialConstantIntValuesState::MaxPotentialValues = 0;
+template <> unsigned llvm::PotentialLLVMValuesState::MaxPotentialValues = -1;
+
static cl::opt<unsigned, true> MaxPotentialValues(
"attributor-max-potential-values", cl::Hidden,
cl::desc("Maximum number of potential values to be "
@@ -79,6 +85,12 @@ static cl::opt<unsigned, true> MaxPotentialValues(
cl::location(llvm::PotentialConstantIntValuesState::MaxPotentialValues),
cl::init(7));
+static cl::opt<int> MaxPotentialValuesIterations(
+ "attributor-max-potential-values-iterations", cl::Hidden,
+ cl::desc(
+ "Maximum number of iterations we keep dismantling potential values."),
+ cl::init(64));
+
static cl::opt<unsigned> MaxInterferingAccesses(
"attributor-max-interfering-accesses", cl::Hidden,
cl::desc("Maximum number of interfering accesses to "
@@ -162,6 +174,7 @@ PIPE_OPERATOR(AAValueConstantRange)
PIPE_OPERATOR(AAPrivatizablePtr)
PIPE_OPERATOR(AAUndefinedBehavior)
PIPE_OPERATOR(AAPotentialConstantValues)
+PIPE_OPERATOR(AAPotentialValues)
PIPE_OPERATOR(AANoUndef)
PIPE_OPERATOR(AACallEdges)
PIPE_OPERATOR(AAFunctionReachability)
@@ -293,228 +306,35 @@ static Value *constructPointer(Type *ResTy, Type *PtrElemTy, Value *Ptr,
return Ptr;
}
-/// Recursively visit all values that might become \p IRP at some point. This
-/// will be done by looking through cast instructions, selects, phis, and calls
-/// with the "returned" attribute. Once we cannot look through the value any
-/// further, the callback \p VisitValueCB is invoked and passed the current
-/// value, the \p State, and a flag to indicate if we stripped anything.
-/// Stripped means that we unpacked the value associated with \p IRP at least
-/// once. Note that the value used for the callback may still be the value
-/// associated with \p IRP (due to PHIs). To limit how much effort is invested,
-/// we will never visit more values than specified by \p MaxValues.
-/// If \p VS does not contain the Interprocedural bit, only values valid in the
-/// scope of \p CtxI will be visited and simplification into other scopes is
-/// prevented.
-template <typename StateTy>
-static bool genericValueTraversal(
- Attributor &A, IRPosition IRP, const AbstractAttribute &QueryingAA,
- StateTy &State,
- function_ref<bool(Value &, const Instruction *, StateTy &, bool)>
- VisitValueCB,
- const Instruction *CtxI, bool &UsedAssumedInformation,
- bool UseValueSimplify = true, int MaxValues = 16,
- function_ref<Value *(Value *)> StripCB = nullptr,
- AA::ValueScope VS = AA::Interprocedural) {
-
- struct LivenessInfo {
- const AAIsDead *LivenessAA = nullptr;
- bool AnyDead = false;
- };
- SmallMapVector<const Function *, LivenessInfo, 4> LivenessAAs;
- auto GetLivenessInfo = [&](const Function &F) -> LivenessInfo & {
- LivenessInfo &LI = LivenessAAs[&F];
- if (!LI.LivenessAA)
- LI.LivenessAA = &A.getAAFor<AAIsDead>(QueryingAA, IRPosition::function(F),
- DepClassTy::NONE);
- return LI;
- };
-
- Value *InitialV = &IRP.getAssociatedValue();
- using Item = std::pair<Value *, const Instruction *>;
- SmallSet<Item, 16> Visited;
- SmallVector<Item, 16> Worklist;
- Worklist.push_back({InitialV, CtxI});
-
- int Iteration = 0;
- do {
- Item I = Worklist.pop_back_val();
- Value *V = I.first;
- CtxI = I.second;
- if (StripCB)
- V = StripCB(V);
-
- // Check if we should process the current value. To prevent endless
- // recursion keep a record of the values we followed!
- if (!Visited.insert(I).second)
- continue;
-
- // Make sure we limit the compile time for complex expressions.
- if (Iteration++ >= MaxValues) {
- LLVM_DEBUG(dbgs() << "Generic value traversal reached iteration limit: "
- << Iteration << "!\n");
- return false;
- }
-
- // Explicitly look through calls with a "returned" attribute if we do
- // not have a pointer as stripPointerCasts only works on them.
- Value *NewV = nullptr;
- if (V->getType()->isPointerTy()) {
- NewV = V->stripPointerCasts();
- } else {
- auto *CB = dyn_cast<CallBase>(V);
- if (CB && CB->getCalledFunction()) {
- for (Argument &Arg : CB->getCalledFunction()->args())
- if (Arg.hasReturnedAttr()) {
- NewV = CB->getArgOperand(Arg.getArgNo());
- break;
- }
- }
- }
- if (NewV && NewV != V) {
- Worklist.push_back({NewV, CtxI});
- continue;
- }
-
- // Look through select instructions, visit assumed potential values.
- if (auto *SI = dyn_cast<SelectInst>(V)) {
- Optional<Constant *> C = A.getAssumedConstant(
- *SI->getCondition(), QueryingAA, UsedAssumedInformation);
- bool NoValueYet = !C;
- if (NoValueYet || isa_and_nonnull<UndefValue>(*C))
- continue;
- if (auto *CI = dyn_cast_or_null<ConstantInt>(*C)) {
- if (CI->isZero())
- Worklist.push_back({SI->getFalseValue(), CtxI});
- else
- Worklist.push_back({SI->getTrueValue(), CtxI});
- continue;
- }
- // We could not simplify the condition, assume both values.
- Worklist.push_back({SI->getTrueValue(), CtxI});
- Worklist.push_back({SI->getFalseValue(), CtxI});
- continue;
- }
-
- // Look through phi nodes, visit all live operands.
- if (auto *PHI = dyn_cast<PHINode>(V)) {
- LivenessInfo &LI = GetLivenessInfo(*PHI->getFunction());
- for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) {
- BasicBlock *IncomingBB = PHI->getIncomingBlock(u);
- if (LI.LivenessAA->isEdgeDead(IncomingBB, PHI->getParent())) {
- LI.AnyDead = true;
- UsedAssumedInformation |= !LI.LivenessAA->isAtFixpoint();
- continue;
- }
- Worklist.push_back(
- {PHI->getIncomingValue(u), IncomingBB->getTerminator()});
- }
- continue;
- }
-
- if (auto *Arg = dyn_cast<Argument>(V)) {
- if ((VS & AA::Interprocedural) && !Arg->hasPassPointeeByValueCopyAttr()) {
- SmallVector<Item> CallSiteValues;
- bool UsedAssumedInformation = false;
- if (A.checkForAllCallSites(
- [&](AbstractCallSite ACS) {
- // Callbacks might not have a corresponding call site operand,
- // stick with the argument in that case.
- Value *CSOp = ACS.getCallArgOperand(*Arg);
- if (!CSOp)
- return false;
- CallSiteValues.push_back({CSOp, ACS.getInstruction()});
- return true;
- },
- *Arg->getParent(), true, &QueryingAA, UsedAssumedInformation)) {
- Worklist.append(CallSiteValues);
- continue;
- }
- }
- }
-
- if (UseValueSimplify && !isa<Constant>(V)) {
- Optional<Value *> SimpleV =
- A.getAssumedSimplified(*V, QueryingAA, UsedAssumedInformation);
- if (!SimpleV)
- continue;
- Value *NewV = SimpleV.value();
- if (NewV && NewV != V) {
- if ((VS & AA::Interprocedural) || !CtxI ||
- AA::isValidInScope(*NewV, CtxI->getFunction())) {
- Worklist.push_back({NewV, CtxI});
- continue;
- }
- }
- }
-
- if (auto *LI = dyn_cast<LoadInst>(V)) {
- bool UsedAssumedInformation = false;
- // If we ask for the potentially loaded values from the initial pointer we
- // will simply end up here again. The load is as far as we can make it.
- if (LI->getPointerOperand() != InitialV) {
- SmallSetVector<Value *, 4> PotentialCopies;
- SmallSetVector<Instruction *, 4> PotentialValueOrigins;
- if (AA::getPotentiallyLoadedValues(A, *LI, PotentialCopies,
- PotentialValueOrigins, QueryingAA,
- UsedAssumedInformation,
- /* OnlyExact */ true)) {
-          // Values have to be dynamically unique or we lose the fact that a
- // single llvm::Value might represent two runtime values (e.g., stack
- // locations in different recursive calls).
- bool DynamicallyUnique =
- llvm::all_of(PotentialCopies, [&A, &QueryingAA](Value *PC) {
- return AA::isDynamicallyUnique(A, QueryingAA, *PC);
- });
- if (DynamicallyUnique &&
- ((VS & AA::Interprocedural) || !CtxI ||
- llvm::all_of(PotentialCopies, [CtxI](Value *PC) {
- return AA::isValidInScope(*PC, CtxI->getFunction());
- }))) {
- for (auto *PotentialCopy : PotentialCopies)
- Worklist.push_back({PotentialCopy, CtxI});
- continue;
- }
- }
- }
- }
-
- // Once a leaf is reached we inform the user through the callback.
- if (!VisitValueCB(*V, CtxI, State, Iteration > 1)) {
- LLVM_DEBUG(dbgs() << "Generic value traversal visit callback failed for: "
- << *V << "!\n");
- return false;
- }
- } while (!Worklist.empty());
-
-  // If we actually used liveness information we have to record a dependence.
- for (auto &It : LivenessAAs)
- if (It.second.AnyDead)
- A.recordDependence(*It.second.LivenessAA, QueryingAA,
- DepClassTy::OPTIONAL);
-
- // All values have been visited.
- return true;
-}
-
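// The genericValueTraversal body removed above is a standard bounded worklist
// walk. A minimal, self-contained sketch of that pattern follows, assuming toy
// integer values and a hypothetical successor table in place of llvm::Value
// and the Attributor callbacks; only the visited set and the iteration budget
// are illustrated here.

#include <cstdio>
#include <set>
#include <vector>

static bool traverseBounded(int Root, int MaxValues,
                            const std::vector<std::vector<int>> &Successors) {
  std::set<int> Visited;
  std::vector<int> Worklist{Root};
  int Iteration = 0;
  do {
    int V = Worklist.back();
    Worklist.pop_back();
    // Skip values we already followed to prevent endless recursion.
    if (!Visited.insert(V).second)
      continue;
    // Limit compile time for complex expressions.
    if (Iteration++ >= MaxValues)
      return false;
    for (int Succ : Successors[V])
      Worklist.push_back(Succ);
    std::printf("visited %d\n", V); // Stand-in for the visit callback.
  } while (!Worklist.empty());
  return true;
}

int main() {
  // Node 0 -> {1, 2}, 1 -> {2}, 2 -> {}.
  return traverseBounded(0, 16, {{1, 2}, {2}, {}}) ? 0 : 1;
}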
bool AA::getAssumedUnderlyingObjects(Attributor &A, const Value &Ptr,
- SmallVectorImpl<Value *> &Objects,
+ SmallSetVector<Value *, 8> &Objects,
const AbstractAttribute &QueryingAA,
const Instruction *CtxI,
bool &UsedAssumedInformation,
- AA::ValueScope VS) {
- auto StripCB = [&](Value *V) { return getUnderlyingObject(V); };
- SmallPtrSet<Value *, 8> SeenObjects;
- auto VisitValueCB = [&SeenObjects](Value &Val, const Instruction *,
- SmallVectorImpl<Value *> &Objects,
- bool) -> bool {
- if (SeenObjects.insert(&Val).second)
- Objects.push_back(&Val);
+ AA::ValueScope S,
+ SmallPtrSetImpl<Value *> *SeenObjects) {
+ SmallPtrSet<Value *, 8> LocalSeenObjects;
+ if (!SeenObjects)
+ SeenObjects = &LocalSeenObjects;
+
+ SmallVector<AA::ValueAndContext> Values;
+ if (!A.getAssumedSimplifiedValues(IRPosition::value(Ptr), &QueryingAA, Values,
+ S, UsedAssumedInformation)) {
+ Objects.insert(const_cast<Value *>(&Ptr));
return true;
- };
- if (!genericValueTraversal<decltype(Objects)>(
- A, IRPosition::value(Ptr), QueryingAA, Objects, VisitValueCB, CtxI,
- UsedAssumedInformation, true, 32, StripCB, VS))
- return false;
+ }
+
+ for (auto &VAC : Values) {
+ Value *UO = getUnderlyingObject(VAC.getValue());
+ if (UO && UO != VAC.getValue() && SeenObjects->insert(UO).second) {
+ if (!getAssumedUnderlyingObjects(A, *UO, Objects, QueryingAA,
+ VAC.getCtxI(), UsedAssumedInformation, S,
+ SeenObjects))
+ return false;
+ continue;
+ }
+ Objects.insert(VAC.getValue());
+ }
return true;
}
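// The rewritten getAssumedUnderlyingObjects above recurses through simplified
// values and shares one seen-set across recursive calls so each base object is
// expanded at most once. A minimal sketch of that shape, assuming a toy
// "simplify" table and a toy "base object" table in place of
// Attributor::getAssumedSimplifiedValues and getUnderlyingObject:

#include <map>
#include <set>
#include <string>
#include <vector>

using ToyValue = std::string;

static void collectUnderlyingObjects(
    const ToyValue &Ptr,
    const std::map<ToyValue, std::vector<ToyValue>> &Simplified,
    const std::map<ToyValue, ToyValue> &BaseOf, std::set<ToyValue> &Objects,
    std::set<ToyValue> &Seen) {
  auto SIt = Simplified.find(Ptr);
  // No simplified values known: fall back to the pointer itself.
  if (SIt == Simplified.end()) {
    Objects.insert(Ptr);
    return;
  }
  for (const ToyValue &V : SIt->second) {
    auto BIt = BaseOf.find(V);
    // Recurse on a distinct, not-yet-seen base object.
    if (BIt != BaseOf.end() && BIt->second != V &&
        Seen.insert(BIt->second).second) {
      collectUnderlyingObjects(BIt->second, Simplified, BaseOf, Objects, Seen);
      continue;
    }
    Objects.insert(V);
  }
}

int main() {
  std::set<ToyValue> Objects, Seen;
  // "p" simplifies to "gep", whose base object is the allocation "alloca".
  collectUnderlyingObjects("p", {{"p", {"gep"}}}, {{"gep", "alloca"}}, Objects,
                           Seen);
  return Objects.count("alloca") ? 0 : 1;
}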
@@ -1122,9 +942,6 @@ struct AAPointerInfoImpl
using BaseTy = StateWrapper<AA::PointerInfo::State, AAPointerInfo>;
AAPointerInfoImpl(const IRPosition &IRP, Attributor &A) : BaseTy(IRP) {}
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override { AAPointerInfo::initialize(A); }
-
/// See AbstractAttribute::getAsStr().
const std::string getAsStr() const override {
return std::string("PointerInfo ") +
@@ -1144,9 +961,14 @@ struct AAPointerInfoImpl
const override {
return State::forallInterferingAccesses(OAS, CB);
}
- bool forallInterferingAccesses(
- Attributor &A, const AbstractAttribute &QueryingAA, Instruction &I,
- function_ref<bool(const Access &, bool)> UserCB) const override {
+
+ bool
+ forallInterferingAccesses(Attributor &A, const AbstractAttribute &QueryingAA,
+ Instruction &I,
+ function_ref<bool(const Access &, bool)> UserCB,
+ bool &HasBeenWrittenTo) const override {
+ HasBeenWrittenTo = false;
+
SmallPtrSet<const Access *, 8> DominatingWrites;
SmallVector<std::pair<const Access *, bool>, 8> InterferingAccesses;
@@ -1182,14 +1004,12 @@ struct AAPointerInfoImpl
const bool FindInterferingWrites = I.mayReadFromMemory();
const bool FindInterferingReads = I.mayWriteToMemory();
- const bool UseDominanceReasoning = FindInterferingWrites;
+ const bool UseDominanceReasoning =
+ FindInterferingWrites && NoRecurseAA.isKnownNoRecurse();
const bool CanUseCFGResoning = CanIgnoreThreading(I);
InformationCache &InfoCache = A.getInfoCache();
const DominatorTree *DT =
- NoRecurseAA.isKnownNoRecurse() && UseDominanceReasoning
- ? InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(
- Scope)
- : nullptr;
+ InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(Scope);
enum GPUAddressSpace : unsigned {
Generic = 0,
@@ -1246,22 +1066,17 @@ struct AAPointerInfoImpl
(!FindInterferingReads || !Acc.isRead()))
return true;
+ bool Dominates = DT && Exact && Acc.isMustAccess() &&
+ (Acc.getLocalInst()->getFunction() == &Scope) &&
+ DT->dominates(Acc.getRemoteInst(), &I);
+ if (FindInterferingWrites && Dominates)
+ HasBeenWrittenTo = true;
+
// For now we only filter accesses based on CFG reasoning which does not
// work yet if we have threading effects, or the access is complicated.
- if (CanUseCFGResoning) {
- if ((!Acc.isWrite() ||
- !AA::isPotentiallyReachable(A, *Acc.getLocalInst(), I, QueryingAA,
- IsLiveInCalleeCB)) &&
- (!Acc.isRead() ||
- !AA::isPotentiallyReachable(A, I, *Acc.getLocalInst(), QueryingAA,
- IsLiveInCalleeCB)))
- return true;
- if (DT && Exact && (Acc.getLocalInst()->getFunction() == &Scope) &&
- IsSameThreadAsLoad(Acc)) {
- if (DT->dominates(Acc.getLocalInst(), &I))
- DominatingWrites.insert(&Acc);
- }
- }
+ if (CanUseCFGResoning && Dominates && UseDominanceReasoning &&
+ IsSameThreadAsLoad(Acc))
+ DominatingWrites.insert(&Acc);
InterferingAccesses.push_back({&Acc, Exact});
return true;
@@ -1269,19 +1084,27 @@ struct AAPointerInfoImpl
if (!State::forallInterferingAccesses(I, AccessCB))
return false;
- // If we cannot use CFG reasoning we only filter the non-write accesses
- // and are done here.
- if (!CanUseCFGResoning) {
- for (auto &It : InterferingAccesses)
- if (!UserCB(*It.first, It.second))
- return false;
- return true;
+ if (HasBeenWrittenTo) {
+ const Function *ScopePtr = &Scope;
+ IsLiveInCalleeCB = [ScopePtr](const Function &Fn) {
+ return ScopePtr != &Fn;
+ };
}
// Helper to determine if we can skip a specific write access. This is in
// the worst case quadratic as we are looking for another write that will
// hide the effect of this one.
auto CanSkipAccess = [&](const Access &Acc, bool Exact) {
+ if ((!Acc.isWrite() ||
+ !AA::isPotentiallyReachable(A, *Acc.getLocalInst(), I, QueryingAA,
+ IsLiveInCalleeCB)) &&
+ (!Acc.isRead() ||
+ !AA::isPotentiallyReachable(A, I, *Acc.getLocalInst(), QueryingAA,
+ IsLiveInCalleeCB)))
+ return true;
+
+ if (!DT || !UseDominanceReasoning)
+ return false;
if (!IsSameThreadAsLoad(Acc))
return false;
if (!DominatingWrites.count(&Acc))
@@ -1303,7 +1126,7 @@ struct AAPointerInfoImpl
// succeeded for all or not.
unsigned NumInterferingAccesses = InterferingAccesses.size();
for (auto &It : InterferingAccesses) {
- if (!DT || NumInterferingAccesses > MaxInterferingAccesses ||
+ if (NumInterferingAccesses > MaxInterferingAccesses ||
!CanSkipAccess(*It.first, It.second)) {
if (!UserCB(*It.first, It.second))
return false;
@@ -1339,8 +1162,9 @@ struct AAPointerInfoImpl
if (FromCallee) {
Content = A.translateArgumentToCallSiteContent(
RAcc.getContent(), CB, *this, UsedAssumedInformation);
- AK = AccessKind(
- AK & (IsByval ? AccessKind::AK_READ : AccessKind::AK_READ_WRITE));
+ AK =
+ AccessKind(AK & (IsByval ? AccessKind::AK_R : AccessKind::AK_RW));
+ AK = AccessKind(AK | (RAcc.isMayAccess() ? AK_MAY : AK_MUST));
}
Changed =
Changed | addAccess(A, OAS.getOffset(), OAS.getSize(), CB, Content,
@@ -1353,6 +1177,27 @@ struct AAPointerInfoImpl
/// Statistic tracking for all AAPointerInfo implementations.
/// See AbstractAttribute::trackStatistics().
void trackPointerInfoStatistics(const IRPosition &IRP) const {}
+
+ /// Dump the state into \p O.
+ void dumpState(raw_ostream &O) {
+ for (auto &It : AccessBins) {
+ O << "[" << It.first.getOffset() << "-"
+ << It.first.getOffset() + It.first.getSize()
+ << "] : " << It.getSecond()->size() << "\n";
+ for (auto &Acc : *It.getSecond()) {
+ O << " - " << Acc.getKind() << " - " << *Acc.getLocalInst() << "\n";
+ if (Acc.getLocalInst() != Acc.getRemoteInst())
+ O << " --> " << *Acc.getRemoteInst()
+ << "\n";
+ if (!Acc.isWrittenValueYetUndetermined()) {
+ if (Acc.getWrittenValue())
+ O << " - c: " << *Acc.getWrittenValue() << "\n";
+ else
+ O << " - c: <unknown>\n";
+ }
+ }
+ }
+ }
};
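// The patch above folds a may/must bit into the access kind (AK_R/AK_W
// combined with AK_MAY/AK_MUST). A small standalone sketch of that flag
// composition; the enumerator names here are illustrative stand-ins, not the
// real AAPointerInfo::AccessKind values.

#include <cstdio>

enum ToyAccessKind : unsigned {
  TAK_R = 1u << 0,
  TAK_W = 1u << 1,
  TAK_MAY = 1u << 2,
  TAK_MUST = 1u << 3,
  TAK_MUST_READ = TAK_MUST | TAK_R,
  TAK_MUST_WRITE = TAK_MUST | TAK_W,
};

int main() {
  // A read through a pointer that only may be the associated value gets the
  // MAY bit instead of MUST, so later dominance reasoning stays conservative.
  ToyAccessKind AK = ToyAccessKind(TAK_R | TAK_MAY);
  std::printf("read=%d may=%d must=%d\n", (AK & TAK_R) != 0,
              (AK & TAK_MAY) != 0, (AK & TAK_MUST) != 0);
  return 0;
}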
struct AAPointerInfoFloating : public AAPointerInfoImpl {
@@ -1360,9 +1205,6 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
AAPointerInfoFloating(const IRPosition &IRP, Attributor &A)
: AAPointerInfoImpl(IRP, A) {}
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override { AAPointerInfoImpl::initialize(A); }
-
/// Deal with an access and signal if it was handled successfully.
bool handleAccess(Attributor &A, Instruction &I, Value &Ptr,
Optional<Value *> Content, AccessKind Kind, int64_t Offset,
@@ -1460,7 +1302,7 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
Follow = true;
return true;
}
- if (isa<CastInst>(Usr) || isa<SelectInst>(Usr))
+ if (isa<CastInst>(Usr) || isa<SelectInst>(Usr) || isa<ReturnInst>(Usr))
return HandlePassthroughUser(Usr, OffsetInfoMap[CurPtr], Follow);
// For PHIs we need to take care of the recurrence explicitly as the value
@@ -1469,6 +1311,7 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
if (isa<PHINode>(Usr)) {
// Note the order here, the Usr access might change the map, CurPtr is
// already in it though.
+ bool IsFirstPHIUser = !OffsetInfoMap.count(Usr);
OffsetInfo &UsrOI = OffsetInfoMap[Usr];
OffsetInfo &PtrOI = OffsetInfoMap[CurPtr];
// Check if the PHI is invariant (so far).
@@ -1484,52 +1327,69 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
}
// Check if the PHI operand is not dependent on the PHI itself.
- // TODO: This is not great as we look at the pointer type. However, it
- // is unclear where the Offset size comes from with typeless pointers.
APInt Offset(
DL.getIndexSizeInBits(CurPtr->getType()->getPointerAddressSpace()),
0);
- if (&AssociatedValue == CurPtr->stripAndAccumulateConstantOffsets(
- DL, Offset, /* AllowNonInbounds */ true)) {
- if (Offset != PtrOI.Offset) {
- LLVM_DEBUG(dbgs()
- << "[AAPointerInfo] PHI operand pointer offset mismatch "
- << *CurPtr << " in " << *Usr << "\n");
- return false;
- }
- return HandlePassthroughUser(Usr, PtrOI, Follow);
+ Value *CurPtrBase = CurPtr->stripAndAccumulateConstantOffsets(
+ DL, Offset, /* AllowNonInbounds */ true);
+ auto It = OffsetInfoMap.find(CurPtrBase);
+ if (It != OffsetInfoMap.end()) {
+ Offset += It->getSecond().Offset;
+ if (IsFirstPHIUser || Offset == UsrOI.Offset)
+ return HandlePassthroughUser(Usr, PtrOI, Follow);
+ LLVM_DEBUG(dbgs()
+ << "[AAPointerInfo] PHI operand pointer offset mismatch "
+ << *CurPtr << " in " << *Usr << "\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "[AAPointerInfo] PHI operand is too complex "
+ << *CurPtr << " in " << *Usr << "\n");
}
// TODO: Approximate in case we know the direction of the recurrence.
- LLVM_DEBUG(dbgs() << "[AAPointerInfo] PHI operand is too complex "
- << *CurPtr << " in " << *Usr << "\n");
UsrOI = PtrOI;
UsrOI.Offset = OffsetAndSize::Unknown;
Follow = true;
return true;
}
- if (auto *LoadI = dyn_cast<LoadInst>(Usr))
- return handleAccess(A, *LoadI, *CurPtr, /* Content */ nullptr,
- AccessKind::AK_READ, OffsetInfoMap[CurPtr].Offset,
- Changed, LoadI->getType());
+ if (auto *LoadI = dyn_cast<LoadInst>(Usr)) {
+ // If the access is to a pointer that may or may not be the associated
+ // value, e.g. due to a PHI, we cannot assume it will be read.
+ AccessKind AK = AccessKind::AK_R;
+ if (getUnderlyingObject(CurPtr) == &AssociatedValue)
+ AK = AccessKind(AK | AccessKind::AK_MUST);
+ else
+ AK = AccessKind(AK | AccessKind::AK_MAY);
+ return handleAccess(A, *LoadI, *CurPtr, /* Content */ nullptr, AK,
+ OffsetInfoMap[CurPtr].Offset, Changed,
+ LoadI->getType());
+ }
+
if (auto *StoreI = dyn_cast<StoreInst>(Usr)) {
if (StoreI->getValueOperand() == CurPtr) {
LLVM_DEBUG(dbgs() << "[AAPointerInfo] Escaping use in store "
<< *StoreI << "\n");
return false;
}
+ // If the access is to a pointer that may or may not be the associated
+ // value, e.g. due to a PHI, we cannot assume it will be written.
+ AccessKind AK = AccessKind::AK_W;
+ if (getUnderlyingObject(CurPtr) == &AssociatedValue)
+ AK = AccessKind(AK | AccessKind::AK_MUST);
+ else
+ AK = AccessKind(AK | AccessKind::AK_MAY);
bool UsedAssumedInformation = false;
- Optional<Value *> Content = A.getAssumedSimplified(
- *StoreI->getValueOperand(), *this, UsedAssumedInformation);
- return handleAccess(A, *StoreI, *CurPtr, Content, AccessKind::AK_WRITE,
+ Optional<Value *> Content =
+ A.getAssumedSimplified(*StoreI->getValueOperand(), *this,
+ UsedAssumedInformation, AA::Interprocedural);
+ return handleAccess(A, *StoreI, *CurPtr, Content, AK,
OffsetInfoMap[CurPtr].Offset, Changed,
StoreI->getValueOperand()->getType());
}
if (auto *CB = dyn_cast<CallBase>(Usr)) {
if (CB->isLifetimeStartOrEnd())
return true;
- if (TLI && isFreeCall(CB, TLI))
+ if (getFreedOperand(CB, TLI) == U)
return true;
if (CB->isArgOperand(&U)) {
unsigned ArgNo = CB->getArgOperandNo(&U);
@@ -1539,7 +1399,7 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
Changed = translateAndAddState(A, CSArgPI,
OffsetInfoMap[CurPtr].Offset, *CB) |
Changed;
- return true;
+ return isValidState();
}
LLVM_DEBUG(dbgs() << "[AAPointerInfo] Call user not handled " << *CB
<< "\n");
@@ -1551,36 +1411,30 @@ struct AAPointerInfoFloating : public AAPointerInfoImpl {
return false;
};
auto EquivalentUseCB = [&](const Use &OldU, const Use &NewU) {
- if (OffsetInfoMap.count(NewU))
+ if (OffsetInfoMap.count(NewU)) {
+ LLVM_DEBUG({
+ if (!(OffsetInfoMap[NewU] == OffsetInfoMap[OldU])) {
+ dbgs() << "[AAPointerInfo] Equivalent use callback failed: "
+ << OffsetInfoMap[NewU].Offset << " vs "
+ << OffsetInfoMap[OldU].Offset << "\n";
+ }
+ });
return OffsetInfoMap[NewU] == OffsetInfoMap[OldU];
+ }
OffsetInfoMap[NewU] = OffsetInfoMap[OldU];
return true;
};
if (!A.checkForAllUses(UsePred, *this, AssociatedValue,
/* CheckBBLivenessOnly */ true, DepClassTy::OPTIONAL,
- /* IgnoreDroppableUses */ true, EquivalentUseCB))
+ /* IgnoreDroppableUses */ true, EquivalentUseCB)) {
+ LLVM_DEBUG(
+ dbgs() << "[AAPointerInfo] Check for all uses failed, abort!\n");
return indicatePessimisticFixpoint();
+ }
LLVM_DEBUG({
dbgs() << "Accesses by bin after update:\n";
- for (auto &It : AccessBins) {
- dbgs() << "[" << It.first.getOffset() << "-"
- << It.first.getOffset() + It.first.getSize()
- << "] : " << It.getSecond()->size() << "\n";
- for (auto &Acc : *It.getSecond()) {
- dbgs() << " - " << Acc.getKind() << " - " << *Acc.getLocalInst()
- << "\n";
- if (Acc.getLocalInst() != Acc.getRemoteInst())
- dbgs() << " --> "
- << *Acc.getRemoteInst() << "\n";
- if (!Acc.isWrittenValueYetUndetermined()) {
- if (Acc.getWrittenValue())
- dbgs() << " - c: " << *Acc.getWrittenValue() << "\n";
- else
- dbgs() << " - c: <unknown>\n";
- }
- }
- }
+ dumpState(dbgs());
});
return Changed;
@@ -1643,16 +1497,22 @@ struct AAPointerInfoCallSiteArgument final : AAPointerInfoFloating {
unsigned ArgNo = getIRPosition().getCallSiteArgNo();
ChangeStatus Changed = ChangeStatus::UNCHANGED;
if (ArgNo == 0) {
- handleAccess(A, *MI, Ptr, nullptr, AccessKind::AK_WRITE, 0, Changed,
- nullptr, LengthVal);
+ handleAccess(A, *MI, Ptr, nullptr, AccessKind::AK_MUST_WRITE, 0,
+ Changed, nullptr, LengthVal);
} else if (ArgNo == 1) {
- handleAccess(A, *MI, Ptr, nullptr, AccessKind::AK_READ, 0, Changed,
+ handleAccess(A, *MI, Ptr, nullptr, AccessKind::AK_MUST_READ, 0, Changed,
nullptr, LengthVal);
} else {
LLVM_DEBUG(dbgs() << "[AAPointerInfo] Unhandled memory intrinsic "
<< *MI << "\n");
return indicatePessimisticFixpoint();
}
+
+ LLVM_DEBUG({
+ dbgs() << "Accesses by bin after update:\n";
+ dumpState(dbgs());
+ });
+
return Changed;
}
@@ -1954,23 +1814,23 @@ bool AAReturnedValuesImpl::checkForAllReturnedValuesAndReturnInsts(
ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) {
ChangeStatus Changed = ChangeStatus::UNCHANGED;
- auto ReturnValueCB = [&](Value &V, const Instruction *CtxI, ReturnInst &Ret,
- bool) -> bool {
- assert(AA::isValidInScope(V, Ret.getFunction()) &&
- "Assumed returned value should be valid in function scope!");
- if (ReturnedValues[&V].insert(&Ret))
- Changed = ChangeStatus::CHANGED;
- return true;
- };
-
+ SmallVector<AA::ValueAndContext> Values;
bool UsedAssumedInformation = false;
auto ReturnInstCB = [&](Instruction &I) {
ReturnInst &Ret = cast<ReturnInst>(I);
- return genericValueTraversal<ReturnInst>(
- A, IRPosition::value(*Ret.getReturnValue()), *this, Ret, ReturnValueCB,
- &I, UsedAssumedInformation, /* UseValueSimplify */ true,
- /* MaxValues */ 16,
- /* StripCB */ nullptr, AA::Intraprocedural);
+ Values.clear();
+ if (!A.getAssumedSimplifiedValues(IRPosition::value(*Ret.getReturnValue()),
+ *this, Values, AA::Intraprocedural,
+ UsedAssumedInformation))
+ Values.push_back({*Ret.getReturnValue(), Ret});
+
+ for (auto &VAC : Values) {
+ assert(AA::isValidInScope(*VAC.getValue(), Ret.getFunction()) &&
+ "Assumed returned value should be valid in function scope!");
+ if (ReturnedValues[VAC.getValue()].insert(&Ret))
+ Changed = ChangeStatus::CHANGED;
+ }
+ return true;
};
// Discover returned values from all live returned instructions in the
@@ -2472,6 +2332,18 @@ struct AANonNullFloating : public AANonNullImpl {
ChangeStatus updateImpl(Attributor &A) override {
const DataLayout &DL = A.getDataLayout();
+ bool Stripped;
+ bool UsedAssumedInformation = false;
+ SmallVector<AA::ValueAndContext> Values;
+ if (!A.getAssumedSimplifiedValues(getIRPosition(), *this, Values,
+ AA::AnyScope, UsedAssumedInformation)) {
+ Values.push_back({getAssociatedValue(), getCtxI()});
+ Stripped = false;
+ } else {
+ Stripped = Values.size() != 1 ||
+ Values.front().getValue() != &getAssociatedValue();
+ }
+
DominatorTree *DT = nullptr;
AssumptionCache *AC = nullptr;
InformationCache &InfoCache = A.getInfoCache();
@@ -2480,8 +2352,8 @@ struct AANonNullFloating : public AANonNullImpl {
AC = InfoCache.getAnalysisResultForFunction<AssumptionAnalysis>(*Fn);
}
- auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
- AANonNull::StateType &T, bool Stripped) -> bool {
+ AANonNull::StateType T;
+ auto VisitValueCB = [&](Value &V, const Instruction *CtxI) -> bool {
const auto &AA = A.getAAFor<AANonNull>(*this, IRPosition::value(V),
DepClassTy::REQUIRED);
if (!Stripped && this == &AA) {
@@ -2495,12 +2367,9 @@ struct AANonNullFloating : public AANonNullImpl {
return T.isValidState();
};
- StateType T;
- bool UsedAssumedInformation = false;
- if (!genericValueTraversal<StateType>(A, getIRPosition(), *this, T,
- VisitValueCB, getCtxI(),
- UsedAssumedInformation))
- return indicatePessimisticFixpoint();
+ for (const auto &VAC : Values)
+ if (!VisitValueCB(*VAC.getValue(), VAC.getCtxI()))
+ return indicatePessimisticFixpoint();
return clampStateAndIndicateChange(getState(), T);
}
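// The updateImpl above replaces the generic traversal with a fixed recipe:
// ask for the simplified candidate values, fall back to the associated value
// if that query fails, then fold per-value facts into one state. A standalone
// sketch of that recipe, with a hypothetical integer "value" and callbacks
// standing in for the Attributor queries:

#include <functional>
#include <optional>
#include <vector>

struct ToyNonNullState {
  bool NonNull = true;
  void meet(bool OtherNonNull) { NonNull &= OtherNonNull; }
};

static ToyNonNullState updateFromCandidates(
    int Original,
    const std::function<std::optional<std::vector<int>>(int)> &Simplify,
    const std::function<bool(int)> &IsNonNull) {
  std::vector<int> Values;
  if (std::optional<std::vector<int>> Simplified = Simplify(Original))
    Values = *Simplified;
  else
    Values.push_back(Original); // Fallback: visit the unsimplified value.

  ToyNonNullState T;
  for (int V : Values)
    T.meet(IsNonNull(V));
  return T;
}

int main() {
  auto Simplify = [](int) -> std::optional<std::vector<int>> {
    return std::vector<int>{1, 2}; // Pretend the value simplifies to {1, 2}.
  };
  auto IsNonNull = [](int V) { return V != 0; };
  return updateFromCandidates(7, Simplify, IsNonNull).NonNull ? 0 : 1;
}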
@@ -2753,8 +2622,9 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
if (!NoUndefAA.isKnownNoUndef())
continue;
bool UsedAssumedInformation = false;
- Optional<Value *> SimplifiedVal = A.getAssumedSimplified(
- IRPosition::value(*ArgVal), *this, UsedAssumedInformation);
+ Optional<Value *> SimplifiedVal =
+ A.getAssumedSimplified(IRPosition::value(*ArgVal), *this,
+ UsedAssumedInformation, AA::Interprocedural);
if (UsedAssumedInformation)
continue;
if (SimplifiedVal && !SimplifiedVal.value())
@@ -2925,8 +2795,9 @@ private:
Optional<Value *> stopOnUndefOrAssumed(Attributor &A, Value *V,
Instruction *I) {
bool UsedAssumedInformation = false;
- Optional<Value *> SimplifiedV = A.getAssumedSimplified(
- IRPosition::value(*V), *this, UsedAssumedInformation);
+ Optional<Value *> SimplifiedV =
+ A.getAssumedSimplified(IRPosition::value(*V), *this,
+ UsedAssumedInformation, AA::Interprocedural);
if (!UsedAssumedInformation) {
// Don't depend on assumed values.
if (!SimplifiedV) {
@@ -3369,7 +3240,9 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl {
}
}
- if (!AA::isPotentiallyReachable(A, *UserI, *getCtxI(), *this))
+ if (!AA::isPotentiallyReachable(
+ A, *UserI, *getCtxI(), *this,
+ [ScopeFn](const Function &Fn) { return &Fn != ScopeFn; }))
return true;
}
@@ -4364,10 +4237,23 @@ struct AADereferenceableFloating : AADereferenceableImpl {
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
+
+ bool Stripped;
+ bool UsedAssumedInformation = false;
+ SmallVector<AA::ValueAndContext> Values;
+ if (!A.getAssumedSimplifiedValues(getIRPosition(), *this, Values,
+ AA::AnyScope, UsedAssumedInformation)) {
+ Values.push_back({getAssociatedValue(), getCtxI()});
+ Stripped = false;
+ } else {
+ Stripped = Values.size() != 1 ||
+ Values.front().getValue() != &getAssociatedValue();
+ }
+
const DataLayout &DL = A.getDataLayout();
+ DerefState T;
- auto VisitValueCB = [&](const Value &V, const Instruction *, DerefState &T,
- bool Stripped) -> bool {
+ auto VisitValueCB = [&](const Value &V) -> bool {
unsigned IdxWidth =
DL.getIndexSizeInBits(V.getType()->getPointerAddressSpace());
APInt Offset(IdxWidth, 0);
@@ -4420,12 +4306,9 @@ struct AADereferenceableFloating : AADereferenceableImpl {
return T.isValidState();
};
- DerefState T;
- bool UsedAssumedInformation = false;
- if (!genericValueTraversal<DerefState>(A, getIRPosition(), *this, T,
- VisitValueCB, getCtxI(),
- UsedAssumedInformation))
- return indicatePessimisticFixpoint();
+ for (const auto &VAC : Values)
+ if (!VisitValueCB(*VAC.getValue()))
+ return indicatePessimisticFixpoint();
return clampStateAndIndicateChange(getState(), T);
}
@@ -4652,8 +4535,20 @@ struct AAAlignFloating : AAAlignImpl {
ChangeStatus updateImpl(Attributor &A) override {
const DataLayout &DL = A.getDataLayout();
- auto VisitValueCB = [&](Value &V, const Instruction *,
- AAAlign::StateType &T, bool Stripped) -> bool {
+ bool Stripped;
+ bool UsedAssumedInformation = false;
+ SmallVector<AA::ValueAndContext> Values;
+ if (!A.getAssumedSimplifiedValues(getIRPosition(), *this, Values,
+ AA::AnyScope, UsedAssumedInformation)) {
+ Values.push_back({getAssociatedValue(), getCtxI()});
+ Stripped = false;
+ } else {
+ Stripped = Values.size() != 1 ||
+ Values.front().getValue() != &getAssociatedValue();
+ }
+
+ StateType T;
+ auto VisitValueCB = [&](Value &V) -> bool {
if (isa<UndefValue>(V) || isa<ConstantPointerNull>(V))
return true;
const auto &AA = A.getAAFor<AAAlign>(*this, IRPosition::value(V),
@@ -4686,15 +4581,13 @@ struct AAAlignFloating : AAAlignImpl {
return T.isValidState();
};
- StateType T;
- bool UsedAssumedInformation = false;
- if (!genericValueTraversal<StateType>(A, getIRPosition(), *this, T,
- VisitValueCB, getCtxI(),
- UsedAssumedInformation))
- return indicatePessimisticFixpoint();
+ for (const auto &VAC : Values) {
+ if (!VisitValueCB(*VAC.getValue()))
+ return indicatePessimisticFixpoint();
+ }
-  // TODO: If we know we visited all incoming values, thus none are assumed
-  // dead, we can take the known information from the state T.
+    // TODO: If we know we visited all incoming values, thus none are assumed
+    // dead, we can take the known information from the state T.
return clampStateAndIndicateChange(getState(), T);
}
@@ -4941,7 +4834,9 @@ struct AAInstanceInfoImpl : public AAInstanceInfo {
return false;
// If this call base might reach the scope again we might forward the
// argument back here. This is very conservative.
- if (AA::isPotentiallyReachable(A, *CB, *Scope, *this, nullptr))
+ if (AA::isPotentiallyReachable(
+ A, *CB, *Scope, *this,
+ [Scope](const Function &Fn) { return &Fn != Scope; }))
return false;
return true;
}
@@ -5518,9 +5413,9 @@ struct AAValueSimplifyImpl : AAValueSimplify {
if (const auto &NewV = VMap.lookup(&V))
return NewV;
bool UsedAssumedInformation = false;
- Optional<Value *> SimpleV =
- A.getAssumedSimplified(V, QueryingAA, UsedAssumedInformation);
- if (!SimpleV)
+ Optional<Value *> SimpleV = A.getAssumedSimplified(
+ V, QueryingAA, UsedAssumedInformation, AA::Interprocedural);
+ if (!SimpleV.has_value())
return PoisonValue::get(&Ty);
Value *EffectiveV = &V;
if (SimpleV.value())
@@ -5561,8 +5456,8 @@ struct AAValueSimplifyImpl : AAValueSimplify {
bool UsedAssumedInformation = false;
Optional<Value *> QueryingValueSimplified = &IRP.getAssociatedValue();
if (Simplify)
- QueryingValueSimplified =
- A.getAssumedSimplified(IRP, QueryingAA, UsedAssumedInformation);
+ QueryingValueSimplified = A.getAssumedSimplified(
+ IRP, QueryingAA, UsedAssumedInformation, AA::Interprocedural);
return unionAssumed(QueryingValueSimplified);
}
@@ -5763,209 +5658,11 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl {
indicatePessimisticFixpoint();
}
- /// Check if \p Cmp is a comparison we can simplify.
- ///
- /// We handle multiple cases, one in which at least one operand is an
- /// (assumed) nullptr. If so, try to simplify it using AANonNull on the other
- /// operand. Return true if successful, in that case SimplifiedAssociatedValue
- /// will be updated.
- bool handleCmp(Attributor &A, CmpInst &Cmp) {
- auto Union = [&](Value &V) {
- SimplifiedAssociatedValue = AA::combineOptionalValuesInAAValueLatice(
- SimplifiedAssociatedValue, &V, V.getType());
- return SimplifiedAssociatedValue != Optional<Value *>(nullptr);
- };
-
- Value *LHS = Cmp.getOperand(0);
- Value *RHS = Cmp.getOperand(1);
-
- // Simplify the operands first.
- bool UsedAssumedInformation = false;
- const auto &SimplifiedLHS =
- A.getAssumedSimplified(IRPosition::value(*LHS, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedLHS)
- return true;
- if (!SimplifiedLHS.value())
- return false;
- LHS = *SimplifiedLHS;
-
- const auto &SimplifiedRHS =
- A.getAssumedSimplified(IRPosition::value(*RHS, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedRHS)
- return true;
- if (!SimplifiedRHS.value())
- return false;
- RHS = *SimplifiedRHS;
-
- LLVMContext &Ctx = Cmp.getContext();
- // Handle the trivial case first in which we don't even need to think about
- // null or non-null.
- if (LHS == RHS && (Cmp.isTrueWhenEqual() || Cmp.isFalseWhenEqual())) {
- Constant *NewVal =
- ConstantInt::get(Type::getInt1Ty(Ctx), Cmp.isTrueWhenEqual());
- if (!Union(*NewVal))
- return false;
- if (!UsedAssumedInformation)
- indicateOptimisticFixpoint();
- return true;
- }
-
- // From now on we only handle equalities (==, !=).
- ICmpInst *ICmp = dyn_cast<ICmpInst>(&Cmp);
- if (!ICmp || !ICmp->isEquality())
- return false;
-
- bool LHSIsNull = isa<ConstantPointerNull>(LHS);
- bool RHSIsNull = isa<ConstantPointerNull>(RHS);
- if (!LHSIsNull && !RHSIsNull)
- return false;
-
- // Left is the nullptr ==/!= non-nullptr case. We'll use AANonNull on the
- // non-nullptr operand and if we assume it's non-null we can conclude the
- // result of the comparison.
- assert((LHSIsNull || RHSIsNull) &&
- "Expected nullptr versus non-nullptr comparison at this point");
-
- // The index is the operand that we assume is not null.
- unsigned PtrIdx = LHSIsNull;
- auto &PtrNonNullAA = A.getAAFor<AANonNull>(
- *this, IRPosition::value(*ICmp->getOperand(PtrIdx)),
- DepClassTy::REQUIRED);
- if (!PtrNonNullAA.isAssumedNonNull())
- return false;
- UsedAssumedInformation |= !PtrNonNullAA.isKnownNonNull();
-
- // The new value depends on the predicate, true for != and false for ==.
- Constant *NewVal = ConstantInt::get(
- Type::getInt1Ty(Ctx), ICmp->getPredicate() == CmpInst::ICMP_NE);
- if (!Union(*NewVal))
- return false;
-
- if (!UsedAssumedInformation)
- indicateOptimisticFixpoint();
-
- return true;
- }
-
-  /// Use the generic, non-optimistic InstSimplify functionality if we managed to
- /// simplify any operand of the instruction \p I. Return true if successful,
- /// in that case SimplifiedAssociatedValue will be updated.
- bool handleGenericInst(Attributor &A, Instruction &I) {
- bool SomeSimplified = false;
- bool UsedAssumedInformation = false;
-
- SmallVector<Value *, 8> NewOps(I.getNumOperands());
- int Idx = 0;
- for (Value *Op : I.operands()) {
- const auto &SimplifiedOp =
- A.getAssumedSimplified(IRPosition::value(*Op, getCallBaseContext()),
- *this, UsedAssumedInformation);
- // If we are not sure about any operand we are not sure about the entire
- // instruction, we'll wait.
- if (!SimplifiedOp)
- return true;
-
- if (SimplifiedOp.value())
- NewOps[Idx] = SimplifiedOp.value();
- else
- NewOps[Idx] = Op;
-
- SomeSimplified |= (NewOps[Idx] != Op);
- ++Idx;
- }
-
- // We won't bother with the InstSimplify interface if we didn't simplify any
- // operand ourselves.
- if (!SomeSimplified)
- return false;
-
- InformationCache &InfoCache = A.getInfoCache();
- Function *F = I.getFunction();
- const auto *DT =
- InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(*F);
- const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
- auto *AC = InfoCache.getAnalysisResultForFunction<AssumptionAnalysis>(*F);
- OptimizationRemarkEmitter *ORE = nullptr;
-
- const DataLayout &DL = I.getModule()->getDataLayout();
- SimplifyQuery Q(DL, TLI, DT, AC, &I);
- if (Value *SimplifiedI =
- simplifyInstructionWithOperands(&I, NewOps, Q, ORE)) {
- SimplifiedAssociatedValue = AA::combineOptionalValuesInAAValueLatice(
- SimplifiedAssociatedValue, SimplifiedI, I.getType());
- return SimplifiedAssociatedValue != Optional<Value *>(nullptr);
- }
- return false;
- }
-
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
auto Before = SimplifiedAssociatedValue;
-
- // Do not simplify loads that are only used in llvm.assume if we cannot also
- // remove all stores that may feed into the load. The reason is that the
- // assume is probably worth something as long as the stores are around.
- if (auto *LI = dyn_cast<LoadInst>(&getAssociatedValue())) {
- InformationCache &InfoCache = A.getInfoCache();
- if (InfoCache.isOnlyUsedByAssume(*LI)) {
- SmallSetVector<Value *, 4> PotentialCopies;
- SmallSetVector<Instruction *, 4> PotentialValueOrigins;
- bool UsedAssumedInformation = false;
- if (AA::getPotentiallyLoadedValues(A, *LI, PotentialCopies,
- PotentialValueOrigins, *this,
- UsedAssumedInformation,
- /* OnlyExact */ true)) {
- if (!llvm::all_of(PotentialValueOrigins, [&](Instruction *I) {
- if (!I)
- return true;
- if (auto *SI = dyn_cast<StoreInst>(I))
- return A.isAssumedDead(SI->getOperandUse(0), this,
- /* LivenessAA */ nullptr,
- UsedAssumedInformation,
- /* CheckBBLivenessOnly */ false);
- return A.isAssumedDead(*I, this, /* LivenessAA */ nullptr,
- UsedAssumedInformation,
- /* CheckBBLivenessOnly */ false);
- }))
- return indicatePessimisticFixpoint();
- }
- }
- }
-
- auto VisitValueCB = [&](Value &V, const Instruction *CtxI, bool &,
- bool Stripped) -> bool {
- auto &AA = A.getAAFor<AAValueSimplify>(
- *this, IRPosition::value(V, getCallBaseContext()),
- DepClassTy::REQUIRED);
- if (!Stripped && this == &AA) {
-
- if (auto *I = dyn_cast<Instruction>(&V)) {
- if (auto *Cmp = dyn_cast<CmpInst>(&V))
- if (handleCmp(A, *Cmp))
- return true;
- if (handleGenericInst(A, *I))
- return true;
- }
-      // TODO: Look at the instruction and check recursively.
-
- LLVM_DEBUG(dbgs() << "[ValueSimplify] Can't be stripped more : " << V
- << "\n");
- return false;
- }
- return checkAndUpdate(A, *this,
- IRPosition::value(V, getCallBaseContext()));
- };
-
- bool Dummy = false;
- bool UsedAssumedInformation = false;
- if (!genericValueTraversal<bool>(A, getIRPosition(), *this, Dummy,
- VisitValueCB, getCtxI(),
- UsedAssumedInformation,
- /* UseValueSimplify */ false))
- if (!askSimplifiedValueForOtherAAs(A))
- return indicatePessimisticFixpoint();
+ if (!askSimplifiedValueForOtherAAs(A))
+ return indicatePessimisticFixpoint();
    // If a candidate was found in this update, return CHANGED.
return Before == SimplifiedAssociatedValue ? ChangeStatus::UNCHANGED
@@ -6122,6 +5819,8 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
struct DeallocationInfo {
/// The call that deallocates the memory.
CallBase *const CB;
+ /// The value freed by the call.
+ Value *FreedOp;
/// Flag to indicate if we don't know all objects this deallocation might
/// free.
@@ -6153,14 +5852,14 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
CallBase *CB = dyn_cast<CallBase>(&I);
if (!CB)
return true;
- if (isFreeCall(CB, TLI)) {
- DeallocationInfos[CB] = new (A.Allocator) DeallocationInfo{CB};
+ if (Value *FreedOp = getFreedOperand(CB, TLI)) {
+ DeallocationInfos[CB] = new (A.Allocator) DeallocationInfo{CB, FreedOp};
return true;
}
// To do heap to stack, we need to know that the allocation itself is
// removable once uses are rewritten, and that we can initialize the
// alloca to the same pattern as the original allocation result.
- if (isAllocationFn(CB, TLI) && isAllocRemovable(CB, TLI)) {
+ if (isRemovableAlloc(CB, TLI)) {
auto *I8Ty = Type::getInt8Ty(CB->getParent()->getContext());
if (nullptr != getInitialValueOfAllocation(CB, TLI, I8Ty)) {
AllocationInfo *AI = new (A.Allocator) AllocationInfo{CB};
@@ -6427,44 +6126,36 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) {
/* CheckBBLivenessOnly */ true))
continue;
- // Use the optimistic version to get the freed objects, ignoring dead
- // branches etc.
- SmallVector<Value *, 8> Objects;
- if (!AA::getAssumedUnderlyingObjects(A, *DI.CB->getArgOperand(0), Objects,
- *this, DI.CB,
- UsedAssumedInformation)) {
- LLVM_DEBUG(
- dbgs()
- << "[H2S] Unexpected failure in getAssumedUnderlyingObjects!\n");
+ // Use the non-optimistic version to get the freed object.
+ Value *Obj = getUnderlyingObject(DI.FreedOp);
+ if (!Obj) {
+ LLVM_DEBUG(dbgs() << "[H2S] Unknown underlying object for free!\n");
DI.MightFreeUnknownObjects = true;
continue;
}
- // Check each object explicitly.
- for (auto *Obj : Objects) {
- // Free of null and undef can be ignored as no-ops (or UB in the latter
- // case).
- if (isa<ConstantPointerNull>(Obj) || isa<UndefValue>(Obj))
- continue;
-
- CallBase *ObjCB = dyn_cast<CallBase>(Obj);
- if (!ObjCB) {
- LLVM_DEBUG(dbgs()
- << "[H2S] Free of a non-call object: " << *Obj << "\n");
- DI.MightFreeUnknownObjects = true;
- continue;
- }
+ // Free of null and undef can be ignored as no-ops (or UB in the latter
+ // case).
+ if (isa<ConstantPointerNull>(Obj) || isa<UndefValue>(Obj))
+ continue;
- AllocationInfo *AI = AllocationInfos.lookup(ObjCB);
- if (!AI) {
- LLVM_DEBUG(dbgs() << "[H2S] Free of a non-allocation object: " << *Obj
- << "\n");
- DI.MightFreeUnknownObjects = true;
- continue;
- }
+ CallBase *ObjCB = dyn_cast<CallBase>(Obj);
+ if (!ObjCB) {
+ LLVM_DEBUG(dbgs() << "[H2S] Free of a non-call object: " << *Obj
+ << "\n");
+ DI.MightFreeUnknownObjects = true;
+ continue;
+ }
- DI.PotentialAllocationCalls.insert(ObjCB);
+ AllocationInfo *AI = AllocationInfos.lookup(ObjCB);
+ if (!AI) {
+ LLVM_DEBUG(dbgs() << "[H2S] Free of a non-allocation object: " << *Obj
+ << "\n");
+ DI.MightFreeUnknownObjects = true;
+ continue;
}
+
+ DI.PotentialAllocationCalls.insert(ObjCB);
}
};
@@ -7692,7 +7383,7 @@ bool AAMemoryBehaviorFloating::followUsersOfUseIn(Attributor &A, const Use &U,
const Instruction *UserI) {
// The loaded value is unrelated to the pointer argument, no need to
// follow the users of the load.
- if (isa<LoadInst>(UserI))
+ if (isa<LoadInst>(UserI) || isa<ReturnInst>(UserI))
return false;
// By default we follow all uses assuming UserI might leak information on U,
@@ -7822,16 +7513,15 @@ struct AAMemoryLocationImpl : public AAMemoryLocation {
AAMemoryLocationImpl(const IRPosition &IRP, Attributor &A)
: AAMemoryLocation(IRP, A), Allocator(A.Allocator) {
- for (unsigned u = 0; u < llvm::CTLog2<VALID_STATE>(); ++u)
- AccessKind2Accesses[u] = nullptr;
+ AccessKind2Accesses.fill(nullptr);
}
~AAMemoryLocationImpl() {
// The AccessSets are allocated via a BumpPtrAllocator, we call
// the destructor manually.
- for (unsigned u = 0; u < llvm::CTLog2<VALID_STATE>(); ++u)
- if (AccessKind2Accesses[u])
- AccessKind2Accesses[u]->~AccessSet();
+ for (AccessSet *AS : AccessKind2Accesses)
+ if (AS)
+ AS->~AccessSet();
}
/// See AbstractAttribute::initialize(...).
@@ -7999,7 +7689,7 @@ protected:
/// Mapping from *single* memory location kinds, e.g., LOCAL_MEM with the
/// value of NO_LOCAL_MEM, to the accesses encountered for this memory kind.
using AccessSet = SmallSet<AccessInfo, 2, AccessInfo>;
- AccessSet *AccessKind2Accesses[llvm::CTLog2<VALID_STATE>()];
+ std::array<AccessSet *, llvm::CTLog2<VALID_STATE>()> AccessKind2Accesses;
/// Categorize the pointer arguments of CB that might access memory in
/// AccessedLoc and update the state and access map accordingly.
@@ -8061,7 +7751,7 @@ void AAMemoryLocationImpl::categorizePtrValue(
<< Ptr << " ["
<< getMemoryLocationsAsStr(State.getAssumed()) << "]\n");
- SmallVector<Value *, 8> Objects;
+ SmallSetVector<Value *, 8> Objects;
bool UsedAssumedInformation = false;
if (!AA::getAssumedUnderlyingObjects(A, Ptr, Objects, *this, &I,
UsedAssumedInformation,
@@ -8670,19 +8360,19 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
// Simplify the operands first.
bool UsedAssumedInformation = false;
- const auto &SimplifiedLHS =
- A.getAssumedSimplified(IRPosition::value(*LHS, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedLHS)
+ const auto &SimplifiedLHS = A.getAssumedSimplified(
+ IRPosition::value(*LHS, getCallBaseContext()), *this,
+ UsedAssumedInformation, AA::Interprocedural);
+ if (!SimplifiedLHS.has_value())
return true;
if (!SimplifiedLHS.value())
return false;
LHS = *SimplifiedLHS;
- const auto &SimplifiedRHS =
- A.getAssumedSimplified(IRPosition::value(*RHS, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedRHS)
+ const auto &SimplifiedRHS = A.getAssumedSimplified(
+ IRPosition::value(*RHS, getCallBaseContext()), *this,
+ UsedAssumedInformation, AA::Interprocedural);
+ if (!SimplifiedRHS.has_value())
return true;
if (!SimplifiedRHS.value())
return false;
@@ -8723,10 +8413,10 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
// Simplify the operand first.
bool UsedAssumedInformation = false;
- const auto &SimplifiedOpV =
- A.getAssumedSimplified(IRPosition::value(*OpV, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedOpV)
+ const auto &SimplifiedOpV = A.getAssumedSimplified(
+ IRPosition::value(*OpV, getCallBaseContext()), *this,
+ UsedAssumedInformation, AA::Interprocedural);
+ if (!SimplifiedOpV.has_value())
return true;
if (!SimplifiedOpV.value())
return false;
@@ -8753,19 +8443,19 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
// Simplify the operands first.
bool UsedAssumedInformation = false;
- const auto &SimplifiedLHS =
- A.getAssumedSimplified(IRPosition::value(*LHS, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedLHS)
+ const auto &SimplifiedLHS = A.getAssumedSimplified(
+ IRPosition::value(*LHS, getCallBaseContext()), *this,
+ UsedAssumedInformation, AA::Interprocedural);
+ if (!SimplifiedLHS.has_value())
return true;
if (!SimplifiedLHS.value())
return false;
LHS = *SimplifiedLHS;
- const auto &SimplifiedRHS =
- A.getAssumedSimplified(IRPosition::value(*RHS, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedRHS)
+ const auto &SimplifiedRHS = A.getAssumedSimplified(
+ IRPosition::value(*RHS, getCallBaseContext()), *this,
+ UsedAssumedInformation, AA::Interprocedural);
+ if (!SimplifiedRHS.has_value())
return true;
if (!SimplifiedRHS.value())
return false;
@@ -8820,17 +8510,18 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
- auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
- IntegerRangeState &T, bool Stripped) -> bool {
+
+ IntegerRangeState T(getBitWidth());
+ auto VisitValueCB = [&](Value &V, const Instruction *CtxI) -> bool {
Instruction *I = dyn_cast<Instruction>(&V);
if (!I || isa<CallBase>(I)) {
// Simplify the operand first.
bool UsedAssumedInformation = false;
- const auto &SimplifiedOpV =
- A.getAssumedSimplified(IRPosition::value(V, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedOpV)
+ const auto &SimplifiedOpV = A.getAssumedSimplified(
+ IRPosition::value(V, getCallBaseContext()), *this,
+ UsedAssumedInformation, AA::Interprocedural);
+ if (!SimplifiedOpV.has_value())
return true;
if (!SimplifiedOpV.value())
return false;
@@ -8880,13 +8571,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
return T.isValidState();
};
- IntegerRangeState T(getBitWidth());
-
- bool UsedAssumedInformation = false;
- if (!genericValueTraversal<IntegerRangeState>(A, getIRPosition(), *this, T,
- VisitValueCB, getCtxI(),
- UsedAssumedInformation,
- /* UseValueSimplify */ false))
+ if (!VisitValueCB(getAssociatedValue(), getCtxI()))
return indicatePessimisticFixpoint();
// Ensure that long def-use chains can't cause circular reasoning either by
@@ -8998,6 +8683,36 @@ struct AAPotentialConstantValuesImpl : AAPotentialConstantValues {
AAPotentialConstantValues::initialize(A);
}
+ bool fillSetWithConstantValues(Attributor &A, const IRPosition &IRP, SetTy &S,
+ bool &ContainsUndef) {
+ SmallVector<AA::ValueAndContext> Values;
+ bool UsedAssumedInformation = false;
+ if (!A.getAssumedSimplifiedValues(IRP, *this, Values, AA::Interprocedural,
+ UsedAssumedInformation)) {
+ if (!IRP.getAssociatedType()->isIntegerTy())
+ return false;
+ auto &PotentialValuesAA = A.getAAFor<AAPotentialConstantValues>(
+ *this, IRP, DepClassTy::REQUIRED);
+ if (!PotentialValuesAA.getState().isValidState())
+ return false;
+ ContainsUndef = PotentialValuesAA.getState().undefIsContained();
+ S = PotentialValuesAA.getState().getAssumedSet();
+ return true;
+ }
+
+ for (auto &It : Values) {
+ if (isa<UndefValue>(It.getValue()))
+ continue;
+ auto *CI = dyn_cast<ConstantInt>(It.getValue());
+ if (!CI)
+ return false;
+ S.insert(CI->getValue());
+ }
+ ContainsUndef = S.empty();
+
+ return true;
+ }
+
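// fillSetWithConstantValues above produces a constant set plus a
// ContainsUndef flag per operand. A standalone sketch, with int64_t standing
// in for APInt, of how two such results are typically combined for a binary
// operator below: an operand that only contained undef is treated as zero,
// otherwise every pair of potential operands contributes one result.

#include <cstdint>
#include <set>

static std::set<int64_t> combineAdd(const std::set<int64_t> &LHS,
                                    bool LHSContainsUndef,
                                    const std::set<int64_t> &RHS,
                                    bool RHSContainsUndef) {
  const std::set<int64_t> Zero = {0};
  const std::set<int64_t> &L = LHSContainsUndef ? Zero : LHS;
  const std::set<int64_t> &R = RHSContainsUndef ? Zero : RHS;
  std::set<int64_t> Result;
  for (int64_t A : L)
    for (int64_t B : R)
      Result.insert(A + B); // One entry per pair of potential operands.
  return Result;
}

int main() {
  // {1, 2} + undef is folded as {1, 2} + {0} = {1, 2}.
  return combineAdd({1, 2}, false, {}, true).size() == 2 ? 0 : 1;
}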
/// See AbstractAttribute::getAsStr().
const std::string getAsStr() const override {
std::string Str;
@@ -9186,50 +8901,22 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl {
Value *LHS = ICI->getOperand(0);
Value *RHS = ICI->getOperand(1);
- // Simplify the operands first.
- bool UsedAssumedInformation = false;
- const auto &SimplifiedLHS =
- A.getAssumedSimplified(IRPosition::value(*LHS, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedLHS)
- return ChangeStatus::UNCHANGED;
- if (!SimplifiedLHS.value())
- return indicatePessimisticFixpoint();
- LHS = *SimplifiedLHS;
-
- const auto &SimplifiedRHS =
- A.getAssumedSimplified(IRPosition::value(*RHS, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedRHS)
- return ChangeStatus::UNCHANGED;
- if (!SimplifiedRHS.value())
- return indicatePessimisticFixpoint();
- RHS = *SimplifiedRHS;
-
- if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
- return indicatePessimisticFixpoint();
-
- auto &LHSAA = A.getAAFor<AAPotentialConstantValues>(
- *this, IRPosition::value(*LHS), DepClassTy::REQUIRED);
- if (!LHSAA.isValidState())
+ bool LHSContainsUndef = false, RHSContainsUndef = false;
+ SetTy LHSAAPVS, RHSAAPVS;
+ if (!fillSetWithConstantValues(A, IRPosition::value(*LHS), LHSAAPVS,
+ LHSContainsUndef) ||
+ !fillSetWithConstantValues(A, IRPosition::value(*RHS), RHSAAPVS,
+ RHSContainsUndef))
return indicatePessimisticFixpoint();
- auto &RHSAA = A.getAAFor<AAPotentialConstantValues>(
- *this, IRPosition::value(*RHS), DepClassTy::REQUIRED);
- if (!RHSAA.isValidState())
- return indicatePessimisticFixpoint();
-
- const SetTy &LHSAAPVS = LHSAA.getAssumedSet();
- const SetTy &RHSAAPVS = RHSAA.getAssumedSet();
-
// TODO: make use of undef flag to limit potential values aggressively.
bool MaybeTrue = false, MaybeFalse = false;
const APInt Zero(RHS->getType()->getIntegerBitWidth(), 0);
- if (LHSAA.undefIsContained() && RHSAA.undefIsContained()) {
+ if (LHSContainsUndef && RHSContainsUndef) {
// The result of any comparison between undefs can be soundly replaced
// with undef.
unionAssumedWithUndef();
- } else if (LHSAA.undefIsContained()) {
+ } else if (LHSContainsUndef) {
for (const APInt &R : RHSAAPVS) {
bool CmpResult = calculateICmpInst(ICI, Zero, R);
MaybeTrue |= CmpResult;
@@ -9237,7 +8924,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl {
if (MaybeTrue & MaybeFalse)
return indicatePessimisticFixpoint();
}
- } else if (RHSAA.undefIsContained()) {
+ } else if (RHSContainsUndef) {
for (const APInt &L : LHSAAPVS) {
bool CmpResult = calculateICmpInst(ICI, L, Zero);
MaybeTrue |= CmpResult;
@@ -9269,29 +8956,7 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl {
Value *LHS = SI->getTrueValue();
Value *RHS = SI->getFalseValue();
- // Simplify the operands first.
bool UsedAssumedInformation = false;
- const auto &SimplifiedLHS =
- A.getAssumedSimplified(IRPosition::value(*LHS, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedLHS)
- return ChangeStatus::UNCHANGED;
- if (!SimplifiedLHS.value())
- return indicatePessimisticFixpoint();
- LHS = *SimplifiedLHS;
-
- const auto &SimplifiedRHS =
- A.getAssumedSimplified(IRPosition::value(*RHS, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedRHS)
- return ChangeStatus::UNCHANGED;
- if (!SimplifiedRHS.value())
- return indicatePessimisticFixpoint();
- RHS = *SimplifiedRHS;
-
- if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
- return indicatePessimisticFixpoint();
-
Optional<Constant *> C = A.getAssumedConstant(*SI->getCondition(), *this,
UsedAssumedInformation);
@@ -9302,35 +8967,36 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl {
else if (C && *C && (*C)->isZeroValue())
OnlyRight = true;
- const AAPotentialConstantValues *LHSAA = nullptr, *RHSAA = nullptr;
- if (!OnlyRight) {
- LHSAA = &A.getAAFor<AAPotentialConstantValues>(
- *this, IRPosition::value(*LHS), DepClassTy::REQUIRED);
- if (!LHSAA->isValidState())
- return indicatePessimisticFixpoint();
- }
- if (!OnlyLeft) {
- RHSAA = &A.getAAFor<AAPotentialConstantValues>(
- *this, IRPosition::value(*RHS), DepClassTy::REQUIRED);
- if (!RHSAA->isValidState())
- return indicatePessimisticFixpoint();
- }
+ bool LHSContainsUndef = false, RHSContainsUndef = false;
+ SetTy LHSAAPVS, RHSAAPVS;
+ if (!OnlyRight && !fillSetWithConstantValues(A, IRPosition::value(*LHS),
+ LHSAAPVS, LHSContainsUndef))
+ return indicatePessimisticFixpoint();
+
+ if (!OnlyLeft && !fillSetWithConstantValues(A, IRPosition::value(*RHS),
+ RHSAAPVS, RHSContainsUndef))
+ return indicatePessimisticFixpoint();
- if (!LHSAA || !RHSAA) {
+ if (OnlyLeft || OnlyRight) {
// select (true/false), lhs, rhs
- auto *OpAA = LHSAA ? LHSAA : RHSAA;
+ auto *OpAA = OnlyLeft ? &LHSAAPVS : &RHSAAPVS;
+ auto Undef = OnlyLeft ? LHSContainsUndef : RHSContainsUndef;
- if (OpAA->undefIsContained())
+ if (Undef)
unionAssumedWithUndef();
- else
- unionAssumed(*OpAA);
+ else {
+ for (auto &It : *OpAA)
+ unionAssumed(It);
+ }
- } else if (LHSAA->undefIsContained() && RHSAA->undefIsContained()) {
+ } else if (LHSContainsUndef && RHSContainsUndef) {
// select i1 *, undef , undef => undef
unionAssumedWithUndef();
} else {
- unionAssumed(*LHSAA);
- unionAssumed(*RHSAA);
+ for (auto &It : LHSAAPVS)
+ unionAssumed(It);
+ for (auto &It : RHSAAPVS)
+ unionAssumed(It);
}
return AssumedBefore == getAssumed() ? ChangeStatus::UNCHANGED
: ChangeStatus::CHANGED;
@@ -9344,26 +9010,16 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl {
uint32_t ResultBitWidth = CI->getDestTy()->getIntegerBitWidth();
Value *Src = CI->getOperand(0);
- // Simplify the operand first.
- bool UsedAssumedInformation = false;
- const auto &SimplifiedSrc =
- A.getAssumedSimplified(IRPosition::value(*Src, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedSrc)
- return ChangeStatus::UNCHANGED;
- if (!SimplifiedSrc.value())
+ bool SrcContainsUndef = false;
+ SetTy SrcPVS;
+ if (!fillSetWithConstantValues(A, IRPosition::value(*Src), SrcPVS,
+ SrcContainsUndef))
return indicatePessimisticFixpoint();
- Src = *SimplifiedSrc;
- auto &SrcAA = A.getAAFor<AAPotentialConstantValues>(
- *this, IRPosition::value(*Src), DepClassTy::REQUIRED);
- if (!SrcAA.isValidState())
- return indicatePessimisticFixpoint();
- const SetTy &SrcAAPVS = SrcAA.getAssumedSet();
- if (SrcAA.undefIsContained())
+ if (SrcContainsUndef)
unionAssumedWithUndef();
else {
- for (const APInt &S : SrcAAPVS) {
+ for (const APInt &S : SrcPVS) {
APInt T = calculateCastInst(CI, S, ResultBitWidth);
unionAssumed(T);
}
@@ -9377,53 +9033,26 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl {
Value *LHS = BinOp->getOperand(0);
Value *RHS = BinOp->getOperand(1);
- // Simplify the operands first.
- bool UsedAssumedInformation = false;
- const auto &SimplifiedLHS =
- A.getAssumedSimplified(IRPosition::value(*LHS, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedLHS)
- return ChangeStatus::UNCHANGED;
- if (!SimplifiedLHS.value())
+ bool LHSContainsUndef = false, RHSContainsUndef = false;
+ SetTy LHSAAPVS, RHSAAPVS;
+ if (!fillSetWithConstantValues(A, IRPosition::value(*LHS), LHSAAPVS,
+ LHSContainsUndef) ||
+ !fillSetWithConstantValues(A, IRPosition::value(*RHS), RHSAAPVS,
+ RHSContainsUndef))
return indicatePessimisticFixpoint();
- LHS = *SimplifiedLHS;
- const auto &SimplifiedRHS =
- A.getAssumedSimplified(IRPosition::value(*RHS, getCallBaseContext()),
- *this, UsedAssumedInformation);
- if (!SimplifiedRHS)
- return ChangeStatus::UNCHANGED;
- if (!SimplifiedRHS.value())
- return indicatePessimisticFixpoint();
- RHS = *SimplifiedRHS;
-
- if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
- return indicatePessimisticFixpoint();
-
- auto &LHSAA = A.getAAFor<AAPotentialConstantValues>(
- *this, IRPosition::value(*LHS), DepClassTy::REQUIRED);
- if (!LHSAA.isValidState())
- return indicatePessimisticFixpoint();
-
- auto &RHSAA = A.getAAFor<AAPotentialConstantValues>(
- *this, IRPosition::value(*RHS), DepClassTy::REQUIRED);
- if (!RHSAA.isValidState())
- return indicatePessimisticFixpoint();
-
- const SetTy &LHSAAPVS = LHSAA.getAssumedSet();
- const SetTy &RHSAAPVS = RHSAA.getAssumedSet();
const APInt Zero = APInt(LHS->getType()->getIntegerBitWidth(), 0);
// TODO: make use of undef flag to limit potential values aggressively.
- if (LHSAA.undefIsContained() && RHSAA.undefIsContained()) {
+ if (LHSContainsUndef && RHSContainsUndef) {
if (!calculateBinaryOperatorAndTakeUnion(BinOp, Zero, Zero))
return indicatePessimisticFixpoint();
- } else if (LHSAA.undefIsContained()) {
+ } else if (LHSContainsUndef) {
for (const APInt &R : RHSAAPVS) {
if (!calculateBinaryOperatorAndTakeUnion(BinOp, Zero, R))
return indicatePessimisticFixpoint();
}
- } else if (RHSAA.undefIsContained()) {
+ } else if (RHSContainsUndef) {
for (const APInt &L : LHSAAPVS) {
if (!calculateBinaryOperatorAndTakeUnion(BinOp, L, Zero))
return indicatePessimisticFixpoint();
@@ -9440,35 +9069,6 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl {
: ChangeStatus::CHANGED;
}
- ChangeStatus updateWithPHINode(Attributor &A, PHINode *PHI) {
- auto AssumedBefore = getAssumed();
- for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) {
- Value *IncomingValue = PHI->getIncomingValue(u);
-
- // Simplify the operand first.
- bool UsedAssumedInformation = false;
- const auto &SimplifiedIncomingValue = A.getAssumedSimplified(
- IRPosition::value(*IncomingValue, getCallBaseContext()), *this,
- UsedAssumedInformation);
- if (!SimplifiedIncomingValue)
- continue;
- if (!SimplifiedIncomingValue.value())
- return indicatePessimisticFixpoint();
- IncomingValue = *SimplifiedIncomingValue;
-
- auto &PotentialValuesAA = A.getAAFor<AAPotentialConstantValues>(
- *this, IRPosition::value(*IncomingValue), DepClassTy::REQUIRED);
- if (!PotentialValuesAA.isValidState())
- return indicatePessimisticFixpoint();
- if (PotentialValuesAA.undefIsContained())
- unionAssumedWithUndef();
- else
- unionAssumed(PotentialValuesAA.getAssumed());
- }
- return AssumedBefore == getAssumed() ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
- }
-
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
Value &V = getAssociatedValue();
@@ -9486,9 +9086,6 @@ struct AAPotentialConstantValuesFloating : AAPotentialConstantValuesImpl {
if (auto *BinOp = dyn_cast<BinaryOperator>(I))
return updateWithBinaryOperator(A, BinOp);
- if (auto *PHI = dyn_cast<PHINode>(I))
- return updateWithPHINode(A, PHI);
-
return indicatePessimisticFixpoint();
}
@@ -9642,7 +9239,8 @@ struct AANoUndefImpl : AANoUndef {
// A position whose simplified value does not have any value is
// considered to be dead. We don't manifest noundef in such positions for
// the same reason above.
- if (!A.getAssumedSimplified(getIRPosition(), *this, UsedAssumedInformation)
+ if (!A.getAssumedSimplified(getIRPosition(), *this, UsedAssumedInformation,
+ AA::Interprocedural)
.has_value())
return ChangeStatus::UNCHANGED;
return AANoUndef::manifest(A);
@@ -9663,11 +9261,19 @@ struct AANoUndefFloating : public AANoUndefImpl {
/// See AbstractAttribute::updateImpl(...).
ChangeStatus updateImpl(Attributor &A) override {
- auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
- AANoUndef::StateType &T, bool Stripped) -> bool {
+
+ SmallVector<AA::ValueAndContext> Values;
+ bool UsedAssumedInformation = false;
+ if (!A.getAssumedSimplifiedValues(getIRPosition(), *this, Values,
+ AA::AnyScope, UsedAssumedInformation)) {
+ Values.push_back({getAssociatedValue(), getCtxI()});
+ }
+
+ StateType T;
+ auto VisitValueCB = [&](Value &V, const Instruction *CtxI) -> bool {
const auto &AA = A.getAAFor<AANoUndef>(*this, IRPosition::value(V),
DepClassTy::REQUIRED);
- if (!Stripped && this == &AA) {
+ if (this == &AA) {
T.indicatePessimisticFixpoint();
} else {
const AANoUndef::StateType &S =
@@ -9677,12 +9283,9 @@ struct AANoUndefFloating : public AANoUndefImpl {
return T.isValidState();
};
- StateType T;
- bool UsedAssumedInformation = false;
- if (!genericValueTraversal<StateType>(A, getIRPosition(), *this, T,
- VisitValueCB, getCtxI(),
- UsedAssumedInformation))
- return indicatePessimisticFixpoint();
+ for (const auto &VAC : Values)
+ if (!VisitValueCB(*VAC.getValue(), VAC.getCtxI()))
+ return indicatePessimisticFixpoint();
return clampStateAndIndicateChange(getState(), T);
}
@@ -9782,8 +9385,7 @@ struct AACallEdgesCallSite : public AACallEdgesImpl {
ChangeStatus updateImpl(Attributor &A) override {
ChangeStatus Change = ChangeStatus::UNCHANGED;
- auto VisitValue = [&](Value &V, const Instruction *CtxI, bool &HasUnknown,
- bool Stripped) -> bool {
+ auto VisitValue = [&](Value &V, const Instruction *CtxI) -> bool {
if (Function *Fn = dyn_cast<Function>(&V)) {
addCalledFunction(Fn, Change);
} else {
@@ -9795,17 +9397,17 @@ struct AACallEdgesCallSite : public AACallEdgesImpl {
return true;
};
+ SmallVector<AA::ValueAndContext> Values;
// Process any value that we might call.
- auto ProcessCalledOperand = [&](Value *V) {
- bool DummyValue = false;
+ auto ProcessCalledOperand = [&](Value *V, Instruction *CtxI) {
bool UsedAssumedInformation = false;
- if (!genericValueTraversal<bool>(A, IRPosition::value(*V), *this,
- DummyValue, VisitValue, nullptr,
- UsedAssumedInformation, false)) {
- // If we haven't gone through all values, assume that there are unknown
- // callees.
- setHasUnknownCallee(true, Change);
+ Values.clear();
+ if (!A.getAssumedSimplifiedValues(IRPosition::value(*V), *this, Values,
+ AA::AnyScope, UsedAssumedInformation)) {
+ Values.push_back({*V, CtxI});
}
+ for (auto &VAC : Values)
+ VisitValue(*VAC.getValue(), VAC.getCtxI());
};
CallBase *CB = cast<CallBase>(getCtxI());
@@ -9828,13 +9430,13 @@ struct AACallEdgesCallSite : public AACallEdgesImpl {
}
// The most simple case.
- ProcessCalledOperand(CB->getCalledOperand());
+ ProcessCalledOperand(CB->getCalledOperand(), CB);
// Process callback functions.
SmallVector<const Use *, 4u> CallbackUses;
AbstractCallSite::getCallbackUses(*CB, CallbackUses);
for (const Use *U : CallbackUses)
- ProcessCalledOperand(U->get());
+ ProcessCalledOperand(U->get(), CB);
return Change;
}
@@ -9920,8 +9522,11 @@ private:
for (auto *AAEdges : AAEdgesList) {
if (AAEdges->hasUnknownCallee()) {
- if (!CanReachUnknownCallee)
+ if (!CanReachUnknownCallee) {
+ LLVM_DEBUG(dbgs()
+ << "[QueryResolver] Edges include unknown callee!\n");
Change = ChangeStatus::CHANGED;
+ }
CanReachUnknownCallee = true;
return Change;
}
@@ -10065,14 +9670,10 @@ public:
}
bool instructionCanReach(Attributor &A, const Instruction &Inst,
- const Function &Fn,
- bool UseBackwards) const override {
+ const Function &Fn) const override {
if (!isValidState())
return true;
- if (UseBackwards)
- return AA::isPotentiallyReachable(A, Inst, Fn, *this, nullptr);
-
const auto &Reachability = A.getAAFor<AAReachability>(
*this, IRPosition::function(*getAssociatedFunction()),
DepClassTy::REQUIRED);
@@ -10085,8 +9686,11 @@ public:
// This is a hack for us to be able to cache queries.
auto *NonConstThis = const_cast<AAFunctionReachabilityFunction *>(this);
QueryResolver &InstQSet = NonConstThis->InstQueries[&Inst];
- if (!AllKnown)
+ if (!AllKnown) {
+ LLVM_DEBUG(dbgs() << "[AAReachability] Not all reachable edges known, "
+ "may reach unknown callee!\n");
InstQSet.CanReachUnknownCallee = true;
+ }
return InstQSet.isReachable(A, *NonConstThis, CallEdges, Fn);
}
@@ -10119,8 +9723,11 @@ public:
bool AllKnown =
getReachableCallEdges(A, *Reachability, *InstPair.first, CallEdges);
// Update will return a change if this affects any queries.
- if (!AllKnown)
+ if (!AllKnown) {
+ LLVM_DEBUG(dbgs() << "[AAReachability] Not all reachable edges "
+ "known, may reach unknown callee!\n");
InstPair.second.CanReachUnknownCallee = true;
+ }
Change |= InstPair.second.update(A, *this, CallEdges);
}
}
@@ -10133,8 +9740,11 @@ public:
WholeFunction.Reachable.size() + WholeFunction.Unreachable.size();
return "FunctionReachability [" +
- std::to_string(WholeFunction.Reachable.size()) + "," +
- std::to_string(QueryCount) + "]";
+ (canReachUnknownCallee()
+ ? "unknown"
+ : (std::to_string(WholeFunction.Reachable.size()) + "," +
+ std::to_string(QueryCount))) +
+ "]";
}
void trackStatistics() const override {}
@@ -10156,6 +9766,822 @@ private:
};
} // namespace
+template <typename AAType>
+static Optional<Constant *>
+askForAssumedConstant(Attributor &A, const AbstractAttribute &QueryingAA,
+ const IRPosition &IRP, Type &Ty) {
+ if (!Ty.isIntegerTy())
+ return nullptr;
+
+ // This will also pass the call base context.
+ const auto &AA = A.getAAFor<AAType>(QueryingAA, IRP, DepClassTy::NONE);
+
+ Optional<Constant *> COpt = AA.getAssumedConstant(A);
+
+ if (!COpt.has_value()) {
+ A.recordDependence(AA, QueryingAA, DepClassTy::OPTIONAL);
+ return llvm::None;
+ }
+ if (auto *C = COpt.value()) {
+ A.recordDependence(AA, QueryingAA, DepClassTy::OPTIONAL);
+ return C;
+ }
+ return nullptr;
+}
+
+Value *AAPotentialValues::getSingleValue(
+ Attributor &A, const AbstractAttribute &AA, const IRPosition &IRP,
+ SmallVectorImpl<AA::ValueAndContext> &Values) {
+ Type &Ty = *IRP.getAssociatedType();
+ Optional<Value *> V;
+ for (auto &It : Values) {
+ V = AA::combineOptionalValuesInAAValueLatice(V, It.getValue(), &Ty);
+ if (V.has_value() && !V.value())
+ break;
+ }
+ if (!V.has_value())
+ return UndefValue::get(&Ty);
+ return V.value();
+}
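// Informal reading of getSingleValue above (a note, not part of the patch):
// an empty candidate set folds to undef, a set in which every candidate is
// the same value folds to that value, and any disagreement folds to nullptr,
// i.e. "no single value".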
+
+namespace {
+struct AAPotentialValuesImpl : AAPotentialValues {
+ using StateType = PotentialLLVMValuesState;
+
+ AAPotentialValuesImpl(const IRPosition &IRP, Attributor &A)
+ : AAPotentialValues(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(..).
+ void initialize(Attributor &A) override {
+ if (A.hasSimplificationCallback(getIRPosition())) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+ Value *Stripped = getAssociatedValue().stripPointerCasts();
+ if (isa<Constant>(Stripped)) {
+ addValue(A, getState(), *Stripped, getCtxI(), AA::AnyScope,
+ getAnchorScope());
+ indicateOptimisticFixpoint();
+ return;
+ }
+ AAPotentialValues::initialize(A);
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ std::string Str;
+ llvm::raw_string_ostream OS(Str);
+ OS << getState();
+ return OS.str();
+ }
+
+ template <typename AAType>
+ static Optional<Value *> askOtherAA(Attributor &A,
+ const AbstractAttribute &AA,
+ const IRPosition &IRP, Type &Ty) {
+ if (isa<Constant>(IRP.getAssociatedValue()))
+ return &IRP.getAssociatedValue();
+ Optional<Constant *> C = askForAssumedConstant<AAType>(A, AA, IRP, Ty);
+ if (!C)
+ return llvm::None;
+ if (C.value())
+ if (auto *CC = AA::getWithType(**C, Ty))
+ return CC;
+ return nullptr;
+ }
+
+ void addValue(Attributor &A, StateType &State, Value &V,
+ const Instruction *CtxI, AA::ValueScope S,
+ Function *AnchorScope) const {
+
+ IRPosition ValIRP = IRPosition::value(V);
+ if (auto *CB = dyn_cast_or_null<CallBase>(CtxI)) {
+ for (auto &U : CB->args()) {
+ if (U.get() != &V)
+ continue;
+ ValIRP = IRPosition::callsite_argument(*CB, CB->getArgOperandNo(&U));
+ break;
+ }
+ }
+
+ Value *VPtr = &V;
+ if (ValIRP.getAssociatedType()->isIntegerTy()) {
+ Type &Ty = *getAssociatedType();
+ Optional<Value *> SimpleV =
+ askOtherAA<AAValueConstantRange>(A, *this, ValIRP, Ty);
+ if (SimpleV.has_value() && !SimpleV.value()) {
+ auto &PotentialConstantsAA = A.getAAFor<AAPotentialConstantValues>(
+ *this, ValIRP, DepClassTy::OPTIONAL);
+ if (PotentialConstantsAA.isValidState()) {
+ for (auto &It : PotentialConstantsAA.getAssumedSet()) {
+ State.unionAssumed({{*ConstantInt::get(&Ty, It), nullptr}, S});
+ }
+ assert(!PotentialConstantsAA.undefIsContained() &&
+ "Undef should be an explicit value!");
+ return;
+ }
+ }
+ if (!SimpleV.has_value())
+ return;
+
+ if (SimpleV.value())
+ VPtr = SimpleV.value();
+ }
+
+ if (isa<ConstantInt>(VPtr))
+ CtxI = nullptr;
+ if (!AA::isValidInScope(*VPtr, AnchorScope))
+ S = AA::ValueScope(S | AA::Interprocedural);
+
+ State.unionAssumed({{*VPtr, CtxI}, S});
+ }
+
+ /// Helper struct to tie a value+context pair together with the scope for
+ /// which this is the simplified version.
+ struct ItemInfo {
+ AA::ValueAndContext I;
+ AA::ValueScope S;
+ };
+
+ bool recurseForValue(Attributor &A, const IRPosition &IRP, AA::ValueScope S) {
+ SmallMapVector<AA::ValueAndContext, int, 8> ValueScopeMap;
+ for (auto CS : {AA::Intraprocedural, AA::Interprocedural}) {
+ if (!(CS & S))
+ continue;
+
+ bool UsedAssumedInformation = false;
+ SmallVector<AA::ValueAndContext> Values;
+ if (!A.getAssumedSimplifiedValues(IRP, this, Values, CS,
+ UsedAssumedInformation))
+ return false;
+
+ for (auto &It : Values)
+ ValueScopeMap[It] += CS;
+ }
+ for (auto &It : ValueScopeMap)
+ addValue(A, getState(), *It.first.getValue(), It.first.getCtxI(),
+ AA::ValueScope(It.second), getAnchorScope());
+
+ return true;
+ }
+
+ void giveUpOnIntraprocedural(Attributor &A) {
+ auto NewS = StateType::getBestState(getState());
+ for (auto &It : getAssumedSet()) {
+ if (It.second == AA::Intraprocedural)
+ continue;
+ addValue(A, NewS, *It.first.getValue(), It.first.getCtxI(),
+ AA::Interprocedural, getAnchorScope());
+ }
+ assert(!undefIsContained() && "Undef should be an explicit value!");
+ addValue(A, NewS, getAssociatedValue(), getCtxI(), AA::Intraprocedural,
+ getAnchorScope());
+ getState() = NewS;
+ }
+
+ /// See AbstractState::indicatePessimisticFixpoint(...).
+ ChangeStatus indicatePessimisticFixpoint() override {
+ getState() = StateType::getBestState(getState());
+ getState().unionAssumed({{getAssociatedValue(), getCtxI()}, AA::AnyScope});
+ AAPotentialValues::indicateOptimisticFixpoint();
+ return ChangeStatus::CHANGED;
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ SmallVector<AA::ValueAndContext> Values;
+ for (AA::ValueScope S : {AA::Interprocedural, AA::Intraprocedural}) {
+ Values.clear();
+ if (!getAssumedSimplifiedValues(A, Values, S))
+ continue;
+ Value &OldV = getAssociatedValue();
+ if (isa<UndefValue>(OldV))
+ continue;
+ Value *NewV = getSingleValue(A, *this, getIRPosition(), Values);
+ if (!NewV || NewV == &OldV)
+ continue;
+ if (getCtxI() &&
+ !AA::isValidAtPosition({*NewV, *getCtxI()}, A.getInfoCache()))
+ continue;
+ if (A.changeAfterManifest(getIRPosition(), *NewV))
+ return ChangeStatus::CHANGED;
+ }
+ return ChangeStatus::UNCHANGED;
+ }
+
+ bool getAssumedSimplifiedValues(Attributor &A,
+ SmallVectorImpl<AA::ValueAndContext> &Values,
+ AA::ValueScope S) const override {
+ if (!isValidState())
+ return false;
+ for (auto &It : getAssumedSet())
+ if (It.second & S)
+ Values.push_back(It.first);
+ assert(!undefIsContained() && "Undef should be an explicit value!");
+ return true;
+ }
+};
+
+struct AAPotentialValuesFloating : AAPotentialValuesImpl {
+ AAPotentialValuesFloating(const IRPosition &IRP, Attributor &A)
+ : AAPotentialValuesImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto AssumedBefore = getAssumed();
+
+ genericValueTraversal(A);
+
+ return (AssumedBefore == getAssumed()) ? ChangeStatus::UNCHANGED
+ : ChangeStatus::CHANGED;
+ }
+
+ /// Helper struct to remember which AAIsDead instances we actually used.
+ struct LivenessInfo {
+ const AAIsDead *LivenessAA = nullptr;
+ bool AnyDead = false;
+ };
+
+ /// Check if \p Cmp is a comparison we can simplify.
+ ///
+ /// We handle multiple cases, one in which at least one operand is an
+ /// (assumed) nullptr. If so, try to simplify it using AANonNull on the other
+ /// operand. Return true if successful, in that case Worklist will be updated.
+ bool handleCmp(Attributor &A, CmpInst &Cmp, ItemInfo II,
+ SmallVectorImpl<ItemInfo> &Worklist) {
+ Value *LHS = Cmp.getOperand(0);
+ Value *RHS = Cmp.getOperand(1);
+
+ // Simplify the operands first.
+ bool UsedAssumedInformation = false;
+ const auto &SimplifiedLHS = A.getAssumedSimplified(
+ IRPosition::value(*LHS, getCallBaseContext()), *this,
+ UsedAssumedInformation, AA::Intraprocedural);
+ if (!SimplifiedLHS.has_value())
+ return true;
+ if (!SimplifiedLHS.value())
+ return false;
+ LHS = *SimplifiedLHS;
+
+ const auto &SimplifiedRHS = A.getAssumedSimplified(
+ IRPosition::value(*RHS, getCallBaseContext()), *this,
+ UsedAssumedInformation, AA::Intraprocedural);
+ if (!SimplifiedRHS.has_value())
+ return true;
+ if (!SimplifiedRHS.value())
+ return false;
+ RHS = *SimplifiedRHS;
+
+ LLVMContext &Ctx = Cmp.getContext();
+ // Handle the trivial case first in which we don't even need to think about
+ // null or non-null.
+ if (LHS == RHS && (Cmp.isTrueWhenEqual() || Cmp.isFalseWhenEqual())) {
+ Constant *NewV =
+ ConstantInt::get(Type::getInt1Ty(Ctx), Cmp.isTrueWhenEqual());
+ addValue(A, getState(), *NewV, /* CtxI */ nullptr, II.S,
+ getAnchorScope());
+ return true;
+ }
+
+ // From now on we only handle equalities (==, !=).
+ ICmpInst *ICmp = dyn_cast<ICmpInst>(&Cmp);
+ if (!ICmp || !ICmp->isEquality())
+ return false;
+
+ bool LHSIsNull = isa<ConstantPointerNull>(LHS);
+ bool RHSIsNull = isa<ConstantPointerNull>(RHS);
+ if (!LHSIsNull && !RHSIsNull)
+ return false;
+
+ // What is left is the nullptr ==/!= non-nullptr case. We'll use AANonNull
+ // on the non-nullptr operand; if we assume it is non-null we can conclude
+ // the result of the comparison.
+ assert((LHSIsNull || RHSIsNull) &&
+ "Expected nullptr versus non-nullptr comparison at this point");
+
+ // The index is the operand that we assume is not null.
+ unsigned PtrIdx = LHSIsNull;
+ auto &PtrNonNullAA = A.getAAFor<AANonNull>(
+ *this, IRPosition::value(*ICmp->getOperand(PtrIdx)),
+ DepClassTy::REQUIRED);
+ if (!PtrNonNullAA.isAssumedNonNull())
+ return false;
+
+ // The new value depends on the predicate, true for != and false for ==.
+ Constant *NewV = ConstantInt::get(Type::getInt1Ty(Ctx),
+ ICmp->getPredicate() == CmpInst::ICMP_NE);
+ addValue(A, getState(), *NewV, /* CtxI */ nullptr, II.S, getAnchorScope());
+ return true;
+ }
+
+ bool handleSelectInst(Attributor &A, SelectInst &SI, ItemInfo II,
+ SmallVectorImpl<ItemInfo> &Worklist) {
+ const Instruction *CtxI = II.I.getCtxI();
+ bool UsedAssumedInformation = false;
+
+ Optional<Constant *> C =
+ A.getAssumedConstant(*SI.getCondition(), *this, UsedAssumedInformation);
+ bool NoValueYet = !C.has_value();
+ if (NoValueYet || isa_and_nonnull<UndefValue>(*C))
+ return true;
+ if (auto *CI = dyn_cast_or_null<ConstantInt>(*C)) {
+ if (CI->isZero())
+ Worklist.push_back({{*SI.getFalseValue(), CtxI}, II.S});
+ else
+ Worklist.push_back({{*SI.getTrueValue(), CtxI}, II.S});
+ } else {
+ // We could not simplify the condition, assume both values.
+ Worklist.push_back({{*SI.getTrueValue(), CtxI}, II.S});
+ Worklist.push_back({{*SI.getFalseValue(), CtxI}, II.S});
+ }
+ return true;
+ }
+
+ bool handleLoadInst(Attributor &A, LoadInst &LI, ItemInfo II,
+ SmallVectorImpl<ItemInfo> &Worklist) {
+ SmallSetVector<Value *, 4> PotentialCopies;
+ SmallSetVector<Instruction *, 4> PotentialValueOrigins;
+ bool UsedAssumedInformation = false;
+ if (!AA::getPotentiallyLoadedValues(A, LI, PotentialCopies,
+ PotentialValueOrigins, *this,
+ UsedAssumedInformation,
+ /* OnlyExact */ true)) {
+ LLVM_DEBUG(dbgs() << "[AAPotentialValues] Failed to get potentially "
+ "loaded values for load instruction "
+ << LI << "\n");
+ return false;
+ }
+
+ // Do not simplify loads that are only used in llvm.assume if we cannot also
+ // remove all stores that may feed into the load. The reason is that the
+ // assume is probably worth something as long as the stores are around.
+ InformationCache &InfoCache = A.getInfoCache();
+ if (InfoCache.isOnlyUsedByAssume(LI)) {
+ if (!llvm::all_of(PotentialValueOrigins, [&](Instruction *I) {
+ if (!I)
+ return true;
+ if (auto *SI = dyn_cast<StoreInst>(I))
+ return A.isAssumedDead(SI->getOperandUse(0), this,
+ /* LivenessAA */ nullptr,
+ UsedAssumedInformation,
+ /* CheckBBLivenessOnly */ false);
+ return A.isAssumedDead(*I, this, /* LivenessAA */ nullptr,
+ UsedAssumedInformation,
+ /* CheckBBLivenessOnly */ false);
+ })) {
+ LLVM_DEBUG(dbgs() << "[AAPotentialValues] Load is only used by assumes "
+ "and we cannot delete all the stores: "
+ << LI << "\n");
+ return false;
+ }
+ }
+
+ // Values have to be dynamically unique or we lose the fact that a
+ // single llvm::Value might represent two runtime values (e.g.,
+ // stack locations in different recursive calls).
+ const Instruction *CtxI = II.I.getCtxI();
+ bool ScopeIsLocal = (II.S & AA::Intraprocedural);
+ bool AllLocal = ScopeIsLocal;
+ bool DynamicallyUnique = llvm::all_of(PotentialCopies, [&](Value *PC) {
+ AllLocal &= AA::isValidInScope(*PC, getAnchorScope());
+ return AA::isDynamicallyUnique(A, *this, *PC);
+ });
+ if (!DynamicallyUnique) {
+ LLVM_DEBUG(dbgs() << "[AAPotentialValues] Not all potentially loaded "
+ "values are dynamically unique: "
+ << LI << "\n");
+ return false;
+ }
+
+ for (auto *PotentialCopy : PotentialCopies) {
+ if (AllLocal) {
+ Worklist.push_back({{*PotentialCopy, CtxI}, II.S});
+ } else {
+ Worklist.push_back({{*PotentialCopy, CtxI}, AA::Interprocedural});
+ }
+ }
+ if (!AllLocal && ScopeIsLocal)
+ addValue(A, getState(), LI, CtxI, AA::Intraprocedural, getAnchorScope());
+ return true;
+ }
+
+ bool handlePHINode(
+ Attributor &A, PHINode &PHI, ItemInfo II,
+ SmallVectorImpl<ItemInfo> &Worklist,
+ SmallMapVector<const Function *, LivenessInfo, 4> &LivenessAAs) {
+ auto GetLivenessInfo = [&](const Function &F) -> LivenessInfo & {
+ LivenessInfo &LI = LivenessAAs[&F];
+ if (!LI.LivenessAA)
+ LI.LivenessAA = &A.getAAFor<AAIsDead>(*this, IRPosition::function(F),
+ DepClassTy::NONE);
+ return LI;
+ };
+
+ LivenessInfo &LI = GetLivenessInfo(*PHI.getFunction());
+ for (unsigned u = 0, e = PHI.getNumIncomingValues(); u < e; u++) {
+ BasicBlock *IncomingBB = PHI.getIncomingBlock(u);
+ if (LI.LivenessAA->isEdgeDead(IncomingBB, PHI.getParent())) {
+ LI.AnyDead = true;
+ continue;
+ }
+ Worklist.push_back(
+ {{*PHI.getIncomingValue(u), IncomingBB->getTerminator()}, II.S});
+ }
+ return true;
+ }
+
+ /// Use the generic, non-optimistic InstSimplify functionality if we managed
+ /// to simplify any operand of the instruction \p I. Return true if
+ /// successful; in that case Worklist will be updated.
+ bool handleGenericInst(Attributor &A, Instruction &I, ItemInfo II,
+ SmallVectorImpl<ItemInfo> &Worklist) {
+ bool SomeSimplified = false;
+ bool UsedAssumedInformation = false;
+
+ SmallVector<Value *, 8> NewOps(I.getNumOperands());
+ int Idx = 0;
+ for (Value *Op : I.operands()) {
+ const auto &SimplifiedOp = A.getAssumedSimplified(
+ IRPosition::value(*Op, getCallBaseContext()), *this,
+ UsedAssumedInformation, AA::Intraprocedural);
+ // If we are not sure about any operand, we are not sure about the entire
+ // instruction; we'll wait.
+ if (!SimplifiedOp.has_value())
+ return true;
+
+ if (SimplifiedOp.value())
+ NewOps[Idx] = SimplifiedOp.value();
+ else
+ NewOps[Idx] = Op;
+
+ SomeSimplified |= (NewOps[Idx] != Op);
+ ++Idx;
+ }
+
+ // We won't bother with the InstSimplify interface if we didn't simplify any
+ // operand ourselves.
+ if (!SomeSimplified)
+ return false;
+
+ InformationCache &InfoCache = A.getInfoCache();
+ Function *F = I.getFunction();
+ const auto *DT =
+ InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(*F);
+ const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
+ auto *AC = InfoCache.getAnalysisResultForFunction<AssumptionAnalysis>(*F);
+ OptimizationRemarkEmitter *ORE = nullptr;
+
+ const DataLayout &DL = I.getModule()->getDataLayout();
+ SimplifyQuery Q(DL, TLI, DT, AC, &I);
+ Value *NewV = simplifyInstructionWithOperands(&I, NewOps, Q, ORE);
+ if (!NewV || NewV == &I)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Generic inst " << I << " assumed simplified to "
+ << *NewV << "\n");
+ Worklist.push_back({{*NewV, II.I.getCtxI()}, II.S});
+ return true;
+ }
+
+ bool simplifyInstruction(
+ Attributor &A, Instruction &I, ItemInfo II,
+ SmallVectorImpl<ItemInfo> &Worklist,
+ SmallMapVector<const Function *, LivenessInfo, 4> &LivenessAAs) {
+ if (auto *CI = dyn_cast<CmpInst>(&I))
+ if (handleCmp(A, *CI, II, Worklist))
+ return true;
+
+ switch (I.getOpcode()) {
+ case Instruction::Select:
+ return handleSelectInst(A, cast<SelectInst>(I), II, Worklist);
+ case Instruction::PHI:
+ return handlePHINode(A, cast<PHINode>(I), II, Worklist, LivenessAAs);
+ case Instruction::Load:
+ return handleLoadInst(A, cast<LoadInst>(I), II, Worklist);
+ default:
+ return handleGenericInst(A, I, II, Worklist);
+ };
+ return false;
+ }
+
+ void genericValueTraversal(Attributor &A) {
+ SmallMapVector<const Function *, LivenessInfo, 4> LivenessAAs;
+
+ Value *InitialV = &getAssociatedValue();
+ SmallSet<AA::ValueAndContext, 16> Visited;
+ SmallVector<ItemInfo, 16> Worklist;
+ Worklist.push_back({{*InitialV, getCtxI()}, AA::AnyScope});
+
+ int Iteration = 0;
+ do {
+ ItemInfo II = Worklist.pop_back_val();
+ Value *V = II.I.getValue();
+ assert(V);
+ const Instruction *CtxI = II.I.getCtxI();
+ AA::ValueScope S = II.S;
+
+ // Check if we should process the current value. To prevent endless
+ // recursion, keep a record of the values we followed!
+ if (!Visited.insert(II.I).second)
+ continue;
+
+ // Make sure we limit the compile time for complex expressions.
+ if (Iteration++ >= MaxPotentialValuesIterations) {
+ LLVM_DEBUG(dbgs() << "Generic value traversal reached iteration limit: "
+ << Iteration << "!\n");
+ addValue(A, getState(), *V, CtxI, S, getAnchorScope());
+ continue;
+ }
+
+ // Explicitly look through calls with a "returned" attribute if we do
+ // not have a pointer, as stripPointerCasts only works on pointers.
+ Value *NewV = nullptr;
+ if (V->getType()->isPointerTy()) {
+ NewV = AA::getWithType(*V->stripPointerCasts(), *V->getType());
+ } else {
+ auto *CB = dyn_cast<CallBase>(V);
+ if (CB && CB->getCalledFunction()) {
+ for (Argument &Arg : CB->getCalledFunction()->args())
+ if (Arg.hasReturnedAttr()) {
+ NewV = CB->getArgOperand(Arg.getArgNo());
+ break;
+ }
+ }
+ }
+ if (NewV && NewV != V) {
+ Worklist.push_back({{*NewV, CtxI}, S});
+ continue;
+ }
+
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (simplifyInstruction(A, *I, II, Worklist, LivenessAAs))
+ continue;
+ }
+
+ if (V != InitialV || isa<Argument>(V))
+ if (recurseForValue(A, IRPosition::value(*V), II.S))
+ continue;
+
+ // If we haven't stripped anything we give up.
+ if (V == InitialV && CtxI == getCtxI()) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ addValue(A, getState(), *V, CtxI, S, getAnchorScope());
+ } while (!Worklist.empty());
+
+ // If we actually used liveness information, we have to record a
+ // dependence.
+ for (auto &It : LivenessAAs)
+ if (It.second.AnyDead)
+ A.recordDependence(*It.second.LivenessAA, *this, DepClassTy::OPTIONAL);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(potential_values)
+ }
+};
+
+struct AAPotentialValuesArgument final : AAPotentialValuesImpl {
+ using Base = AAPotentialValuesImpl;
+ AAPotentialValuesArgument(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(..).
+ void initialize(Attributor &A) override {
+ auto &Arg = cast<Argument>(getAssociatedValue());
+ if (Arg.hasPointeeInMemoryValueAttr())
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto AssumedBefore = getAssumed();
+
+ unsigned CSArgNo = getCallSiteArgNo();
+
+ bool UsedAssumedInformation = false;
+ SmallVector<AA::ValueAndContext> Values;
+ auto CallSitePred = [&](AbstractCallSite ACS) {
+ const auto CSArgIRP = IRPosition::callsite_argument(ACS, CSArgNo);
+ if (CSArgIRP.getPositionKind() == IRP_INVALID)
+ return false;
+
+ if (!A.getAssumedSimplifiedValues(CSArgIRP, this, Values,
+ AA::Interprocedural,
+ UsedAssumedInformation))
+ return false;
+
+ return isValidState();
+ };
+
+ if (!A.checkForAllCallSites(CallSitePred, *this,
+ /* RequireAllCallSites */ true,
+ UsedAssumedInformation))
+ return indicatePessimisticFixpoint();
+
+ Function *Fn = getAssociatedFunction();
+ bool AnyNonLocal = false;
+ for (auto &It : Values) {
+ if (isa<Constant>(It.getValue())) {
+ addValue(A, getState(), *It.getValue(), It.getCtxI(), AA::AnyScope,
+ getAnchorScope());
+ continue;
+ }
+ if (!AA::isDynamicallyUnique(A, *this, *It.getValue()))
+ return indicatePessimisticFixpoint();
+
+ if (auto *Arg = dyn_cast<Argument>(It.getValue()))
+ if (Arg->getParent() == Fn) {
+ addValue(A, getState(), *It.getValue(), It.getCtxI(), AA::AnyScope,
+ getAnchorScope());
+ continue;
+ }
+ addValue(A, getState(), *It.getValue(), It.getCtxI(), AA::Interprocedural,
+ getAnchorScope());
+ AnyNonLocal = true;
+ }
+ if (undefIsContained())
+ unionAssumedWithUndef();
+ if (AnyNonLocal)
+ giveUpOnIntraprocedural(A);
+
+ return (AssumedBefore == getAssumed()) ? ChangeStatus::UNCHANGED
+ : ChangeStatus::CHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(potential_values)
+ }
+};
+
+struct AAPotentialValuesReturned
+ : AAReturnedFromReturnedValues<AAPotentialValues, AAPotentialValuesImpl> {
+ using Base =
+ AAReturnedFromReturnedValues<AAPotentialValues, AAPotentialValuesImpl>;
+ AAPotentialValuesReturned(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(..).
+ void initialize(Attributor &A) override {
+ if (A.hasSimplificationCallback(getIRPosition()))
+ indicatePessimisticFixpoint();
+ else
+ AAPotentialValues::initialize(A);
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ // We queried AAValueSimplify for the returned values so they will be
+ // replaced if a simplified form was found. Nothing to do here.
+ return ChangeStatus::UNCHANGED;
+ }
+
+ ChangeStatus indicatePessimisticFixpoint() override {
+ return AAPotentialValues::indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(potential_values)
+ }
+};
+
+struct AAPotentialValuesFunction : AAPotentialValuesImpl {
+ AAPotentialValuesFunction(const IRPosition &IRP, Attributor &A)
+ : AAPotentialValuesImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("AAPotentialValues(Function|CallSite)::updateImpl will "
+ "not be called");
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FN_ATTR(potential_values)
+ }
+};
+
+struct AAPotentialValuesCallSite : AAPotentialValuesFunction {
+ AAPotentialValuesCallSite(const IRPosition &IRP, Attributor &A)
+ : AAPotentialValuesFunction(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CS_ATTR(potential_values)
+ }
+};
+
+struct AAPotentialValuesCallSiteReturned : AAPotentialValuesImpl {
+ AAPotentialValuesCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAPotentialValuesImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto AssumedBefore = getAssumed();
+
+ Function *Callee = getAssociatedFunction();
+ if (!Callee)
+ return indicatePessimisticFixpoint();
+
+ bool UsedAssumedInformation = false;
+ auto *CB = cast<CallBase>(getCtxI());
+ if (CB->isMustTailCall() &&
+ !A.isAssumedDead(IRPosition::inst(*CB), this, nullptr,
+ UsedAssumedInformation))
+ return indicatePessimisticFixpoint();
+
+ SmallVector<AA::ValueAndContext> Values;
+ if (!A.getAssumedSimplifiedValues(IRPosition::returned(*Callee), this,
+ Values, AA::Intraprocedural,
+ UsedAssumedInformation))
+ return indicatePessimisticFixpoint();
+
+ Function *Caller = CB->getCaller();
+
+ bool AnyNonLocal = false;
+ for (auto &It : Values) {
+ Value *V = It.getValue();
+ Optional<Value *> CallerV = A.translateArgumentToCallSiteContent(
+ V, *CB, *this, UsedAssumedInformation);
+ if (!CallerV.has_value()) {
+ // Nothing to do as long as no value was determined.
+ continue;
+ }
+ V = CallerV.value() ? CallerV.value() : V;
+ if (AA::isDynamicallyUnique(A, *this, *V) &&
+ AA::isValidInScope(*V, Caller)) {
+ if (CallerV.value()) {
+ SmallVector<AA::ValueAndContext> ArgValues;
+ IRPosition IRP = IRPosition::value(*V);
+ if (auto *Arg = dyn_cast<Argument>(V))
+ if (Arg->getParent() == CB->getCalledFunction())
+ IRP = IRPosition::callsite_argument(*CB, Arg->getArgNo());
+ if (recurseForValue(A, IRP, AA::AnyScope))
+ continue;
+ }
+ addValue(A, getState(), *V, CB, AA::AnyScope, getAnchorScope());
+ } else {
+ AnyNonLocal = true;
+ break;
+ }
+ }
+ if (AnyNonLocal) {
+ Values.clear();
+ if (!A.getAssumedSimplifiedValues(IRPosition::returned(*Callee), this,
+ Values, AA::Interprocedural,
+ UsedAssumedInformation))
+ return indicatePessimisticFixpoint();
+ AnyNonLocal = false;
+ getState() = PotentialLLVMValuesState::getBestState();
+ for (auto &It : Values) {
+ Value *V = It.getValue();
+ if (!AA::isDynamicallyUnique(A, *this, *V))
+ return indicatePessimisticFixpoint();
+ if (AA::isValidInScope(*V, Caller)) {
+ addValue(A, getState(), *V, CB, AA::AnyScope, getAnchorScope());
+ } else {
+ AnyNonLocal = true;
+ addValue(A, getState(), *V, CB, AA::Interprocedural,
+ getAnchorScope());
+ }
+ }
+ if (AnyNonLocal)
+ giveUpOnIntraprocedural(A);
+ }
+ return (AssumedBefore == getAssumed()) ? ChangeStatus::UNCHANGED
+ : ChangeStatus::CHANGED;
+ }
+
+ ChangeStatus indicatePessimisticFixpoint() override {
+ return AAPotentialValues::indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(potential_values)
+ }
+};
+
+struct AAPotentialValuesCallSiteArgument : AAPotentialValuesFloating {
+ AAPotentialValuesCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAPotentialValuesFloating(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(potential_values)
+ }
+};
+} // namespace
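// Informal usage sketch (not part of the patch): a client pass would seed the
// new attribute like any other value-position attribute, e.g.
//   A.getOrCreateAAFor<AAPotentialValues>(IRPosition::value(V));
// and afterwards read the candidate set via getAssumedSimplifiedValues() or
// fold it into one value with AAPotentialValues::getSingleValue().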
+
/// ---------------------- Assumption Propagation ------------------------------
namespace {
struct AAAssumptionInfoImpl : public AAAssumptionInfo {
@@ -10323,6 +10749,7 @@ const char AAMemoryBehavior::ID = 0;
const char AAMemoryLocation::ID = 0;
const char AAValueConstantRange::ID = 0;
const char AAPotentialConstantValues::ID = 0;
+const char AAPotentialValues::ID = 0;
const char AANoUndef::ID = 0;
const char AACallEdges::ID = 0;
const char AAFunctionReachability::ID = 0;
@@ -10441,6 +10868,7 @@ CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAInstanceInfo)
CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture)
CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueConstantRange)
CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPotentialConstantValues)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPotentialValues)
CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUndef)
CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPointerInfo)
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index 56e2df14ff38..360ec24a0509 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -1147,6 +1147,14 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
// Declare a callback for the internalize pass that will ask for every
// candidate GlobalValue if it can be internalized or not.
auto MustPreserveGV = [&](const GlobalValue &GV) -> bool {
+ // It may be the case that GV is on a chain of an ifunc, its alias and
+ // subsequent aliases. In this case, the summary for the value is not
+ // available.
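    // A hypothetical shape of such a chain (illustration only, not from the
    // patch):
    //   @impl       = ifunc void (), ptr @impl_resolver
    //   @impl_alias = alias void (), ptr @impl
    // Per the note above, neither value gets a summary entry, so both must be
    // treated as preserved here.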
+ if (isa<GlobalIFunc>(&GV) ||
+ (isa<GlobalAlias>(&GV) &&
+ isa<GlobalIFunc>(cast<GlobalAlias>(&GV)->getAliaseeObject())))
+ return true;
+
// Lookup the linkage recorded in the summaries during global analysis.
auto GS = DefinedGlobals.find(GV.getGUID());
if (GS == DefinedGlobals.end()) {
@@ -1277,7 +1285,7 @@ Expected<bool> FunctionImporter::importFunctions(
}
}
for (GlobalAlias &GA : SrcModule->aliases()) {
- if (!GA.hasName())
+ if (!GA.hasName() || isa<GlobalIFunc>(GA.getAliaseeObject()))
continue;
auto GUID = GA.getGUID();
auto Import = ImportGUIDs.count(GUID);
@@ -1413,29 +1421,6 @@ static bool doImportingForModule(Module &M) {
return *Result;
}
-namespace {
-
-/// Pass that performs cross-module function import provided a summary file.
-class FunctionImportLegacyPass : public ModulePass {
-public:
- /// Pass identification, replacement for typeid
- static char ID;
-
- explicit FunctionImportLegacyPass() : ModulePass(ID) {}
-
- /// Specify pass name for debug output
- StringRef getPassName() const override { return "Function Importing"; }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- return doImportingForModule(M);
- }
-};
-
-} // end anonymous namespace
-
PreservedAnalyses FunctionImportPass::run(Module &M,
ModuleAnalysisManager &AM) {
if (!doImportingForModule(M))
@@ -1443,15 +1428,3 @@ PreservedAnalyses FunctionImportPass::run(Module &M,
return PreservedAnalyses::none();
}
-
-char FunctionImportLegacyPass::ID = 0;
-INITIALIZE_PASS(FunctionImportLegacyPass, "function-import",
- "Summary Based Function Import", false, false)
-
-namespace llvm {
-
-Pass *createFunctionImportPass() {
- return new FunctionImportLegacyPass();
-}
-
-} // end namespace llvm
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 1ad6e2b2a1d2..ec26db8bfc0b 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1040,7 +1040,7 @@ static bool tryToOptimizeStoreOfAllocationToGlobal(GlobalVariable *GV,
CallInst *CI,
const DataLayout &DL,
TargetLibraryInfo *TLI) {
- if (!isAllocRemovable(CI, TLI))
+ if (!isRemovableAlloc(CI, TLI))
// Must be able to remove the call when we get done.
return false;
diff --git a/llvm/lib/Transforms/IPO/IPO.cpp b/llvm/lib/Transforms/IPO/IPO.cpp
index ec2b80012ed6..dfd434e61d5b 100644
--- a/llvm/lib/Transforms/IPO/IPO.cpp
+++ b/llvm/lib/Transforms/IPO/IPO.cpp
@@ -44,7 +44,6 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializeLoopExtractorLegacyPassPass(Registry);
initializeBlockExtractorLegacyPassPass(Registry);
initializeSingleLoopExtractorPass(Registry);
- initializeLowerTypeTestsPass(Registry);
initializeMergeFunctionsLegacyPassPass(Registry);
initializePartialInlinerLegacyPassPass(Registry);
initializeAttributorLegacyPassPass(Registry);
@@ -60,9 +59,6 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializeStripNonDebugSymbolsPass(Registry);
initializeBarrierNoopPass(Registry);
initializeEliminateAvailableExternallyLegacyPassPass(Registry);
- initializeSampleProfileLoaderLegacyPassPass(Registry);
- initializeFunctionImportLegacyPassPass(Registry);
- initializeWholeProgramDevirtPass(Registry);
}
void LLVMInitializeIPO(LLVMPassRegistryRef R) {
diff --git a/llvm/lib/Transforms/IPO/Internalize.cpp b/llvm/lib/Transforms/IPO/Internalize.cpp
index 5aa5b905f06c..85b1a8303d33 100644
--- a/llvm/lib/Transforms/IPO/Internalize.cpp
+++ b/llvm/lib/Transforms/IPO/Internalize.cpp
@@ -28,6 +28,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/GlobPattern.h"
#include "llvm/Support/LineIterator.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
@@ -40,13 +41,13 @@ STATISTIC(NumAliases, "Number of aliases internalized");
STATISTIC(NumFunctions, "Number of functions internalized");
STATISTIC(NumGlobals, "Number of global vars internalized");
-// APIFile - A file which contains a list of symbols that should not be marked
-// external.
+// APIFile - A file which contains a list of symbol glob patterns that should
+// not be marked external.
static cl::opt<std::string>
APIFile("internalize-public-api-file", cl::value_desc("filename"),
cl::desc("A file containing list of symbol names to preserve"));
-// APIList - A list of symbols that should not be marked internal.
+// APIList - A list of symbol glob patterns that should not be marked internal.
static cl::list<std::string>
APIList("internalize-public-api-list", cl::value_desc("list"),
cl::desc("A list of symbol names to preserve"), cl::CommaSeparated);
@@ -59,29 +60,44 @@ public:
PreserveAPIList() {
if (!APIFile.empty())
LoadFile(APIFile);
- ExternalNames.insert(APIList.begin(), APIList.end());
+ for (StringRef Pattern : APIList)
+ addGlob(Pattern);
}
bool operator()(const GlobalValue &GV) {
- return ExternalNames.count(GV.getName());
+ return llvm::any_of(
+ ExternalNames, [&](GlobPattern &GP) { return GP.match(GV.getName()); });
}
private:
// Contains the set of symbols loaded from file
- StringSet<> ExternalNames;
+ SmallVector<GlobPattern> ExternalNames;
+
+ void addGlob(StringRef Pattern) {
+ auto GlobOrErr = GlobPattern::create(Pattern);
+ if (!GlobOrErr) {
+ errs() << "WARNING: when loading pattern: '"
+ << toString(GlobOrErr.takeError()) << "' ignoring";
+ return;
+ }
+ ExternalNames.emplace_back(std::move(*GlobOrErr));
+ }
void LoadFile(StringRef Filename) {
// Load the APIFile...
- ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+ ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
MemoryBuffer::getFile(Filename);
- if (!Buf) {
+ if (!BufOrErr) {
errs() << "WARNING: Internalize couldn't load file '" << Filename
<< "'! Continuing as if it's empty.\n";
return; // Just continue as if the file were empty
}
- for (line_iterator I(*Buf->get(), true), E; I != E; ++I)
- ExternalNames.insert(*I);
+ Buf = std::move(*BufOrErr);
+ for (line_iterator I(*Buf, true), E; I != E; ++I)
+ addGlob(*I);
}
+
+ std::shared_ptr<MemoryBuffer> Buf;
};
} // end anonymous namespace
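A short sketch of the new glob matching (illustrative only; the pattern and symbol names are made up, the GlobPattern API is the one included above):

  // -internalize-public-api-list='main,llvm.*' now keeps every symbol whose
  // name matches one of the globs instead of requiring exact names.
  if (Expected<GlobPattern> Pat = GlobPattern::create("llvm.*")) {
    bool Keep = Pat->match("llvm.global_ctors"); // true -> not internalized
    (void)Keep;
  } else
    consumeError(Pat.takeError());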
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index d5f1d291f41f..6bf25df101fa 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -528,50 +528,8 @@ public:
// arguments. For testing purposes only.
static bool runForTesting(Module &M);
};
-
-struct LowerTypeTests : public ModulePass {
- static char ID;
-
- bool UseCommandLine = false;
-
- ModuleSummaryIndex *ExportSummary;
- const ModuleSummaryIndex *ImportSummary;
- bool DropTypeTests;
-
- LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
- initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
- }
-
- LowerTypeTests(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary, bool DropTypeTests)
- : ModulePass(ID), ExportSummary(ExportSummary),
- ImportSummary(ImportSummary),
- DropTypeTests(DropTypeTests || ClDropTypeTests) {
- initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (UseCommandLine)
- return LowerTypeTestsModule::runForTesting(M);
- return LowerTypeTestsModule(M, ExportSummary, ImportSummary, DropTypeTests)
- .lower();
- }
-};
-
} // end anonymous namespace
-char LowerTypeTests::ID = 0;
-
-INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
- false)
-
-ModulePass *
-llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary,
- bool DropTypeTests) {
- return new LowerTypeTests(ExportSummary, ImportSummary, DropTypeTests);
-}
-
/// Build a bit set for TypeId using the object layouts in
/// GlobalLayout.
BitSetInfo LowerTypeTestsModule::buildBitSet(
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 8e0ca8c6c997..0b42fc151991 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -4808,7 +4808,7 @@ void OpenMPOpt::registerAAs(bool IsModulePass) {
if (auto *LI = dyn_cast<LoadInst>(&I)) {
bool UsedAssumedInformation = false;
A.getAssumedSimplified(IRPosition::value(*LI), /* AA */ nullptr,
- UsedAssumedInformation);
+ UsedAssumedInformation, AA::Interprocedural);
} else if (auto *SI = dyn_cast<StoreInst>(&I)) {
A.getOrCreateAAFor<AAIsDead>(IRPosition::value(*SI));
}
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 8eef82675e86..f1b6f2bb7de4 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -91,14 +91,6 @@ cl::opt<bool> EnableDFAJumpThreading("enable-dfa-jump-thread",
cl::desc("Enable DFA jump threading."),
cl::init(false), cl::Hidden);
-static cl::opt<bool>
- EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
- cl::desc("Enable preparation for ThinLTO."));
-
-static cl::opt<bool>
- EnablePerformThinLTO("perform-thinlto", cl::init(false), cl::Hidden,
- cl::desc("Enable performing ThinLTO."));
-
cl::opt<bool> EnableHotColdSplit("hot-cold-split",
cl::desc("Enable hot-cold splitting pass"));
@@ -192,15 +184,6 @@ PassManagerBuilder::PassManagerBuilder() {
VerifyInput = false;
VerifyOutput = false;
MergeFunctions = false;
- PrepareForLTO = false;
- EnablePGOInstrGen = false;
- EnablePGOCSInstrGen = false;
- EnablePGOCSInstrUse = false;
- PGOInstrGen = "";
- PGOInstrUse = "";
- PGOSampleUse = "";
- PrepareForThinLTO = EnablePrepareForThinLTO;
- PerformThinLTO = EnablePerformThinLTO;
DivergentTarget = false;
CallGraphProfile = true;
}
@@ -390,7 +373,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap,
/*AllowSpeculation=*/false));
// Rotate Loop - disable header duplication at -Oz
- MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
+ MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, false));
// TODO: Investigate promotion cap for O1.
MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap,
/*AllowSpeculation=*/true));
@@ -470,10 +453,6 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
// Clean up after everything.
MPM.add(createInstructionCombiningPass());
addExtensionsToPM(EP_Peephole, MPM);
-
- if (EnableCHR && OptLevel >= 3 &&
- (!PGOInstrUse.empty() || !PGOSampleUse.empty() || EnablePGOCSInstrGen))
- MPM.add(createControlHeightReductionLegacyPass());
}
/// FIXME: Should LTO cause any differences to this set of passes?
@@ -598,15 +577,6 @@ void PassManagerBuilder::populateModulePassManager(
legacy::PassManagerBase &MPM) {
MPM.add(createAnnotation2MetadataLegacyPass());
- if (!PGOSampleUse.empty()) {
- MPM.add(createPruneEHPass());
- // In ThinLTO mode, when flattened profile is used, all the available
- // profile information will be annotated in PreLink phase so there is
- // no need to load the profile again in PostLink.
- if (!(FlattenedProfileUsed && PerformThinLTO))
- MPM.add(createSampleProfileLoaderPass(PGOSampleUse));
- }
-
// Allow forcing function attributes as a debugging and tuning aid.
MPM.add(createForceFunctionAttrsLegacyPass());
@@ -628,26 +598,8 @@ void PassManagerBuilder::populateModulePassManager(
else if (GlobalExtensionsNotEmpty() || !Extensions.empty())
MPM.add(createBarrierNoopPass());
- if (PerformThinLTO) {
- MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
- // Drop available_externally and unreferenced globals. This is necessary
- // with ThinLTO in order to avoid leaving undefined references to dead
- // globals in the object file.
- MPM.add(createEliminateAvailableExternallyPass());
- MPM.add(createGlobalDCEPass());
- }
-
addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
- if (PrepareForLTO || PrepareForThinLTO) {
- MPM.add(createCanonicalizeAliasesPass());
- // Rename anon globals to be able to export them in the summary.
- // This has to be done after we add the extensions to the pass manager
- // as there could be passes (e.g. Adddress sanitizer) which introduce
- // new unnamed globals.
- MPM.add(createNameAnonGlobalPass());
- }
-
MPM.add(createAnnotationRemarksLegacyPass());
return;
}
@@ -658,25 +610,6 @@ void PassManagerBuilder::populateModulePassManager(
addInitialAliasAnalysisPasses(MPM);
- // For ThinLTO there are two passes of indirect call promotion. The
- // first is during the compile phase when PerformThinLTO=false and
- // intra-module indirect call targets are promoted. The second is during
- // the ThinLTO backend when PerformThinLTO=true, when we promote imported
- // inter-module indirect calls. For that we perform indirect call promotion
- // earlier in the pass pipeline, here before globalopt. Otherwise imported
- // available_externally functions look unreferenced and are removed.
- if (PerformThinLTO) {
- MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
- }
-
- // For SamplePGO in ThinLTO compile phase, we do not want to unroll loops
- // as it will change the CFG too much to make the 2nd profile annotation
- // in backend more difficult.
- bool PrepareForThinLTOUsingPGOSampleProfile =
- PrepareForThinLTO && !PGOSampleUse.empty();
- if (PrepareForThinLTOUsingPGOSampleProfile)
- DisableUnrollLoops = true;
-
// Infer attributes about declarations if possible.
MPM.add(createInferFunctionAttrsLegacyPass());
@@ -744,7 +677,7 @@ void PassManagerBuilder::populateModulePassManager(
if (RunPartialInlining)
MPM.add(createPartialInliningPass());
- if (OptLevel > 1 && !PrepareForLTO && !PrepareForThinLTO)
+ if (OptLevel > 1)
// Remove avail extern fns and globals definitions if we aren't
// compiling an object file for later LTO. For LTO we want to preserve
// these so they are eligible for inlining at link-time. Note if they
@@ -756,9 +689,6 @@ void PassManagerBuilder::populateModulePassManager(
// and saves running remaining passes on the eliminated functions.
MPM.add(createEliminateAvailableExternallyPass());
- if (EnableOrderFileInstrumentation)
- MPM.add(createInstrOrderFilePass());
-
MPM.add(createReversePostOrderFunctionAttrsPass());
// The inliner performs some kind of dead code elimination as it goes,
@@ -772,24 +702,6 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createGlobalDCEPass());
}
- // If we are planning to perform ThinLTO later, let's not bloat the code with
- // unrolling/vectorization/... now. We'll first run the inliner + CGSCC passes
- // during ThinLTO and perform the rest of the optimizations afterward.
- if (PrepareForThinLTO) {
- // Ensure we perform any last passes, but do so before renaming anonymous
- // globals in case the passes add any.
- addExtensionsToPM(EP_OptimizerLast, MPM);
- MPM.add(createCanonicalizeAliasesPass());
- // Rename anon globals to be able to export them in the summary.
- MPM.add(createNameAnonGlobalPass());
- return;
- }
-
- if (PerformThinLTO)
- // Optimize globals now when performing ThinLTO, this enables more
- // optimizations later.
- MPM.add(createGlobalOptimizerPass());
-
// Scheduling LoopVersioningLICM when inlining is over, because after that
// we may see more accurate aliasing. Reason to run this late is that too
// early versioning may prevent further inlining due to increase of code
@@ -834,7 +746,7 @@ void PassManagerBuilder::populateModulePassManager(
// Re-rotate loops in all our loop nests. These may have fallen out of
// rotated form due to GVN or other transformations, and the vectorizer relies
// on the rotated form. Disable header duplication at -Oz.
- MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
+ MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, false));
// Distribute loops to allow partial vectorization. I.e. isolate dependences
// into separate loop that would otherwise inhibit vectorization. This is
@@ -856,7 +768,7 @@ void PassManagerBuilder::populateModulePassManager(
// See comment in the new PM for justification of scheduling splitting at
// this stage (\ref buildModuleSimplificationPipeline).
- if (EnableHotColdSplit && !(PrepareForLTO || PrepareForThinLTO))
+ if (EnableHotColdSplit)
MPM.add(createHotColdSplittingPass());
if (EnableIROutliner)
@@ -865,10 +777,6 @@ void PassManagerBuilder::populateModulePassManager(
if (MergeFunctions)
MPM.add(createMergeFunctionsPass());
- // Add Module flag "CG Profile" based on Branch Frequency Information.
- if (CallGraphProfile)
- MPM.add(createCGProfileLegacyPass());
-
// LoopSink pass sinks instructions hoisted by LICM, which serves as a
// canonicalization pass that enables other optimizations. As a result,
// LoopSink pass needs to be a very late IR pass to avoid undoing LICM
@@ -889,12 +797,6 @@ void PassManagerBuilder::populateModulePassManager(
addExtensionsToPM(EP_OptimizerLast, MPM);
- if (PrepareForLTO) {
- MPM.add(createCanonicalizeAliasesPass());
- // Rename anon globals to be able to handle them in the summary
- MPM.add(createNameAnonGlobalPass());
- }
-
MPM.add(createAnnotationRemarksLegacyPass());
}
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 55fee213cd5f..f76b886e810a 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -546,53 +546,6 @@ private:
return AnnotatedPassName.c_str();
}
};
-
-class SampleProfileLoaderLegacyPass : public ModulePass {
-public:
- // Class identification, replacement for typeinfo
- static char ID;
-
- SampleProfileLoaderLegacyPass(
- StringRef Name = SampleProfileFile,
- ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None)
- : ModulePass(ID), SampleLoader(
- Name, SampleProfileRemappingFile, LTOPhase,
- [&](Function &F) -> AssumptionCache & {
- return ACT->getAssumptionCache(F);
- },
- [&](Function &F) -> TargetTransformInfo & {
- return TTIWP->getTTI(F);
- },
- [&](Function &F) -> TargetLibraryInfo & {
- return TLIWP->getTLI(F);
- }) {
- initializeSampleProfileLoaderLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- void dump() { SampleLoader.dump(); }
-
- bool doInitialization(Module &M) override {
- return SampleLoader.doInitialization(M);
- }
-
- StringRef getPassName() const override { return "Sample profile pass"; }
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- }
-
-private:
- SampleProfileLoader SampleLoader;
- AssumptionCacheTracker *ACT = nullptr;
- TargetTransformInfoWrapperPass *TTIWP = nullptr;
- TargetLibraryInfoWrapperPass *TLIWP = nullptr;
-};
-
} // end anonymous namespace
ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
@@ -734,8 +687,8 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
assert(L && R && "Expect non-null FunctionSamples");
- if (L->getEntrySamples() != R->getEntrySamples())
- return L->getEntrySamples() > R->getEntrySamples();
+ if (L->getHeadSamplesEstimate() != R->getHeadSamplesEstimate())
+ return L->getHeadSamplesEstimate() > R->getHeadSamplesEstimate();
return FunctionSamples::getGUID(L->getName()) <
FunctionSamples::getGUID(R->getName());
};
@@ -750,7 +703,7 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
// as that already includes both inlined callee and non-inlined ones.
Sum = 0;
for (const auto *const FS : CalleeSamples) {
- Sum += FS->getEntrySamples();
+ Sum += FS->getHeadSamplesEstimate();
R.push_back(FS);
}
llvm::sort(R, FSCompare);
@@ -771,7 +724,7 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
if (M->empty())
return R;
for (const auto &NameFS : *M) {
- Sum += NameFS.second.getEntrySamples();
+ Sum += NameFS.second.getHeadSamplesEstimate();
R.push_back(&NameFS.second);
}
llvm::sort(R, FSCompare);
@@ -1090,7 +1043,7 @@ void SampleProfileLoader::findExternalInlineCandidate(
bool PreInline =
UsePreInlinerDecision &&
CalleeSample->getContext().hasAttribute(ContextShouldBeInlined);
- if (!PreInline && CalleeSample->getEntrySamples() < Threshold)
+ if (!PreInline && CalleeSample->getHeadSamplesEstimate() < Threshold)
continue;
StringRef Name = CalleeSample->getFuncName();
@@ -1171,7 +1124,8 @@ bool SampleProfileLoader::inlineHotFunctions(
assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
"GUIDToFuncNameMap has to be populated");
AllCandidates.push_back(CB);
- if (FS->getEntrySamples() > 0 || FunctionSamples::ProfileIsCS)
+ if (FS->getHeadSamplesEstimate() > 0 ||
+ FunctionSamples::ProfileIsCS)
LocalNotInlinedCallSites.try_emplace(CB, FS);
if (callsiteIsHot(FS, PSI, ProfAccForSymsInList))
Hot = true;
@@ -1211,7 +1165,7 @@ bool SampleProfileLoader::inlineHotFunctions(
if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
continue;
- Candidate = {I, FS, FS->getEntrySamples(), 1.0};
+ Candidate = {I, FS, FS->getHeadSamplesEstimate(), 1.0};
if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum)) {
LocalNotInlinedCallSites.erase(I);
LocalChanged = true;
@@ -1325,7 +1279,7 @@ bool SampleProfileLoader::getInlineCandidate(InlineCandidate *NewCandidate,
Factor = Probe->Factor;
uint64_t CallsiteCount =
- CalleeSamples ? CalleeSamples->getEntrySamples() * Factor : 0;
+ CalleeSamples ? CalleeSamples->getHeadSamplesEstimate() * Factor : 0;
*NewCandidate = {CB, CalleeSamples, CallsiteCount, Factor};
return true;
}
@@ -1481,7 +1435,7 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
continue;
}
uint64_t EntryCountDistributed =
- FS->getEntrySamples() * Candidate.CallsiteDistribution;
+ FS->getHeadSamplesEstimate() * Candidate.CallsiteDistribution;
// In addition to regular inline cost check, we also need to make sure
// ICP isn't introducing excessive speculative checks even if individual
// target looks beneficial to promote and inline. That means we should
@@ -1568,7 +1522,7 @@ void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
++NumCSNotInlined;
const FunctionSamples *FS = Pair.getSecond();
- if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) {
+ if (FS->getTotalSamples() == 0 && FS->getHeadSamplesEstimate() == 0) {
continue;
}
@@ -1586,7 +1540,7 @@ void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
// Use entry samples as head samples during the merge, as inlinees
// don't have head samples.
const_cast<FunctionSamples *>(FS)->addHeadSamples(
- FS->getEntrySamples());
+ FS->getHeadSamplesEstimate());
// Note that we have to do the merge right after processing function.
// This allows OutlineFS's profile to be used for annotation during
@@ -1599,7 +1553,7 @@ void SampleProfileLoader::promoteMergeNotInlinedContextSamples(
} else {
auto pair =
notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
- pair.first->second.entryCount += FS->getEntrySamples();
+ pair.first->second.entryCount += FS->getHeadSamplesEstimate();
}
}
}
@@ -1663,7 +1617,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
if (const FunctionSamplesMap *M =
FS->findFunctionSamplesMapAt(CallSite)) {
for (const auto &NameFS : *M)
- Sum += NameFS.second.getEntrySamples();
+ Sum += NameFS.second.getHeadSamplesEstimate();
}
}
if (Sum)
@@ -1825,17 +1779,6 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
return Changed;
}
-char SampleProfileLoaderLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
- "Sample Profile loader", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
- "Sample Profile loader", false, false)
-
std::unique_ptr<ProfiledCallGraph>
SampleProfileLoader::buildProfiledCallGraph(CallGraph &CG) {
std::unique_ptr<ProfiledCallGraph> ProfiledCG;
@@ -2073,14 +2016,6 @@ bool SampleProfileLoader::doInitialization(Module &M,
return true;
}
-ModulePass *llvm::createSampleProfileLoaderPass() {
- return new SampleProfileLoaderLegacyPass();
-}
-
-ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
- return new SampleProfileLoaderLegacyPass(Name);
-}
-
bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
ProfileSummaryInfo *_PSI, CallGraph *CG) {
GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
@@ -2141,15 +2076,6 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
return retval;
}
-bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
- ACT = &getAnalysis<AssumptionCacheTracker>();
- TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
- TLIWP = &getAnalysis<TargetLibraryInfoWrapperPass>();
- ProfileSummaryInfo *PSI =
- &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- return SampleLoader.runOnModule(M, nullptr, PSI, nullptr);
-}
-
bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
DILocation2SampleMap.clear();
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 898a213d0849..ad00c116ce0a 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -747,78 +747,8 @@ struct DevirtIndex {
void run();
};
-
-struct WholeProgramDevirt : public ModulePass {
- static char ID;
-
- bool UseCommandLine = false;
-
- ModuleSummaryIndex *ExportSummary = nullptr;
- const ModuleSummaryIndex *ImportSummary = nullptr;
-
- WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) {
- initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
- }
-
- WholeProgramDevirt(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary)
- : ModulePass(ID), ExportSummary(ExportSummary),
- ImportSummary(ImportSummary) {
- initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- // In the new pass manager, we can request the optimization
- // remark emitter pass on a per-function-basis, which the
- // OREGetter will do for us.
- // In the old pass manager, this is harder, so we just build
- // an optimization remark emitter on the fly, when we need it.
- std::unique_ptr<OptimizationRemarkEmitter> ORE;
- auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
- ORE = std::make_unique<OptimizationRemarkEmitter>(F);
- return *ORE;
- };
-
- auto LookupDomTree = [this](Function &F) -> DominatorTree & {
- return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
- };
-
- if (UseCommandLine)
- return DevirtModule::runForTesting(M, LegacyAARGetter(*this), OREGetter,
- LookupDomTree);
-
- return DevirtModule(M, LegacyAARGetter(*this), OREGetter, LookupDomTree,
- ExportSummary, ImportSummary)
- .run();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- }
-};
-
} // end anonymous namespace
-INITIALIZE_PASS_BEGIN(WholeProgramDevirt, "wholeprogramdevirt",
- "Whole program devirtualization", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(WholeProgramDevirt, "wholeprogramdevirt",
- "Whole program devirtualization", false, false)
-char WholeProgramDevirt::ID = 0;
-
-ModulePass *
-llvm::createWholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary) {
- return new WholeProgramDevirt(ExportSummary, ImportSummary);
-}
-
PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
ModuleAnalysisManager &AM) {
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 535a7736454c..4a459ec6c550 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1966,12 +1966,14 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
return BinaryOperator::CreateAdd(X, ConstantExpr::getSub(C, C2));
}
- // If there's no chance any bit will need to borrow from an adjacent bit:
- // sub C, X --> xor X, C
const APInt *Op0C;
- if (match(Op0, m_APInt(Op0C)) &&
- (~computeKnownBits(Op1, 0, &I).Zero).isSubsetOf(*Op0C))
- return BinaryOperator::CreateXor(Op1, Op0);
+ if (match(Op0, m_APInt(Op0C)) && Op0C->isMask()) {
+ // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
+ // zero.
+ KnownBits RHSKnown = computeKnownBits(Op1, 0, &I);
+ if ((*Op0C | RHSKnown.Zero).isAllOnes())
+ return BinaryOperator::CreateXor(Op1, Op0);
+ }
{
Value *Y;
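
The new guard only fires when the LHS constant is a low-bit mask and every other bit of the RHS is known zero, which is exactly the condition under which the subtraction cannot borrow. A standalone sketch (not part of the patch) brute-forcing that identity for 8-bit values:

    // If C is a mask (2^n - 1) and X's possible set bits all lie inside the
    // mask, no borrow can occur, so C - X == C ^ X.
    #include <cassert>
    #include <cstdint>

    int main() {
      for (unsigned n = 0; n <= 8; ++n) {
        const uint8_t C = static_cast<uint8_t>((1u << n) - 1); // mask 2^n - 1
        for (unsigned X = 0; X <= C; ++X) // X's bits stay within the mask
          assert(static_cast<uint8_t>(C - X) == static_cast<uint8_t>(C ^ X));
      }
      return 0;
    }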
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index a8f2cd79830a..8253c575bc37 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2664,8 +2664,8 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// Inverted form (example):
// (icmp slt (X | Y), 0) & (icmp sgt (X & Y), -1) -> (icmp slt (X ^ Y), 0)
bool TrueIfSignedL, TrueIfSignedR;
- if (InstCombiner::isSignBitCheck(PredL, *LHSC, TrueIfSignedL) &&
- InstCombiner::isSignBitCheck(PredR, *RHSC, TrueIfSignedR) &&
+ if (isSignBitCheck(PredL, *LHSC, TrueIfSignedL) &&
+ isSignBitCheck(PredR, *RHSC, TrueIfSignedR) &&
(RHS->hasOneUse() || LHS->hasOneUse())) {
Value *X, *Y;
if (IsAnd) {
@@ -3202,25 +3202,38 @@ Value *InstCombinerImpl::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS,
// TODO: This can be generalized to compares of non-signbits using
// decomposeBitTestICmp(). It could be enhanced more by using (something like)
// foldLogOpOfMaskedICmps().
- if ((LHS->hasOneUse() || RHS->hasOneUse()) &&
+ const APInt *LC, *RC;
+ if (match(LHS1, m_APInt(LC)) && match(RHS1, m_APInt(RC)) &&
LHS0->getType() == RHS0->getType() &&
- LHS0->getType()->isIntOrIntVectorTy()) {
+ LHS0->getType()->isIntOrIntVectorTy() &&
+ (LHS->hasOneUse() || RHS->hasOneUse())) {
+ // Convert xor of signbit tests to signbit test of xor'd values:
// (X > -1) ^ (Y > -1) --> (X ^ Y) < 0
// (X < 0) ^ (Y < 0) --> (X ^ Y) < 0
- if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
- PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes())) ||
- (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
- PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero())))
- return Builder.CreateIsNeg(Builder.CreateXor(LHS0, RHS0));
-
// (X > -1) ^ (Y < 0) --> (X ^ Y) > -1
// (X < 0) ^ (Y > -1) --> (X ^ Y) > -1
- if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
- PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero())) ||
- (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
- PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes())))
- return Builder.CreateIsNotNeg(Builder.CreateXor(LHS0, RHS0));
-
+ bool TrueIfSignedL, TrueIfSignedR;
+ if (isSignBitCheck(PredL, *LC, TrueIfSignedL) &&
+ isSignBitCheck(PredR, *RC, TrueIfSignedR)) {
+ Value *XorLR = Builder.CreateXor(LHS0, RHS0);
+ return TrueIfSignedL == TrueIfSignedR ? Builder.CreateIsNeg(XorLR) :
+ Builder.CreateIsNotNeg(XorLR);
+ }
+
+ // (X > C) ^ (X < C + 2) --> X != C + 1
+ // (X < C + 2) ^ (X > C) --> X != C + 1
+ // For correctness, this must not fire when C is non-negative and C + 2 is
+ // negative (i.e. C + 2 wraps); that case is left to other patterns.
+ const APInt *C1, *C2;
+ if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_APInt(C1)) &&
+ PredR == CmpInst::ICMP_SLT && match(RHS1, m_APInt(C2))) ||
+ (PredL == CmpInst::ICMP_SLT && match(LHS1, m_APInt(C2)) &&
+ PredR == CmpInst::ICMP_SGT && match(RHS1, m_APInt(C1))))
+ if (LHS0 == RHS0 && *C1 + 2 == *C2 &&
+ (C1->isNegative() || C2->isNonNegative()))
+ return Builder.CreateICmpNE(LHS0,
+ ConstantInt::get(LHS0->getType(), *C1 + 1));
}
// Instead of trying to imitate the folds for and/or, decompose this 'xor'
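
Both new folds are easy to confirm exhaustively at a small bit width. The sketch below (illustrative only, not part of the patch) checks the sign-bit xor identity and the (X > C) ^ (X < C + 2) --> X != C + 1 fold under the same non-wrapping guard used above:

    #include <cassert>
    #include <cstdint>

    int main() {
      // (X < 0) ^ (Y < 0)  -->  (X ^ Y) < 0
      for (int a = -128; a <= 127; ++a)
        for (int b = -128; b <= 127; ++b) {
          bool LHS = (a < 0) ^ (b < 0);
          bool RHS = static_cast<int8_t>(static_cast<uint8_t>(a) ^
                                         static_cast<uint8_t>(b)) < 0;
          assert(LHS == RHS);
        }

      // (X > C) ^ (X < C + 2)  -->  X != C + 1
      for (int c = -128; c <= 127; ++c) {
        int8_t C1 = static_cast<int8_t>(c);
        int8_t C2 = static_cast<int8_t>(static_cast<uint8_t>(C1) + 2);
        // Same guard as the code: skip when C is non-negative but C + 2 wrapped.
        if (!(C1 < 0 || C2 >= 0))
          continue;
        int8_t Mid = static_cast<int8_t>(static_cast<uint8_t>(C1) + 1);
        for (int x = -128; x <= 127; ++x) {
          int8_t X = static_cast<int8_t>(x);
          assert(((X > C1) ^ (X < C2)) == (X != Mid));
        }
      }
      return 0;
    }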
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index edfdf70c2b97..bc01d2ef7fe2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1140,8 +1140,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
if (Value *V = simplifyCall(&CI, SQ.getWithInstruction(&CI)))
return replaceInstUsesWith(CI, V);
- if (isFreeCall(&CI, &TLI))
- return visitFree(CI);
+ if (Value *FreedOp = getFreedOperand(&CI, &TLI))
+ return visitFree(CI, FreedOp);
// If the caller function (i.e. us, the function that contains this CallInst)
// is nounwind, mark the call as nounwind, even if the callee isn't.
@@ -1539,8 +1539,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Type *Ty = II->getType();
unsigned BitWidth = Ty->getScalarSizeInBits();
Constant *ShAmtC;
- if (match(II->getArgOperand(2), m_ImmConstant(ShAmtC)) &&
- !ShAmtC->containsConstantExpression()) {
+ if (match(II->getArgOperand(2), m_ImmConstant(ShAmtC))) {
// Canonicalize a shift amount constant operand to modulo the bit-width.
Constant *WidthC = ConstantInt::get(Ty, BitWidth);
Constant *ModuloC =
@@ -2885,21 +2884,21 @@ bool InstCombinerImpl::annotateAnyAllocSite(CallBase &Call,
// of the respective allocator declaration with generic attributes.
bool Changed = false;
- if (isAllocationFn(&Call, TLI)) {
- uint64_t Size;
- ObjectSizeOpts Opts;
- if (getObjectSize(&Call, Size, DL, TLI, Opts) && Size > 0) {
- // TODO: We really should just emit deref_or_null here and then
- // let the generic inference code combine that with nonnull.
- if (Call.hasRetAttr(Attribute::NonNull)) {
- Changed = !Call.hasRetAttr(Attribute::Dereferenceable);
- Call.addRetAttr(
- Attribute::getWithDereferenceableBytes(Call.getContext(), Size));
- } else {
- Changed = !Call.hasRetAttr(Attribute::DereferenceableOrNull);
- Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Size));
- }
+ if (!Call.getType()->isPointerTy())
+ return Changed;
+
+ Optional<APInt> Size = getAllocSize(&Call, TLI);
+ if (Size && *Size != 0) {
+ // TODO: We really should just emit deref_or_null here and then
+ // let the generic inference code combine that with nonnull.
+ if (Call.hasRetAttr(Attribute::NonNull)) {
+ Changed = !Call.hasRetAttr(Attribute::Dereferenceable);
+ Call.addRetAttr(Attribute::getWithDereferenceableBytes(
+ Call.getContext(), Size->getLimitedValue()));
+ } else {
+ Changed = !Call.hasRetAttr(Attribute::DereferenceableOrNull);
+ Call.addRetAttr(Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Size->getLimitedValue()));
}
}
@@ -3079,8 +3078,7 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy));
}
- if (isAllocationFn(&Call, &TLI) &&
- isAllocRemovable(&cast<CallBase>(Call), &TLI))
+ if (isRemovableAlloc(&Call, &TLI))
return visitAllocSite(Call);
// Handle intrinsics which can be used in both call and invoke context.
@@ -3242,15 +3240,16 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
// the call because there is no place to put the cast instruction (without
// breaking the critical edge). Bail out in this case.
if (!Caller->use_empty()) {
- if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
- for (User *U : II->users())
+ BasicBlock *PhisNotSupportedBlock = nullptr;
+ if (auto *II = dyn_cast<InvokeInst>(Caller))
+ PhisNotSupportedBlock = II->getNormalDest();
+ if (auto *CB = dyn_cast<CallBrInst>(Caller))
+ PhisNotSupportedBlock = CB->getDefaultDest();
+ if (PhisNotSupportedBlock)
+ for (User *U : Caller->users())
if (PHINode *PN = dyn_cast<PHINode>(U))
- if (PN->getParent() == II->getNormalDest() ||
- PN->getParent() == II->getUnwindDest())
+ if (PN->getParent() == PhisNotSupportedBlock)
return false;
- // FIXME: Be conservative for callbr to avoid a quadratic search.
- if (isa<CallBrInst>(Caller))
- return false;
}
}
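
The rewritten annotation logic boils down to a small decision: a known non-zero allocation size becomes dereferenceable(N) when the return is already nonnull and dereferenceable_or_null(N) otherwise. A minimal sketch of that choice (plain C++, not the LLVM attribute API):

    #include <cassert>
    #include <cstdint>
    #include <optional>
    #include <string>

    std::string pickRetAttr(bool ReturnsPointer, std::optional<uint64_t> AllocSize,
                            bool HasNonNull) {
      if (!ReturnsPointer || !AllocSize || *AllocSize == 0)
        return ""; // nothing to annotate
      return (HasNonNull ? "dereferenceable(" : "dereferenceable_or_null(") +
             std::to_string(*AllocSize) + ")";
    }

    int main() {
      assert(pickRetAttr(true, 32, true) == "dereferenceable(32)");
      assert(pickRetAttr(true, 32, false) == "dereferenceable_or_null(32)");
      assert(pickRetAttr(true, 0, true).empty());   // zero-size: no annotation
      assert(pickRetAttr(false, 32, true).empty()); // non-pointer return
      return 0;
    }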
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 9f6d36b85522..158d2e8289e0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -2002,9 +2002,12 @@ Instruction *InstCombinerImpl::foldICmpMulConstant(ICmpInst &Cmp,
Constant::getNullValue(Mul->getType()));
}
+ if (MulC->isZero() || !(Mul->hasNoSignedWrap() || Mul->hasNoUnsignedWrap()))
+ return nullptr;
+
// If the multiply does not wrap, try to divide the compare constant by the
// multiplication factor.
- if (Cmp.isEquality() && !MulC->isZero()) {
+ if (Cmp.isEquality()) {
// (mul nsw X, MulC) == C --> X == C /s MulC
if (Mul->hasNoSignedWrap() && C.srem(*MulC).isZero()) {
Constant *NewC = ConstantInt::get(Mul->getType(), C.sdiv(*MulC));
@@ -2017,7 +2020,40 @@ Instruction *InstCombinerImpl::foldICmpMulConstant(ICmpInst &Cmp,
}
}
- return nullptr;
+ Constant *NewC = nullptr;
+
+ // FIXME: Add assert that Pred is not equal to ICMP_SGE, ICMP_SLE,
+ // ICMP_UGE, ICMP_ULE.
+
+ if (Mul->hasNoSignedWrap()) {
+ if (MulC->isNegative()) {
+ // MININT / -1 --> overflow.
+ if (C.isMinSignedValue() && MulC->isAllOnes())
+ return nullptr;
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+ if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE)
+ NewC = ConstantInt::get(
+ Mul->getType(),
+ APIntOps::RoundingSDiv(C, *MulC, APInt::Rounding::UP));
+ if (Pred == ICmpInst::ICMP_SLE || Pred == ICmpInst::ICMP_SGT)
+ NewC = ConstantInt::get(
+ Mul->getType(),
+ APIntOps::RoundingSDiv(C, *MulC, APInt::Rounding::DOWN));
+ }
+
+ if (Mul->hasNoUnsignedWrap()) {
+ if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)
+ NewC = ConstantInt::get(
+ Mul->getType(),
+ APIntOps::RoundingUDiv(C, *MulC, APInt::Rounding::UP));
+ if (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT)
+ NewC = ConstantInt::get(
+ Mul->getType(),
+ APIntOps::RoundingUDiv(C, *MulC, APInt::Rounding::DOWN));
+ }
+
+ return NewC ? new ICmpInst(Pred, Mul->getOperand(0), NewC) : nullptr;
}
/// Fold icmp (shl 1, Y), C.
@@ -2235,13 +2271,22 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
bool IsAShr = Shr->getOpcode() == Instruction::AShr;
const APInt *ShiftValC;
- if (match(Shr->getOperand(0), m_APInt(ShiftValC))) {
+ if (match(X, m_APInt(ShiftValC))) {
if (Cmp.isEquality())
return foldICmpShrConstConst(Cmp, Shr->getOperand(1), C, *ShiftValC);
+ // (ShiftValC >> Y) >s -1 --> Y != 0 with ShiftValC < 0
+ // (ShiftValC >> Y) <s 0 --> Y == 0 with ShiftValC < 0
+ bool TrueIfSigned;
+ if (!IsAShr && ShiftValC->isNegative() &&
+ isSignBitCheck(Pred, C, TrueIfSigned))
+ return new ICmpInst(TrueIfSigned ? CmpInst::ICMP_EQ : CmpInst::ICMP_NE,
+ Shr->getOperand(1),
+ ConstantInt::getNullValue(X->getType()));
+
// If the shifted constant is a power-of-2, test the shift amount directly:
- // (ShiftValC >> X) >u C --> X <u (LZ(C) - LZ(ShiftValC))
- // (ShiftValC >> X) <u C --> X >=u (LZ(C-1) - LZ(ShiftValC))
+ // (ShiftValC >> Y) >u C --> Y <u (LZ(C) - LZ(ShiftValC))
+ // (ShiftValC >> Y) <u C --> Y >=u (LZ(C-1) - LZ(ShiftValC))
if (!IsAShr && ShiftValC->isPowerOf2() &&
(Pred == CmpInst::ICMP_UGT || Pred == CmpInst::ICMP_ULT)) {
bool IsUGT = Pred == CmpInst::ICMP_UGT;
@@ -2972,7 +3017,7 @@ Instruction *InstCombinerImpl::foldICmpBitCast(ICmpInst &Cmp) {
const APInt *C;
bool TrueIfSigned;
if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse() &&
- InstCombiner::isSignBitCheck(Pred, *C, TrueIfSigned)) {
+ isSignBitCheck(Pred, *C, TrueIfSigned)) {
if (match(BCSrcOp, m_FPExt(m_Value(X))) ||
match(BCSrcOp, m_FPTrunc(m_Value(X)))) {
// (bitcast (fpext/fptrunc X)) to iX) < 0 --> (bitcast X to iY) < 0
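
The new non-equality handling rests on the usual rounding-division identities: with no signed wrap, X * M <s C is equivalent to X <s ceil(C / M) for positive M, and to X >s floor(C / M) for negative M once the predicate is swapped. A brute-force sanity check of those two cases (a sketch, not part of the patch):

    #include <cassert>

    static int floordiv(int a, int b) {
      int q = a / b, r = a % b;
      return (r != 0 && (r < 0) != (b < 0)) ? q - 1 : q;
    }
    static int ceildiv(int a, int b) {
      int q = a / b, r = a % b;
      return (r != 0 && (r < 0) == (b < 0)) ? q + 1 : q;
    }

    int main() {
      const int Ms[] = {3, -3};
      for (int M : Ms)
        for (int C = -128; C <= 127; ++C)
          for (int X = -128; X <= 127; ++X) {
            if (X * M < -128 || X * M > 127)
              continue; // outside the nsw precondition
            bool Orig = X * M < C;
            bool Folded = M > 0 ? X < ceildiv(C, M) : X > floordiv(C, M);
            assert(Orig == Folded);
          }
      return 0;
    }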
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 271154bb3f5a..827b25533513 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -152,7 +152,7 @@ public:
Instruction *visitGEPOfBitcast(BitCastInst *BCI, GetElementPtrInst &GEP);
Instruction *visitAllocaInst(AllocaInst &AI);
Instruction *visitAllocSite(Instruction &FI);
- Instruction *visitFree(CallInst &FI);
+ Instruction *visitFree(CallInst &FI, Value *FreedOp);
Instruction *visitLoadInst(LoadInst &LI);
Instruction *visitStoreInst(StoreInst &SI);
Instruction *visitAtomicRMWInst(AtomicRMWInst &SI);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index f4e2d1239f0f..13c98b935adf 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -566,6 +566,13 @@ static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
return false;
return true;
}
+ case Instruction::Mul: {
+ const APInt *MulConst;
+ // We can fold (shr (mul X, -(1 << C)), C) -> (and (neg X), C')
+ return !IsLeftShift && match(I->getOperand(1), m_APInt(MulConst)) &&
+ MulConst->isNegatedPowerOf2() &&
+ MulConst->countTrailingZeros() == NumBits;
+ }
}
}
@@ -680,6 +687,17 @@ static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
isLeftShift, IC, DL));
return PN;
}
+ case Instruction::Mul: {
+ assert(!isLeftShift && "Unexpected shift direction!");
+ auto *Neg = BinaryOperator::CreateNeg(I->getOperand(0));
+ IC.InsertNewInstWith(Neg, *I);
+ unsigned TypeWidth = I->getType()->getScalarSizeInBits();
+ APInt Mask = APInt::getLowBitsSet(TypeWidth, TypeWidth - NumBits);
+ auto *And = BinaryOperator::CreateAnd(Neg,
+ ConstantInt::get(I->getType(), Mask));
+ And->takeName(I);
+ return IC.InsertNewInstWith(And, *I);
+ }
}
}
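
The new Mul case implements the fold lshr (mul X, -(1 << C)), C --> and (neg X), Mask, where Mask keeps the low BitWidth - C bits. A standalone 8-bit brute-force check of that equivalence for C = 2 (illustrative only):

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned C = 2;
      const uint8_t MulConst = static_cast<uint8_t>(-(1 << C)); // 0xFC
      const uint8_t Mask = static_cast<uint8_t>(0xFF >> C);     // low 6 bits set
      for (unsigned x = 0; x <= 255; ++x) {
        uint8_t X = static_cast<uint8_t>(x);
        uint8_t Shifted = static_cast<uint8_t>(X * MulConst) >> C;
        uint8_t Folded = static_cast<uint8_t>(-X) & Mask;
        assert(Shifted == Folded);
      }
      return 0;
    }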
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 75520a0c8d5f..71c763de43b4 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -994,6 +994,24 @@ Instruction *InstCombinerImpl::foldBinopOfSextBoolToSelect(BinaryOperator &BO) {
return SelectInst::Create(X, TVal, FVal);
}
+static Constant *constantFoldOperationIntoSelectOperand(
+ Instruction &I, SelectInst *SI, Value *SO) {
+ auto *ConstSO = dyn_cast<Constant>(SO);
+ if (!ConstSO)
+ return nullptr;
+
+ SmallVector<Constant *> ConstOps;
+ for (Value *Op : I.operands()) {
+ if (Op == SI)
+ ConstOps.push_back(ConstSO);
+ else if (auto *C = dyn_cast<Constant>(Op))
+ ConstOps.push_back(C);
+ else
+ llvm_unreachable("Operands should be select or constant");
+ }
+ return ConstantFoldInstOperands(&I, ConstOps, I.getModule()->getDataLayout());
+}
+
static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
InstCombiner::BuilderTy &Builder) {
if (auto *Cast = dyn_cast<CastInst>(&I))
@@ -1101,8 +1119,17 @@ Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
}
}
- Value *NewTV = foldOperationIntoSelectOperand(Op, TV, Builder);
- Value *NewFV = foldOperationIntoSelectOperand(Op, FV, Builder);
+ // Make sure that one of the select arms constant folds successfully.
+ Value *NewTV = constantFoldOperationIntoSelectOperand(Op, SI, TV);
+ Value *NewFV = constantFoldOperationIntoSelectOperand(Op, SI, FV);
+ if (!NewTV && !NewFV)
+ return nullptr;
+
+ // Create an instruction for the arm that did not fold.
+ if (!NewTV)
+ NewTV = foldOperationIntoSelectOperand(Op, TV, Builder);
+ if (!NewFV)
+ NewFV = foldOperationIntoSelectOperand(Op, FV, Builder);
return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI);
}
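
The reworked FoldOpIntoSelect now refuses to act unless at least one select arm constant-folds, so it never trades one instruction for two new ones. A toy model of that gating (plain C++, not the LLVM API; the names here are made up for illustration):

    #include <cassert>
    #include <functional>
    #include <optional>
    #include <string>
    #include <utility>

    struct Val {
      std::optional<int> Konst; // set if this arm is a compile-time constant
      std::string Name;         // otherwise an opaque SSA-like name
    };

    // Returns the folded arms, or nullopt if neither arm is constant,
    // mirroring "if (!NewTV && !NewFV) return nullptr".
    std::optional<std::pair<Val, Val>>
    foldOpIntoSelect(const Val &TV, const Val &FV, std::function<int(int)> Op) {
      std::optional<int> NewT =
          TV.Konst ? std::optional<int>(Op(*TV.Konst)) : std::nullopt;
      std::optional<int> NewF =
          FV.Konst ? std::optional<int>(Op(*FV.Konst)) : std::nullopt;
      if (!NewT && !NewF)
        return std::nullopt;
      Val T = NewT ? Val{NewT, ""} : Val{std::nullopt, "op(" + TV.Name + ")"};
      Val F = NewF ? Val{NewF, ""} : Val{std::nullopt, "op(" + FV.Name + ")"};
      return std::make_pair(T, F);
    }

    int main() {
      auto AddTen = [](int X) { return X + 10; };
      // One constant arm: fold it, materialize the op only for the other arm.
      auto R = foldOpIntoSelect(Val{7, ""}, Val{std::nullopt, "%x"}, AddTen);
      assert(R && *R->first.Konst == 17 && R->second.Name == "op(%x)");
      // No constant arm: refuse, as the rewritten code now does.
      assert(!foldOpIntoSelect(Val{std::nullopt, "%a"}, Val{std::nullopt, "%b"},
                               AddTen));
      return 0;
    }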
@@ -2774,13 +2801,14 @@ static bool isAllocSiteRemovable(Instruction *AI,
continue;
}
- if (isFreeCall(I, &TLI) && getAllocationFamily(I, &TLI) == Family) {
+ if (getFreedOperand(cast<CallBase>(I), &TLI) == PI &&
+ getAllocationFamily(I, &TLI) == Family) {
assert(Family);
Users.emplace_back(I);
continue;
}
- if (isReallocLikeFn(I, &TLI) &&
+ if (getReallocatedOperand(cast<CallBase>(I), &TLI) == PI &&
getAllocationFamily(I, &TLI) == Family) {
assert(Family);
Users.emplace_back(I);
@@ -2805,7 +2833,7 @@ static bool isAllocSiteRemovable(Instruction *AI,
}
Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
- assert(isa<AllocaInst>(MI) || isAllocRemovable(&cast<CallBase>(MI), &TLI));
+ assert(isa<AllocaInst>(MI) || isRemovableAlloc(&cast<CallBase>(MI), &TLI));
// If we have a malloc call which is only used in any amount of comparisons to
// null and free calls, delete the calls and replace the comparisons with true
@@ -3007,9 +3035,7 @@ static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
return &FI;
}
-Instruction *InstCombinerImpl::visitFree(CallInst &FI) {
- Value *Op = FI.getArgOperand(0);
-
+Instruction *InstCombinerImpl::visitFree(CallInst &FI, Value *Op) {
// free undef -> unreachable.
if (isa<UndefValue>(Op)) {
// Leave a marker since we can't modify the CFG here.
@@ -3024,12 +3050,10 @@ Instruction *InstCombinerImpl::visitFree(CallInst &FI) {
// If we had free(realloc(...)) with no intervening uses, then eliminate the
// realloc() entirely.
- if (CallInst *CI = dyn_cast<CallInst>(Op)) {
- if (CI->hasOneUse() && isReallocLikeFn(CI, &TLI)) {
- return eraseInstFromFunction(
- *replaceInstUsesWith(*CI, CI->getOperand(0)));
- }
- }
+ CallInst *CI = dyn_cast<CallInst>(Op);
+ if (CI && CI->hasOneUse())
+ if (Value *ReallocatedOp = getReallocatedOperand(CI, &TLI))
+ return eraseInstFromFunction(*replaceInstUsesWith(*CI, ReallocatedOp));
// If we optimize for code size, try to move the call to free before the null
// test so that simplify cfg can remove the empty block and dead code
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 4fed4bd18fb1..cf2754b1dd60 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -108,6 +108,7 @@ static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
static const uint64_t kRISCV64_ShadowOffset64 = 0xd55550000;
static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kFreeBSDAArch64_ShadowOffset64 = 1ULL << 47;
static const uint64_t kFreeBSDKasan_ShadowOffset64 = 0xdffff7c000000000;
static const uint64_t kNetBSD_ShadowOffset32 = 1ULL << 30;
static const uint64_t kNetBSD_ShadowOffset64 = 1ULL << 46;
@@ -523,6 +524,8 @@ static ShadowMapping getShadowMapping(const Triple &TargetTriple, int LongSize,
Mapping.Offset = kPPC64_ShadowOffset64;
else if (IsSystemZ)
Mapping.Offset = kSystemZ_ShadowOffset64;
+ else if (IsFreeBSD && IsAArch64)
+ Mapping.Offset = kFreeBSDAArch64_ShadowOffset64;
else if (IsFreeBSD && !IsMIPS64) {
if (IsKasan)
Mapping.Offset = kFreeBSDKasan_ShadowOffset64;
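
For orientation (not part of the patch): with ASan's usual shadow scale of 3, an application address maps to shadow as (Addr >> 3) + Offset, so the new FreeBSD/AArch64 constant places the shadow region at 1 << 47. A small sketch of that arithmetic with an arbitrary example address:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t kFreeBSDAArch64_ShadowOffset64 = 1ULL << 47;
      const unsigned kShadowScale = 3; // each shadow byte covers 8 app bytes
      uint64_t AppAddr = 0x0000123456789ab0ULL; // arbitrary example address
      uint64_t Shadow = (AppAddr >> kShadowScale) + kFreeBSDAArch64_ShadowOffset64;
      std::printf("app 0x%016llx -> shadow 0x%016llx\n",
                  (unsigned long long)AppAddr, (unsigned long long)Shadow);
      return 0;
    }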
diff --git a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
index 57c491436b93..27107f46ed92 100644
--- a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
+++ b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
@@ -101,42 +101,6 @@ static bool runCGProfilePass(
return addModuleFlags(M, Counts);
}
-namespace {
-struct CGProfileLegacyPass final : public ModulePass {
- static char ID;
- CGProfileLegacyPass() : ModulePass(ID) {
- initializeCGProfileLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<LazyBlockFrequencyInfoPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
- bool runOnModule(Module &M) override {
- auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
- return this->getAnalysis<LazyBlockFrequencyInfoPass>(F).getBFI();
- };
- auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
- return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- };
-
- return runCGProfilePass(M, GetBFI, GetTTI, true);
- }
-};
-
-} // namespace
-
-char CGProfileLegacyPass::ID = 0;
-
-INITIALIZE_PASS(CGProfileLegacyPass, "cg-profile", "Call Graph Profile", false,
- false)
-
-ModulePass *llvm::createCGProfileLegacyPass() {
- return new CGProfileLegacyPass();
-}
-
PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) {
FunctionAnalysisManager &FAM =
MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index e5c0705b916e..adc007dacae4 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -103,47 +103,6 @@ static void parseCHRFilterFiles() {
}
namespace {
-class ControlHeightReductionLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- ControlHeightReductionLegacyPass() : FunctionPass(ID) {
- initializeControlHeightReductionLegacyPassPass(
- *PassRegistry::getPassRegistry());
- parseCHRFilterFiles();
- }
-
- bool runOnFunction(Function &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<RegionInfoPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-} // end anonymous namespace
-
-char ControlHeightReductionLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(ControlHeightReductionLegacyPass,
- "chr",
- "Reduce control height in the hot paths",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
-INITIALIZE_PASS_END(ControlHeightReductionLegacyPass,
- "chr",
- "Reduce control height in the hot paths",
- false, false)
-
-FunctionPass *llvm::createControlHeightReductionLegacyPass() {
- return new ControlHeightReductionLegacyPass();
-}
-
-namespace {
struct CHRStats {
CHRStats() = default;
@@ -2083,18 +2042,6 @@ bool CHR::run() {
return Changed;
}
-bool ControlHeightReductionLegacyPass::runOnFunction(Function &F) {
- BlockFrequencyInfo &BFI =
- getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- ProfileSummaryInfo &PSI =
- getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- RegionInfo &RI = getAnalysis<RegionInfoPass>().getRegionInfo();
- std::unique_ptr<OptimizationRemarkEmitter> OwnedORE =
- std::make_unique<OptimizationRemarkEmitter>(&F);
- return CHR(F, BFI, DT, PSI, RI, *OwnedORE).run();
-}
-
namespace llvm {
ControlHeightReductionPass::ControlHeightReductionPass() {
diff --git a/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp b/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp
index 2091881c29fe..d7561c193aa3 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp
@@ -163,42 +163,11 @@ public:
}
}; // End of InstrOrderFile struct
-
-class InstrOrderFileLegacyPass : public ModulePass {
-public:
- static char ID;
-
- InstrOrderFileLegacyPass() : ModulePass(ID) {
- initializeInstrOrderFileLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-};
-
} // End anonymous namespace
-bool InstrOrderFileLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- return InstrOrderFile().run(M);
-}
-
PreservedAnalyses
InstrOrderFilePass::run(Module &M, ModuleAnalysisManager &AM) {
if (InstrOrderFile().run(M))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
}
-
-INITIALIZE_PASS_BEGIN(InstrOrderFileLegacyPass, "instrorderfile",
- "Instrumentation for Order File", false, false)
-INITIALIZE_PASS_END(InstrOrderFileLegacyPass, "instrorderfile",
- "Instrumentation for Order File", false, false)
-
-char InstrOrderFileLegacyPass::ID = 0;
-
-ModulePass *llvm::createInstrOrderFilePass() {
- return new InstrOrderFileLegacyPass();
-}
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 3572cb3b50e2..5b7aa304b987 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -147,35 +147,6 @@ cl::opt<bool> SkipRetExitBlock(
"skip-ret-exit-block", cl::init(true),
cl::desc("Suppress counter promotion if exit blocks contain ret."));
-class InstrProfilingLegacyPass : public ModulePass {
- InstrProfiling InstrProf;
-
-public:
- static char ID;
-
- InstrProfilingLegacyPass() : ModulePass(ID) {}
- InstrProfilingLegacyPass(const InstrProfOptions &Options, bool IsCS = false)
- : ModulePass(ID), InstrProf(Options, IsCS) {
- initializeInstrProfilingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override {
- return "Frontend instrumentation-based coverage lowering";
- }
-
- bool runOnModule(Module &M) override {
- auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- return InstrProf.run(M, GetTLI);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
///
/// A helper class to promote one counter RMW operation in the loop
/// into register update.
@@ -439,21 +410,6 @@ PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) {
return PreservedAnalyses::none();
}
-char InstrProfilingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(InstrProfilingLegacyPass, "instrprof",
- "Frontend instrumentation-based coverage lowering.",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(InstrProfilingLegacyPass, "instrprof",
- "Frontend instrumentation-based coverage lowering.", false,
- false)
-
-ModulePass *
-llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options,
- bool IsCS) {
- return new InstrProfilingLegacyPass(Options, IsCS);
-}
-
bool InstrProfiling::lowerIntrinsics(Function *F) {
bool MadeChange = false;
PromotionCandidates.clear();
diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
index 9ff0e632bd7f..bd575b6cf3b0 100644
--- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -94,11 +94,6 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
initializeMemProfilerLegacyPassPass(Registry);
initializeModuleMemProfilerLegacyPassPass(Registry);
initializeBoundsCheckingLegacyPassPass(Registry);
- initializeControlHeightReductionLegacyPassPass(Registry);
- initializeCGProfileLegacyPassPass(Registry);
- initializeInstrOrderFileLegacyPassPass(Registry);
- initializeInstrProfilingLegacyPassPass(Registry);
- initializeModuleSanitizerCoverageLegacyPassPass(Registry);
initializeDataFlowSanitizerLegacyPassPass(Registry);
}
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 4d72f6c3d1a9..4606bd5de6c3 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -417,6 +417,14 @@ static const MemoryMapParams Linux_AArch64_MemoryMapParams = {
0x01000000000, // OriginBase
};
+// aarch64 FreeBSD
+static const MemoryMapParams FreeBSD_AArch64_MemoryMapParams = {
+ 0x1800000000000, // AndMask
+ 0x0400000000000, // XorMask
+ 0x0200000000000, // ShadowBase
+ 0x0700000000000, // OriginBase
+};
+
// i386 FreeBSD
static const MemoryMapParams FreeBSD_I386_MemoryMapParams = {
0x000180000000, // AndMask
@@ -466,6 +474,11 @@ static const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = {
&Linux_AArch64_MemoryMapParams,
};
+static const PlatformMemoryMapParams FreeBSD_ARM_MemoryMapParams = {
+ nullptr,
+ &FreeBSD_AArch64_MemoryMapParams,
+};
+
static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = {
&FreeBSD_I386_MemoryMapParams,
&FreeBSD_X86_64_MemoryMapParams,
@@ -894,6 +907,9 @@ void MemorySanitizer::initializeModule(Module &M) {
switch (TargetTriple.getOS()) {
case Triple::FreeBSD:
switch (TargetTriple.getArch()) {
+ case Triple::aarch64:
+ MapParams = FreeBSD_ARM_MemoryMapParams.bits64;
+ break;
case Triple::x86_64:
MapParams = FreeBSD_X86_MemoryMapParams.bits64;
break;
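
A rough sketch of how these parameters would be applied; the shadow = ((addr & ~AndMask) ^ XorMask) + ShadowBase shape is my reading of MSan's usual userspace mapping and is not spelled out in this hunk, so treat it as an assumption:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t AndMask    = 0x1800000000000ULL;
      const uint64_t XorMask    = 0x0400000000000ULL;
      const uint64_t ShadowBase = 0x0200000000000ULL;
      const uint64_t OriginBase = 0x0700000000000ULL;
      uint64_t Addr = 0x00001234567890ULL; // arbitrary example address
      uint64_t Off = (Addr & ~AndMask) ^ XorMask;
      std::printf("shadow 0x%013llx  origin 0x%013llx\n",
                  (unsigned long long)(Off + ShadowBase),
                  (unsigned long long)(Off + OriginBase));
      return 0;
    }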
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 3a29cd70e42e..c4512d0222cd 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -210,12 +210,11 @@ cl::opt<bool>
// Command line option to enable/disable the warning about a hash mismatch in
// the profile data for Comdat functions, which often turns out to be false
// positive due to the pre-instrumentation inline.
-static cl::opt<bool>
- NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat", cl::init(true),
- cl::Hidden,
- cl::desc("The option is used to turn on/off "
- "warnings about hash mismatch for comdat "
- "functions."));
+static cl::opt<bool> NoPGOWarnMismatchComdatWeak(
+ "no-pgo-warn-mismatch-comdat-weak", cl::init(true), cl::Hidden,
+ cl::desc("The option is used to turn on/off "
+ "warnings about hash mismatch for comdat "
+ "or weak functions."));
// Command line option to enable/disable select instruction instrumentation.
static cl::opt<bool>
@@ -287,6 +286,11 @@ static cl::opt<unsigned> PGOVerifyBFICutoff(
cl::desc("Set the threshold for pgo-verify-bfi: skip the counts whose "
"profile count value is below."));
+static cl::opt<std::string> PGOTraceFuncHash(
+ "pgo-trace-func-hash", cl::init("-"), cl::Hidden,
+ cl::value_desc("function name"),
+ cl::desc("Trace the hash of the function with this name."));
+
namespace llvm {
// Command line option to turn on CFG dot dump after profile annotation.
// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts
@@ -630,6 +634,10 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
<< ", High32 CRC = " << JCH.getCRC());
}
LLVM_DEBUG(dbgs() << ", Hash = " << FunctionHash << "\n";);
+
+ if (PGOTraceFuncHash != "-" && F.getName().contains(PGOTraceFuncHash))
+ dbgs() << "Funcname=" << F.getName() << ", Hash=" << FunctionHash
+ << " in building " << F.getParent()->getSourceFileName() << "\n";
}
// Check if we can safely rename this Comdat function.
@@ -832,8 +840,6 @@ static void instrumentOneFunc(
auto CFGHash = ConstantInt::get(Type::getInt64Ty(M->getContext()),
FuncInfo.FunctionHash);
if (PGOFunctionEntryCoverage) {
- assert(!IsCS &&
- "entry coverge does not support context-sensitive instrumentation");
auto &EntryBB = F.getEntryBlock();
IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt());
// llvm.instrprof.cover(i8* <name>, i64 <hash>, i32 <num-counters>,
@@ -1216,8 +1222,9 @@ static void annotateFunctionWithHashMismatch(Function &F,
bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros,
bool &AllMinusOnes) {
auto &Ctx = M->getContext();
- Expected<InstrProfRecord> Result =
- PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash);
+ uint64_t MismatchedFuncSum = 0;
+ Expected<InstrProfRecord> Result = PGOReader->getInstrProfRecord(
+ FuncInfo.FuncName, FuncInfo.FunctionHash, &MismatchedFuncSum);
if (Error E = Result.takeError()) {
handleAllErrors(std::move(E), [&](const InstrProfError &IPE) {
auto Err = IPE.get();
@@ -1233,10 +1240,11 @@ bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros,
IsCS ? NumOfCSPGOMismatch++ : NumOfPGOMismatch++;
SkipWarning =
NoPGOWarnMismatch ||
- (NoPGOWarnMismatchComdat &&
- (F.hasComdat() ||
+ (NoPGOWarnMismatchComdatWeak &&
+ (F.hasComdat() || F.getLinkage() == GlobalValue::WeakAnyLinkage ||
F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
- LLVM_DEBUG(dbgs() << "hash mismatch (skip=" << SkipWarning << ")");
+ LLVM_DEBUG(dbgs() << "hash mismatch (hash= " << FuncInfo.FunctionHash
+ << " skip=" << SkipWarning << ")");
// Emit function metadata indicating PGO profile mismatch.
annotateFunctionWithHashMismatch(F, M->getContext());
}
@@ -1245,9 +1253,11 @@ bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros,
if (SkipWarning)
return;
- std::string Msg = IPE.message() + std::string(" ") + F.getName().str() +
- std::string(" Hash = ") +
- std::to_string(FuncInfo.FunctionHash);
+ std::string Msg =
+ IPE.message() + std::string(" ") + F.getName().str() +
+ std::string(" Hash = ") + std::to_string(FuncInfo.FunctionHash) +
+ std::string(" up to ") + std::to_string(MismatchedFuncSum) +
+ std::string(" count discarded");
Ctx.diagnose(
DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
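
The new option uses "-" as a disabled sentinel and otherwise does a substring match on the function name before printing the hash. A minimal sketch of that filter (not the cl::opt machinery):

    #include <cassert>
    #include <string>

    bool shouldTraceFuncHash(const std::string &TraceOpt,
                             const std::string &FuncName) {
      return TraceOpt != "-" && FuncName.find(TraceOpt) != std::string::npos;
    }

    int main() {
      assert(!shouldTraceFuncHash("-", "foo"));      // default: tracing off
      assert(shouldTraceFuncHash("foo", "_Z3foov")); // substring match
      assert(!shouldTraceFuncHash("bar", "_Z3foov"));
      return 0;
    }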
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index d9d11cc90d3d..3ca476e74953 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -278,53 +278,6 @@ private:
const SpecialCaseList *Allowlist;
const SpecialCaseList *Blocklist;
};
-
-class ModuleSanitizerCoverageLegacyPass : public ModulePass {
-public:
- ModuleSanitizerCoverageLegacyPass(
- const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(),
- const std::vector<std::string> &AllowlistFiles =
- std::vector<std::string>(),
- const std::vector<std::string> &BlocklistFiles =
- std::vector<std::string>())
- : ModulePass(ID), Options(Options) {
- if (AllowlistFiles.size() > 0)
- Allowlist = SpecialCaseList::createOrDie(AllowlistFiles,
- *vfs::getRealFileSystem());
- if (BlocklistFiles.size() > 0)
- Blocklist = SpecialCaseList::createOrDie(BlocklistFiles,
- *vfs::getRealFileSystem());
- initializeModuleSanitizerCoverageLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
- bool runOnModule(Module &M) override {
- ModuleSanitizerCoverage ModuleSancov(Options, Allowlist.get(),
- Blocklist.get());
- auto DTCallback = [this](Function &F) -> const DominatorTree * {
- return &this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
- };
- auto PDTCallback = [this](Function &F) -> const PostDominatorTree * {
- return &this->getAnalysis<PostDominatorTreeWrapperPass>(F)
- .getPostDomTree();
- };
- return ModuleSancov.instrumentModule(M, DTCallback, PDTCallback);
- }
-
- static char ID; // Pass identification, replacement for typeid
- StringRef getPassName() const override { return "ModuleSanitizerCoverage"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- }
-
-private:
- SanitizerCoverageOptions Options;
-
- std::unique_ptr<SpecialCaseList> Allowlist;
- std::unique_ptr<SpecialCaseList> Blocklist;
-};
-
} // namespace
PreservedAnalyses ModuleSanitizerCoveragePass::run(Module &M,
@@ -1075,20 +1028,3 @@ ModuleSanitizerCoverage::getSectionEnd(const std::string &Section) const {
return "\1section$end$__DATA$__" + Section;
return "__stop___" + Section;
}
-
-char ModuleSanitizerCoverageLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ModuleSanitizerCoverageLegacyPass, "sancov",
- "Pass for instrumenting coverage on functions", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ModuleSanitizerCoverageLegacyPass, "sancov",
- "Pass for instrumenting coverage on functions", false,
- false)
-ModulePass *llvm::createModuleSanitizerCoverageLegacyPassPass(
- const SanitizerCoverageOptions &Options,
- const std::vector<std::string> &AllowlistFiles,
- const std::vector<std::string> &BlocklistFiles) {
- return new ModuleSanitizerCoverageLegacyPass(Options, AllowlistFiles,
- BlocklistFiles);
-}
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 4c42869dbd58..3f0dad7ee769 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -776,6 +776,11 @@ struct DSEState {
// fall back to CFG scan starting from all non-unreachable roots.
bool AnyUnreachableExit;
+ // Whether or not we should iterate on removing dead stores at the end of the
+ // function due to removing a store causing a previously captured pointer to
+ // no longer be captured.
+ bool ShouldIterateEndOfFunctionDSE;
+
// Class contains self-reference, make sure it's not copied/moved.
DSEState(const DSEState &) = delete;
DSEState &operator=(const DSEState &) = delete;
@@ -1103,9 +1108,8 @@ struct DSEState {
return {std::make_pair(MemoryLocation(Ptr, Len), false)};
if (auto *CB = dyn_cast<CallBase>(I)) {
- if (isFreeCall(I, &TLI))
- return {std::make_pair(MemoryLocation::getAfter(CB->getArgOperand(0)),
- true)};
+ if (Value *FreedOp = getFreedOperand(CB, &TLI))
+ return {std::make_pair(MemoryLocation::getAfter(FreedOp), true)};
}
return None;
@@ -1114,9 +1118,9 @@ struct DSEState {
/// Returns true if \p I is a memory terminator instruction like
/// llvm.lifetime.end or free.
bool isMemTerminatorInst(Instruction *I) const {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
- return (II && II->getIntrinsicID() == Intrinsic::lifetime_end) ||
- isFreeCall(I, &TLI);
+ auto *CB = dyn_cast<CallBase>(I);
+ return CB && (CB->getIntrinsicID() == Intrinsic::lifetime_end ||
+ getFreedOperand(CB, &TLI) != nullptr);
}
/// Returns true if \p MaybeTerm is a memory terminator for \p Loc from
@@ -1598,6 +1602,14 @@ struct DSEState {
if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
SkipStores.insert(MD);
+ if (auto *SI = dyn_cast<StoreInst>(MD->getMemoryInst())) {
+ if (SI->getValueOperand()->getType()->isPointerTy()) {
+ const Value *UO = getUnderlyingObject(SI->getValueOperand());
+ if (CapturedBeforeReturn.erase(UO))
+ ShouldIterateEndOfFunctionDSE = true;
+ InvisibleToCallerAfterRet.erase(UO);
+ }
+ }
}
Updater.removeMemoryAccess(MA);
@@ -1671,33 +1683,36 @@ struct DSEState {
LLVM_DEBUG(
dbgs()
<< "Trying to eliminate MemoryDefs at the end of the function\n");
- for (MemoryDef *Def : llvm::reverse(MemDefs)) {
- if (SkipStores.contains(Def))
- continue;
+ do {
+ ShouldIterateEndOfFunctionDSE = false;
+ for (MemoryDef *Def : llvm::reverse(MemDefs)) {
+ if (SkipStores.contains(Def))
+ continue;
- Instruction *DefI = Def->getMemoryInst();
- auto DefLoc = getLocForWrite(DefI);
- if (!DefLoc || !isRemovable(DefI))
- continue;
+ Instruction *DefI = Def->getMemoryInst();
+ auto DefLoc = getLocForWrite(DefI);
+ if (!DefLoc || !isRemovable(DefI))
+ continue;
- // NOTE: Currently eliminating writes at the end of a function is limited
- // to MemoryDefs with a single underlying object, to save compile-time. In
- // practice it appears the case with multiple underlying objects is very
- // uncommon. If it turns out to be important, we can use
- // getUnderlyingObjects here instead.
- const Value *UO = getUnderlyingObject(DefLoc->Ptr);
- if (!isInvisibleToCallerAfterRet(UO))
- continue;
+ // NOTE: Currently eliminating writes at the end of a function is
+ // limited to MemoryDefs with a single underlying object, to save
+ // compile-time. In practice it appears the case with multiple
+ // underlying objects is very uncommon. If it turns out to be important,
+ // we can use getUnderlyingObjects here instead.
+ const Value *UO = getUnderlyingObject(DefLoc->Ptr);
+ if (!isInvisibleToCallerAfterRet(UO))
+ continue;
- if (isWriteAtEndOfFunction(Def)) {
- // See through pointer-to-pointer bitcasts
- LLVM_DEBUG(dbgs() << " ... MemoryDef is not accessed until the end "
- "of the function\n");
- deleteDeadInstruction(DefI);
- ++NumFastStores;
- MadeChange = true;
+ if (isWriteAtEndOfFunction(Def)) {
+ // See through pointer-to-pointer bitcasts
+ LLVM_DEBUG(dbgs() << " ... MemoryDef is not accessed until the end "
+ "of the function\n");
+ deleteDeadInstruction(DefI);
+ ++NumFastStores;
+ MadeChange = true;
+ }
}
- }
+ } while (ShouldIterateEndOfFunctionDSE);
return MadeChange;
}
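
The scan is now wrapped in a do/while so that deleting a store which un-captures a pointer triggers another pass over the remaining MemoryDefs. A generic sketch of that fixed-point shape (illustrative only, no DSE semantics):

    #include <cassert>
    #include <set>
    #include <vector>

    int main() {
      // "Stores" 0..4; store I is removable once no captured pointer blocks it.
      std::set<int> Captured = {2};                   // pretend pointer 2 is captured
      std::vector<int> Blockers = {2, -1, -1, 2, -1}; // blocking pointer per store
      std::vector<bool> Removed(5, false);
      bool ShouldIterate;
      do {
        ShouldIterate = false;
        for (int I = 4; I >= 0; --I) {
          if (Removed[I])
            continue;
          if (Blockers[I] != -1 && Captured.count(Blockers[I]))
            continue;              // still captured, keep the store
          Removed[I] = true;
          if (Captured.erase(I))   // removing store I un-captures pointer I
            ShouldIterate = true;  // so scan again from the top
        }
      } while (ShouldIterate);
      // Store 3 is skipped at first because pointer 2 looks captured; removing
      // store 2 clears that, and the second pass deletes store 3 as well.
      for (bool R : Removed)
        assert(R);
      return 0;
    }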
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index a9ca0bdc8f7b..9698ed97379e 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1738,7 +1738,7 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
// through *explicit* control flow. We have to eliminate the possibility of
// implicit exits (see below) before we know it's truly exact.
const SCEV *ExactBTC = SE->getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(ExactBTC) || !isSafeToExpand(ExactBTC, *SE))
+ if (isa<SCEVCouldNotCompute>(ExactBTC) || !Rewriter.isSafeToExpand(ExactBTC))
return false;
assert(SE->isLoopInvariant(ExactBTC, L) && "BTC must be loop invariant");
@@ -1769,7 +1769,8 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
return true;
const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
- if (isa<SCEVCouldNotCompute>(ExitCount) || !isSafeToExpand(ExitCount, *SE))
+ if (isa<SCEVCouldNotCompute>(ExitCount) ||
+ !Rewriter.isSafeToExpand(ExitCount))
return true;
assert(SE->isLoopInvariant(ExitCount, L) &&
diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index b54cf5e7cb20..328615011ceb 100644
--- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -1451,7 +1451,7 @@ bool LoopConstrainer::run() {
return false;
}
- if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) {
+ if (!Expander.isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt)) {
LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
<< " preloop exit limit " << *ExitPreLoopAtSCEV
<< " at block " << InsertPt->getParent()->getName()
@@ -1478,7 +1478,7 @@ bool LoopConstrainer::run() {
return false;
}
- if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) {
+ if (!Expander.isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt)) {
LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
<< " main loop exit limit " << *ExitMainLoopAtSCEV
<< " at block " << InsertPt->getParent()->getName()
diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index 9590fbbb1994..fd2eaee8b47d 100644
--- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -388,15 +388,15 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride))
continue;
+ BasicBlock *BB = P.InsertPt->getParent();
+ SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr");
const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr(
SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
P.LSCEVAddRec->getStepRecurrence(*SE)));
- if (!isSafeToExpand(NextLSCEV, *SE))
+ if (!SCEVE.isSafeToExpand(NextLSCEV))
continue;
- BasicBlock *BB = P.InsertPt->getParent();
Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), 0/*PtrAddrSpace*/);
- SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr");
Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt);
IRBuilder<> Builder(P.InsertPt);
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index d908c151d9f2..3ed022f65d9a 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1129,7 +1129,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// TODO: ideally we should still be able to generate memset if SCEV expander
// is taught to generate the dependencies at the latest point.
- if (!isSafeToExpand(Start, *SE))
+ if (!Expander.isSafeToExpand(Start))
return Changed;
// Okay, we have a strided store "p[i]" of a splattable value. We can turn
@@ -1163,7 +1163,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// TODO: ideally we should still be able to generate memset if SCEV expander
// is taught to generate the dependencies at the latest point.
- if (!isSafeToExpand(NumBytesS, *SE))
+ if (!Expander.isSafeToExpand(NumBytesS))
return Changed;
Value *NumBytes =
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 1d3023d04463..18daa4295224 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -288,7 +288,6 @@ static void populateWorklist(Loop &L, LoopVector &LoopList) {
Vec = &CurrentLoop->getSubLoops();
}
LoopList.push_back(CurrentLoop);
- return;
}
namespace {
diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
index d0ee5b47a8ca..b327d38d2a84 100644
--- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -275,7 +275,8 @@ class LoopPredication {
/// which is that an expression *can be made* invariant via SCEVExpander.
 /// Thus, this version is only suitable for finding an insert point to be
/// passed to SCEVExpander!
- Instruction *findInsertPt(Instruction *User, ArrayRef<const SCEV*> Ops);
+ Instruction *findInsertPt(const SCEVExpander &Expander, Instruction *User,
+ ArrayRef<const SCEV *> Ops);
/// Return true if the value is known to produce a single fixed value across
/// all iterations on which it executes. Note that this does not imply
@@ -418,13 +419,14 @@ Value *LoopPredication::expandCheck(SCEVExpander &Expander,
return Builder.getFalse();
}
- Value *LHSV = Expander.expandCodeFor(LHS, Ty, findInsertPt(Guard, {LHS}));
- Value *RHSV = Expander.expandCodeFor(RHS, Ty, findInsertPt(Guard, {RHS}));
+ Value *LHSV =
+ Expander.expandCodeFor(LHS, Ty, findInsertPt(Expander, Guard, {LHS}));
+ Value *RHSV =
+ Expander.expandCodeFor(RHS, Ty, findInsertPt(Expander, Guard, {RHS}));
IRBuilder<> Builder(findInsertPt(Guard, {LHSV, RHSV}));
return Builder.CreateICmp(Pred, LHSV, RHSV);
}
-
// Returns true if its safe to truncate the IV to RangeCheckType.
// When the IV type is wider than the range operand type, we can still do loop
// predication, by generating SCEVs for the range and latch that are of the
@@ -516,14 +518,15 @@ Instruction *LoopPredication::findInsertPt(Instruction *Use,
return Preheader->getTerminator();
}
-Instruction *LoopPredication::findInsertPt(Instruction *Use,
- ArrayRef<const SCEV*> Ops) {
+Instruction *LoopPredication::findInsertPt(const SCEVExpander &Expander,
+ Instruction *Use,
+ ArrayRef<const SCEV *> Ops) {
// Subtlety: SCEV considers things to be invariant if the value produced is
// the same across iterations. This is not the same as being able to
// evaluate outside the loop, which is what we actually need here.
for (const SCEV *Op : Ops)
if (!SE->isLoopInvariant(Op, L) ||
- !isSafeToExpandAt(Op, Preheader->getTerminator(), *SE))
+ !Expander.isSafeToExpandAt(Op, Preheader->getTerminator()))
return Use;
return Preheader->getTerminator();
}
@@ -589,8 +592,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
return None;
}
- if (!isSafeToExpandAt(LatchStart, Guard, *SE) ||
- !isSafeToExpandAt(LatchLimit, Guard, *SE)) {
+ if (!Expander.isSafeToExpandAt(LatchStart, Guard) ||
+ !Expander.isSafeToExpandAt(LatchLimit, Guard)) {
LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
return None;
}
@@ -632,8 +635,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
return None;
}
- if (!isSafeToExpandAt(LatchStart, Guard, *SE) ||
- !isSafeToExpandAt(LatchLimit, Guard, *SE)) {
+ if (!Expander.isSafeToExpandAt(LatchStart, Guard) ||
+ !Expander.isSafeToExpandAt(LatchLimit, Guard)) {
LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
return None;
}
@@ -1159,7 +1162,7 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
const SCEV *MinEC = getMinAnalyzeableBackedgeTakenCount(*SE, *DT, L);
if (isa<SCEVCouldNotCompute>(MinEC) || MinEC->getType()->isPointerTy() ||
!SE->isLoopInvariant(MinEC, L) ||
- !isSafeToExpandAt(MinEC, WidenableBR, *SE))
+ !Rewriter.isSafeToExpandAt(MinEC, WidenableBR))
return ChangedLoop;
// Subtlety: We need to avoid inserting additional uses of the WC. We know
@@ -1198,7 +1201,7 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
if (isa<SCEVCouldNotCompute>(ExitCount) ||
ExitCount->getType()->isPointerTy() ||
- !isSafeToExpandAt(ExitCount, WidenableBR, *SE))
+ !Rewriter.isSafeToExpandAt(ExitCount, WidenableBR))
continue;
const bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b7e0e32780b4..083f87436acd 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -576,6 +576,18 @@ public:
return false;
}
+ // TODO: Tokens may breach LCSSA form by default. However, the transform for
+ // dead exit blocks requires LCSSA form to be maintained for all values,
+ // tokens included, otherwise it may break use-def dominance (see PR56243).
+ if (!DeadExitBlocks.empty() && !L.isLCSSAForm(DT, /*IgnoreTokens*/ false)) {
+ assert(L.isLCSSAForm(DT, /*IgnoreTokens*/ true) &&
+ "LCSSA broken not by tokens?");
+ LLVM_DEBUG(dbgs() << "Give up constant terminator folding in loop "
+ << Header->getName()
+ << ": token uses potentially break LCSSA form.\n");
+ return false;
+ }
+
SE.forgetTopmostLoop(&L);
// Dump analysis results.
LLVM_DEBUG(dump());
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 4ef7809c6681..a3434f8bc46d 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1950,6 +1950,7 @@ class LSRInstance {
Loop *const L;
MemorySSAUpdater *MSSAU;
TTI::AddressingModeKind AMK;
+ mutable SCEVExpander Rewriter;
bool Changed = false;
/// This is the insert position that the current loop's induction variable
@@ -1998,7 +1999,7 @@ class LSRInstance {
SmallVectorImpl<ChainUsers> &ChainUsersVec);
void FinalizeChain(IVChain &Chain);
void CollectChains();
- void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+ void GenerateIVChain(const IVChain &Chain,
SmallVectorImpl<WeakTrackingVH> &DeadInsts);
void CollectInterestingTypesAndFactors();
@@ -2068,22 +2069,19 @@ class LSRInstance {
void Solve(SmallVectorImpl<const Formula *> &Solution) const;
BasicBlock::iterator
- HoistInsertPosition(BasicBlock::iterator IP,
- const SmallVectorImpl<Instruction *> &Inputs) const;
- BasicBlock::iterator
- AdjustInsertPositionForExpand(BasicBlock::iterator IP,
- const LSRFixup &LF,
- const LSRUse &LU,
- SCEVExpander &Rewriter) const;
+ HoistInsertPosition(BasicBlock::iterator IP,
+ const SmallVectorImpl<Instruction *> &Inputs) const;
+ BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
+ const LSRFixup &LF,
+ const LSRUse &LU) const;
Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
- BasicBlock::iterator IP, SCEVExpander &Rewriter,
+ BasicBlock::iterator IP,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
- const Formula &F, SCEVExpander &Rewriter,
+ const Formula &F,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
- SCEVExpander &Rewriter,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
@@ -3183,7 +3181,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
/// Generate an add or subtract for each IVInc in a chain to materialize the IV
/// user's operand from the previous IV user's operand.
-void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+void LSRInstance::GenerateIVChain(const IVChain &Chain,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
// Find the new IVOperand for the head of the chain. It may have been replaced
// by LSR.
@@ -3335,7 +3333,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
// x == y --> x - y == 0
const SCEV *N = SE.getSCEV(NV);
- if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE) &&
+ if (SE.isLoopInvariant(N, L) && Rewriter.isSafeToExpand(N) &&
(!NV->getType()->isPointerTy() ||
SE.getPointerBase(N) == SE.getPointerBase(S))) {
// S is normalized, so normalize N before folding it into S
@@ -3343,6 +3341,21 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
Kind = LSRUse::ICmpZero;
S = SE.getMinusSCEV(N, S);
+ } else if (L->isLoopInvariant(NV) &&
+ (!isa<Instruction>(NV) ||
+ DT.dominates(cast<Instruction>(NV), L->getHeader())) &&
+ !NV->getType()->isPointerTy()) {
+ // If we can't generally expand the expression (e.g. it contains
+ // a divide), but it is already at a loop invariant point before the
+ // loop, wrap it in an unknown (to prevent the expander from trying
+ // to re-expand in a potentially unsafe way.) The restriction to
+ // integer types is required because the unknown hides the base, and
+ // SCEV can't compute the difference of two unknown pointers.
+ N = SE.getUnknown(NV);
+ N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
+ Kind = LSRUse::ICmpZero;
+ S = SE.getMinusSCEV(N, S);
+ assert(!isa<SCEVCouldNotCompute>(S));
}
// -1 and the negations of all interesting strides (except the negation
@@ -3385,10 +3398,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
/// Insert a formula for the given expression into the given use, separating out
/// loop-variant portions from loop-invariant and loop-computable portions.
-void
-LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
+void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU,
+ size_t LUIdx) {
// Mark uses whose expressions cannot be expanded.
- if (!isSafeToExpand(S, SE, /*CanonicalMode*/ false))
+ if (!Rewriter.isSafeToExpand(S))
LU.RigidFormula = true;
Formula F;
@@ -5206,11 +5219,8 @@ LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
/// Determine an input position which will be dominated by the operands and
/// which will dominate the result.
-BasicBlock::iterator
-LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
- const LSRFixup &LF,
- const LSRUse &LU,
- SCEVExpander &Rewriter) const {
+BasicBlock::iterator LSRInstance::AdjustInsertPositionForExpand(
+ BasicBlock::iterator LowestIP, const LSRFixup &LF, const LSRUse &LU) const {
// Collect some instructions which must be dominated by the
// expanding replacement. These must be dominated by any operands that
// will be required in the expansion.
@@ -5273,14 +5283,13 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
/// is called "expanding").
Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
const Formula &F, BasicBlock::iterator IP,
- SCEVExpander &Rewriter,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
if (LU.RigidFormula)
return LF.OperandValToReplace;
// Determine an input position which will be dominated by the operands and
// which will dominate the result.
- IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
+ IP = AdjustInsertPositionForExpand(IP, LF, LU);
Rewriter.setInsertPoint(&*IP);
// Inform the Rewriter if we have a post-increment use, so that it can
@@ -5452,7 +5461,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
/// to be expanded in multiple places.
void LSRInstance::RewriteForPHI(
PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
- SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
DenseMap<BasicBlock *, Value *> Inserted;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
@@ -5507,8 +5516,8 @@ void LSRInstance::RewriteForPHI(
if (!Pair.second)
PN->setIncomingValue(i, Pair.first->second);
else {
- Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(),
- Rewriter, DeadInsts);
+ Value *FullV =
+ Expand(LU, LF, F, BB->getTerminator()->getIterator(), DeadInsts);
// If this is reuse-by-noop-cast, insert the noop cast.
Type *OpTy = LF.OperandValToReplace->getType();
@@ -5567,15 +5576,14 @@ void LSRInstance::RewriteForPHI(
/// is called "expanding"), and update the UserInst to reference the newly
/// expanded value.
void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
- const Formula &F, SCEVExpander &Rewriter,
+ const Formula &F,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
// First, find an insertion point that dominates UserInst. For PHI nodes,
// find the nearest block which dominates all the relevant uses.
if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
- RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts);
+ RewriteForPHI(PN, LU, LF, F, DeadInsts);
} else {
- Value *FullV =
- Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
+ Value *FullV = Expand(LU, LF, F, LF.UserInst->getIterator(), DeadInsts);
// If this is reuse-by-noop-cast, insert the noop cast.
Type *OpTy = LF.OperandValToReplace->getType();
@@ -5609,13 +5617,6 @@ void LSRInstance::ImplementSolution(
// we can remove them after we are done working.
SmallVector<WeakTrackingVH, 16> DeadInsts;
- SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), "lsr",
- false);
-#ifndef NDEBUG
- Rewriter.setDebugType(DEBUG_TYPE);
-#endif
- Rewriter.disableCanonicalMode();
- Rewriter.enableLSRMode();
Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
// Mark phi nodes that terminate chains so the expander tries to reuse them.
@@ -5627,12 +5628,12 @@ void LSRInstance::ImplementSolution(
// Expand the new value definitions and update the users.
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
- Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts);
+ Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], DeadInsts);
Changed = true;
}
for (const IVChain &Chain : IVChainVec) {
- GenerateIVChain(Chain, Rewriter, DeadInsts);
+ GenerateIVChain(Chain, DeadInsts);
Changed = true;
}
@@ -5697,8 +5698,10 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
const TargetTransformInfo &TTI, AssumptionCache &AC,
TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
: IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
- MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0 ?
- PreferredAddresingMode : TTI.getPreferredAddressingMode(L, &SE)) {
+ MSSAU(MSSAU), AMK(PreferredAddresingMode.getNumOccurrences() > 0
+ ? PreferredAddresingMode
+ : TTI.getPreferredAddressingMode(L, &SE)),
+ Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), "lsr", false) {
// If LoopSimplify form is not available, stay out of trouble.
if (!L->isLoopSimplifyForm())
return;
@@ -5733,6 +5736,14 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
dbgs() << ":\n");
+ // Configure SCEVExpander already now, so the correct mode is used for
+ // isSafeToExpand() checks.
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+ Rewriter.disableCanonicalMode();
+ Rewriter.enableLSRMode();
+
// First, perform some low-level loop optimizations.
OptimizeShadowIV();
OptimizeLoopTermCond();
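
The LoopStrengthReduce hunks above move the SCEVExpander into LSRInstance as the Rewriter member and configure it (debug type, canonical mode off, LSR mode on) in the constructor, so the isSafeToExpand() checks made while collecting fixups see the same expander configuration as the final rewrite. A minimal sketch of that pattern in plain C++ (illustrative names only, not LLVM code):

    // Sketch: a helper owned and configured by its user up front, so that
    // feasibility checks and the final rewrite consult the same settings.
    #include <iostream>
    #include <vector>

    struct Expander {
      bool LSRMode = false;                 // stands in for enableLSRMode()
      bool isSafeToExpand(int Expr) const {
        // In LSR mode, pretend more expressions become expandable.
        return LSRMode ? Expr >= 0 : Expr > 0;
      }
      int expand(int Expr) const { return Expr * 2; }
    };

    class Planner {
      Expander Rewriter;                    // member, like LSRInstance::Rewriter
    public:
      Planner() { Rewriter.LSRMode = true; }        // configure before any query
      void run(const std::vector<int> &Exprs) {
        for (int E : Exprs)
          if (Rewriter.isSafeToExpand(E))           // same object, same mode...
            std::cout << Rewriter.expand(E) << '\n'; // ...as the expansion
      }
    };

    int main() {
      Planner P;
      P.run({0, 3, -1});
    }

The patch applies the same idea to the real SCEVExpander: construct and configure it once, then let both the legality checks and Expand() consult that single object instead of threading a separately constructed expander through every call.
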
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 75f0896d4845..240fb5e60687 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -142,12 +142,21 @@ XorOpnd::XorOpnd(Value *V) {
isOr = true;
}
+/// Return true if I is an instruction with the FastMathFlags that are needed
+/// for general reassociation set. This is not the same as testing
+/// Instruction::isAssociative() because it includes operations like fsub.
+/// (This routine is only intended to be called for floating-point operations.)
+static bool hasFPAssociativeFlags(Instruction *I) {
+ assert(I && I->getType()->isFPOrFPVectorTy() && "Should only check FP ops");
+ return I->hasAllowReassoc() && I->hasNoSignedZeros();
+}
+
/// Return true if V is an instruction of the specified opcode and if it
/// only has one use.
static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
auto *I = dyn_cast<Instruction>(V);
if (I && I->hasOneUse() && I->getOpcode() == Opcode)
- if (!isa<FPMathOperator>(I) || I->isFast())
+ if (!isa<FPMathOperator>(I) || hasFPAssociativeFlags(I))
return cast<BinaryOperator>(I);
return nullptr;
}
@@ -157,7 +166,7 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
auto *I = dyn_cast<Instruction>(V);
if (I && I->hasOneUse() &&
(I->getOpcode() == Opcode1 || I->getOpcode() == Opcode2))
- if (!isa<FPMathOperator>(I) || I->isFast())
+ if (!isa<FPMathOperator>(I) || hasFPAssociativeFlags(I))
return cast<BinaryOperator>(I);
return nullptr;
}
@@ -449,7 +458,8 @@ using RepeatedValue = std::pair<Value*, APInt>;
/// of the expression) if it can turn them into binary operators of the right
/// type and thus make the expression bigger.
static bool LinearizeExprTree(Instruction *I,
- SmallVectorImpl<RepeatedValue> &Ops) {
+ SmallVectorImpl<RepeatedValue> &Ops,
+ ReassociatePass::OrderedSet &ToRedo) {
assert((isa<UnaryOperator>(I) || isa<BinaryOperator>(I)) &&
"Expected a UnaryOperator or BinaryOperator!");
LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
@@ -572,23 +582,32 @@ static bool LinearizeExprTree(Instruction *I,
assert((!isa<Instruction>(Op) ||
cast<Instruction>(Op)->getOpcode() != Opcode
|| (isa<FPMathOperator>(Op) &&
- !cast<Instruction>(Op)->isFast())) &&
+ !hasFPAssociativeFlags(cast<Instruction>(Op)))) &&
"Should have been handled above!");
assert(Op->hasOneUse() && "Has uses outside the expression tree!");
// If this is a multiply expression, turn any internal negations into
- // multiplies by -1 so they can be reassociated.
- if (Instruction *Tmp = dyn_cast<Instruction>(Op))
- if ((Opcode == Instruction::Mul && match(Tmp, m_Neg(m_Value()))) ||
- (Opcode == Instruction::FMul && match(Tmp, m_FNeg(m_Value())))) {
- LLVM_DEBUG(dbgs()
- << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
- Tmp = LowerNegateToMultiply(Tmp);
- LLVM_DEBUG(dbgs() << *Tmp << '\n');
- Worklist.push_back(std::make_pair(Tmp, Weight));
- Changed = true;
- continue;
+ // multiplies by -1 so they can be reassociated. Add any users of the
+ // newly created multiplication by -1 to the redo list, so any
+ // reassociation opportunities that are exposed will be reassociated
+ // further.
+ Instruction *Neg;
+ if (((Opcode == Instruction::Mul && match(Op, m_Neg(m_Value()))) ||
+ (Opcode == Instruction::FMul && match(Op, m_FNeg(m_Value())))) &&
+ match(Op, m_Instruction(Neg))) {
+ LLVM_DEBUG(dbgs()
+ << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+ Instruction *Mul = LowerNegateToMultiply(Neg);
+ LLVM_DEBUG(dbgs() << *Mul << '\n');
+ Worklist.push_back(std::make_pair(Mul, Weight));
+ for (User *U : Mul->users()) {
+ if (BinaryOperator *UserBO = dyn_cast<BinaryOperator>(U))
+ ToRedo.insert(UserBO);
}
+ ToRedo.insert(Neg);
+ Changed = true;
+ continue;
+ }
// Failed to morph into an expression of the right type. This really is
// a leaf.
@@ -1141,7 +1160,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
return nullptr;
SmallVector<RepeatedValue, 8> Tree;
- MadeChange |= LinearizeExprTree(BO, Tree);
+ MadeChange |= LinearizeExprTree(BO, Tree, RedoInsts);
SmallVector<ValueEntry, 8> Factors;
Factors.reserve(Tree.size());
for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
@@ -2206,8 +2225,9 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
if (Instruction *Res = canonicalizeNegFPConstants(I))
I = Res;
- // Don't optimize floating-point instructions unless they are 'fast'.
- if (I->getType()->isFPOrFPVectorTy() && !I->isFast())
+ // Don't optimize floating-point instructions unless they have the
+ // appropriate FastMathFlags for reassociation enabled.
+ if (I->getType()->isFPOrFPVectorTy() && !hasFPAssociativeFlags(I))
return;
// Do not reassociate boolean (i1) expressions. We want to preserve the
@@ -2320,7 +2340,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
// First, walk the expression tree, linearizing the tree, collecting the
// operand information.
SmallVector<RepeatedValue, 8> Tree;
- MadeChange |= LinearizeExprTree(I, Tree);
+ MadeChange |= LinearizeExprTree(I, Tree, RedoInsts);
SmallVector<ValueEntry, 8> Ops;
Ops.reserve(Tree.size());
for (const RepeatedValue &E : Tree)
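
The Reassociate hunks above replace the blanket isFast() requirement with hasFPAssociativeFlags(), which only asks for the reassoc and nsz flags that reassociation actually relies on. A small plain-C++ sketch of that kind of flag gating (the flag names mirror LLVM's fast-math flags, but the code is illustrative, not the FastMathFlags API):

    // Sketch: require only the specific flags a transform needs, instead of
    // demanding that every fast-math flag is set.
    #include <cstdio>

    enum FMF : unsigned {
      Reassoc = 1 << 0, NoNaNs = 1 << 1, NoInfs = 1 << 2,
      NoSignedZeros = 1 << 3, AllowRecip = 1 << 4, AllowContract = 1 << 5,
      ApproxFunc = 1 << 6,
      Fast = Reassoc | NoNaNs | NoInfs | NoSignedZeros | AllowRecip |
             AllowContract | ApproxFunc
    };

    // Old gate: everything must be set.
    bool isFast(unsigned Flags) { return (Flags & Fast) == Fast; }

    // New gate: only what reassociation actually relies on.
    bool hasFPAssociativeFlags(unsigned Flags) {
      return (Flags & Reassoc) && (Flags & NoSignedZeros);
    }

    int main() {
      unsigned Flags = Reassoc | NoSignedZeros;   // e.g. "reassoc nsz" on an fadd
      std::printf("isFast: %d, associative: %d\n", isFast(Flags),
                  hasFPAssociativeFlags(Flags));  // prints: isFast: 0, associative: 1
    }

The effect is that an fadd/fmul carrying only reassoc and nsz now participates in reassociation, where previously all fast-math flags were required.
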
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 51e4a5773f3e..baf407c5037b 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1702,10 +1702,20 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
auto &Context = Call->getContext();
auto &DL = Call->getModule()->getDataLayout();
auto GetBaseAndOffset = [&](Value *Derived) {
- assert(PointerToBase.count(Derived));
+ Value *Base = nullptr;
+ // Optimizations in unreachable code might substitute the real pointer
+ // with undef, poison, or a null-derived constant. Return a null base for
+ // them to be consistent with the handling in the main algorithm in
+ // findBaseDefiningValue.
+ if (isa<Constant>(Derived))
+ Base =
+ ConstantPointerNull::get(cast<PointerType>(Derived->getType()));
+ else {
+ assert(PointerToBase.count(Derived));
+ Base = PointerToBase.find(Derived)->second;
+ }
unsigned AddressSpace = Derived->getType()->getPointerAddressSpace();
unsigned IntPtrSize = DL.getPointerSizeInBits(AddressSpace);
- Value *Base = PointerToBase.find(Derived)->second;
Value *Base_int = Builder.CreatePtrToInt(
Base, Type::getIntNTy(Context, IntPtrSize));
Value *Derived_int = Builder.CreatePtrToInt(
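
The RewriteStatepointsForGC hunk above stops asserting that every derived pointer has an entry in PointerToBase and instead hands constant (undef/poison/null-derived) pointers a null base. A plain-C++ sketch of that lookup-with-fallback shape (illustrative types and names only, not the pass's data structures):

    // Sketch: degenerate inputs get a fixed sentinel base instead of
    // requiring an entry in the base map.
    #include <cassert>
    #include <iostream>
    #include <map>
    #include <string>

    std::string getBase(const std::map<std::string, std::string> &PointerToBase,
                        const std::string &Derived, bool DerivedIsConstant) {
      if (DerivedIsConstant)    // e.g. undef/poison left over in unreachable code
        return "null";          // consistent sentinel base
      auto It = PointerToBase.find(Derived);
      assert(It != PointerToBase.end() && "every other pointer must have a base");
      return It->second;
    }

    int main() {
      std::map<std::string, std::string> PointerToBase = {{"gep.p", "p"}};
      std::cout << getBase(PointerToBase, "gep.p", false) << '\n';  // p
      std::cout << getBase(PointerToBase, "undef", true) << '\n';   // null
    }
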
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index 008ddfc72740..5ab9e25577d8 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -111,8 +111,6 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeLoopLoadEliminationPass(Registry);
initializeLoopSimplifyCFGLegacyPassPass(Registry);
initializeLoopVersioningLegacyPassPass(Registry);
- initializeEntryExitInstrumenterPass(Registry);
- initializePostInlineEntryExitInstrumenterPass(Registry);
}
void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index f6525ad7de9b..0b797abefe20 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -68,11 +68,6 @@ static cl::opt<bool>
cl::desc("Allow relaxed uniform region checks"),
cl::init(true));
-static cl::opt<unsigned>
- ReorderNodeSize("structurizecfg-node-reorder-size",
- cl::desc("Limit region size for reordering nodes"),
- cl::init(100), cl::Hidden);
-
// Definition of the complex types used in this pass.
using BBValuePair = std::pair<BasicBlock *, Value *>;
@@ -267,8 +262,6 @@ class StructurizeCFG {
void orderNodes();
- void reorderNodes();
-
void analyzeLoops(RegionNode *N);
Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
@@ -427,57 +420,6 @@ void StructurizeCFG::orderNodes() {
}
}
-/// Change the node ordering to decrease the range of live values, especially
-/// the values that capture the control flow path for branches. We do this
-/// by moving blocks with a single predecessor and successor to appear after
-/// predecessor. The motivation is to move some loop exit blocks into a loop.
-/// In cases where a loop has a large number of exit blocks, this reduces the
-/// amount of values needed across the loop boundary.
-void StructurizeCFG::reorderNodes() {
- SmallVector<RegionNode *, 8> NewOrder;
- DenseMap<BasicBlock *, unsigned> MoveTo;
- BitVector Moved(Order.size());
-
- // The benefits of reordering nodes occurs for large regions.
- if (Order.size() <= ReorderNodeSize)
- return;
-
- // The algorithm works with two passes over Order. The first pass identifies
- // the blocks to move and the position to move them to. The second pass
- // creates the new order based upon this information. We move blocks with
- // a single predecessor and successor. If there are multiple candidates then
- // maintain the original order.
- BBSet Seen;
- for (int I = Order.size() - 1; I >= 0; --I) {
- auto *BB = Order[I]->getEntry();
- Seen.insert(BB);
- auto *Pred = BB->getSinglePredecessor();
- auto *Succ = BB->getSingleSuccessor();
- // Consider only those basic blocks that have a predecessor in Order and a
- // successor that exits the region. The region may contain subregions that
- // have been structurized and are not included in Order.
- if (Pred && Succ && Seen.count(Pred) && Succ == ParentRegion->getExit() &&
- !MoveTo.count(Pred)) {
- MoveTo[Pred] = I;
- Moved.set(I);
- }
- }
-
- // If no blocks have been moved then the original order is good.
- if (!Moved.count())
- return;
-
- for (size_t I = 0, E = Order.size(); I < E; ++I) {
- auto *BB = Order[I]->getEntry();
- if (MoveTo.count(BB))
- NewOrder.push_back(Order[MoveTo[BB]]);
- if (!Moved[I])
- NewOrder.push_back(Order[I]);
- }
-
- Order.assign(NewOrder);
-}
-
/// Determine the end of the loops
void StructurizeCFG::analyzeLoops(RegionNode *N) {
if (N->isSubRegion()) {
@@ -1139,7 +1081,6 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
ParentRegion = R;
orderNodes();
- reorderNodes();
collectInfos();
createFlow();
insertConditions(false);
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 079b2fc973b9..e3cb5f359e34 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -80,7 +80,7 @@ void llvm::detachDeadBlocks(
// contained within it must dominate their uses, that all uses will
// eventually be removed (they are themselves dead).
if (!I.use_empty())
- I.replaceAllUsesWith(UndefValue::get(I.getType()));
+ I.replaceAllUsesWith(PoisonValue::get(I.getType()));
BB->getInstList().pop_back();
}
new UnreachableInst(BB->getContext(), BB);
@@ -188,8 +188,10 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
// Don't break self-loops.
if (PredBB == BB) return false;
- // Don't break unwinding instructions.
- if (PredBB->getTerminator()->isExceptionalTerminator())
+
+ // Don't break unwinding instructions or terminators with other side-effects.
+ Instruction *PTI = PredBB->getTerminator();
+ if (PTI->isExceptionalTerminator() || PTI->mayHaveSideEffects())
return false;
// Can't merge if there are multiple distinct successors.
@@ -202,7 +204,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
BasicBlock *NewSucc = nullptr;
unsigned FallThruPath;
if (PredecessorWithTwoSuccessors) {
- if (!(PredBB_BI = dyn_cast<BranchInst>(PredBB->getTerminator())))
+ if (!(PredBB_BI = dyn_cast<BranchInst>(PTI)))
return false;
BranchInst *BB_JmpI = dyn_cast<BranchInst>(BB->getTerminator());
if (!BB_JmpI || !BB_JmpI->isUnconditional())
@@ -256,7 +258,6 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
Updates.push_back({DominatorTree::Delete, PredBB, BB});
}
- Instruction *PTI = PredBB->getTerminator();
Instruction *STI = BB->getTerminator();
Instruction *Start = &*BB->begin();
// If there's nothing to move, mark the starting instruction as the last
@@ -1141,7 +1142,7 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
if (Preds.empty()) {
// Insert dummy values as the incoming value.
for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
- cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
+ cast<PHINode>(I)->addIncoming(PoisonValue::get(I->getType()), NewBB);
}
// Update DominatorTree, LoopInfo, and LCCSA analysis information.
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index c4a58f36c171..e25ec74a0572 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -270,9 +270,6 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
bool Changed = false;
- if(!isLibFreeFunction(&F, TheLibFunc) && !isReallocLikeFn(&F, &TLI))
- Changed |= setDoesNotFreeMemory(F);
-
if (F.getParent() != nullptr && F.getParent()->getRtLibUseGOT())
Changed |= setNonLazyBind(F);
@@ -285,14 +282,14 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_strchr:
case LibFunc_strrchr:
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- return Changed;
+ break;
case LibFunc_strtol:
case LibFunc_strtod:
case LibFunc_strtof:
@@ -304,7 +301,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_strcat:
case LibFunc_strncat:
Changed |= setOnlyAccessesArgMemory(F);
@@ -315,7 +312,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setOnlyReadsMemory(F, 1);
Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotAlias(F, 1);
- return Changed;
+ break;
case LibFunc_strcpy:
case LibFunc_strncpy:
Changed |= setReturnedArg(F, 0);
@@ -330,14 +327,14 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setOnlyReadsMemory(F, 1);
Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotAlias(F, 1);
- return Changed;
+ break;
case LibFunc_strxfrm:
Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_strcmp: // 0,1
case LibFunc_strspn: // 0,1
case LibFunc_strncmp: // 0,1
@@ -348,7 +345,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_strcoll:
case LibFunc_strcasecmp: // 0,1
case LibFunc_strncasecmp: //
@@ -359,7 +356,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_strstr:
case LibFunc_strpbrk:
Changed |= setOnlyAccessesArgMemory(F);
@@ -367,26 +364,26 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_strtok:
case LibFunc_strtok_r:
Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_scanf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_setbuf:
case LibFunc_setvbuf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_strndup:
Changed |= setArgNoUndef(F, 1);
LLVM_FALLTHROUGH;
@@ -398,7 +395,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_stat:
case LibFunc_statvfs:
Changed |= setRetAndArgsNoUndef(F);
@@ -406,7 +403,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_sscanf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -414,7 +411,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_sprintf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -423,7 +420,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setOnlyWritesMemory(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_snprintf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -432,7 +429,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setOnlyWritesMemory(F, 0);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
+ break;
case LibFunc_setitimer:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -440,13 +437,13 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 1);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_system:
// May throw; "system" is a valid pthread cancellation point.
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_aligned_alloc:
Changed |= setAlignedAllocParam(F, 0);
Changed |= setAllocSize(F, 1, None);
@@ -464,7 +461,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotThrow(F);
Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
+ break;
case LibFunc_memcmp:
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setOnlyReadsMemory(F);
@@ -472,21 +469,21 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_memchr:
case LibFunc_memrchr:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setOnlyReadsMemory(F);
Changed |= setWillReturn(F);
- return Changed;
+ break;
case LibFunc_modf:
case LibFunc_modff:
case LibFunc_modfl:
Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_memcpy:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
@@ -497,7 +494,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotAlias(F, 1);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_memmove:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
@@ -506,7 +503,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setOnlyWritesMemory(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_mempcpy:
case LibFunc_memccpy:
Changed |= setWillReturn(F);
@@ -519,7 +516,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotAlias(F, 1);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_memalign:
Changed |= setAllocFamily(F, "malloc");
Changed |= setAllocKind(F, AllocFnKind::Alloc | AllocFnKind::Aligned |
@@ -531,19 +528,19 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotThrow(F);
Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
+ break;
case LibFunc_mkdir:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_mktime:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_realloc:
case LibFunc_reallocf:
case LibFunc_vec_realloc:
@@ -559,17 +556,17 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setArgNoUndef(F, 1);
- return Changed;
+ break;
case LibFunc_read:
// May throw; "read" is a valid pthread cancellation point.
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_rewind:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_rmdir:
case LibFunc_remove:
case LibFunc_realpath:
@@ -577,7 +574,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_rename:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -585,20 +582,20 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_readlink:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_write:
// May throw; "write" is a valid pthread cancellation point.
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_bcopy:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
@@ -607,7 +604,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyWritesMemory(F, 1);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_bcmp:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
@@ -615,14 +612,14 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_bzero:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- return Changed;
+ break;
case LibFunc_calloc:
case LibFunc_vec_calloc:
Changed |= setAllocFamily(F, TheLibFunc == LibFunc_vec_calloc ? "vec_malloc"
@@ -634,21 +631,21 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotThrow(F);
Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
+ break;
case LibFunc_chmod:
case LibFunc_chown:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_ctermid:
case LibFunc_clearerr:
case LibFunc_closedir:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_atoi:
case LibFunc_atol:
case LibFunc_atof:
@@ -657,13 +654,13 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setOnlyReadsMemory(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_access:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_fopen:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -672,19 +669,19 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_fdopen:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setRetDoesNotAlias(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_feof:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_free:
case LibFunc_vec_free:
Changed |= setAllocFamily(F, TheLibFunc == LibFunc_vec_free ? "vec_malloc"
@@ -696,7 +693,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_fseek:
case LibFunc_ftell:
case LibFunc_fgetc:
@@ -713,45 +710,45 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_ferror:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F);
- return Changed;
+ break;
case LibFunc_fputc:
case LibFunc_fputc_unlocked:
case LibFunc_fstat:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_frexp:
case LibFunc_frexpf:
case LibFunc_frexpl:
Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_fstatvfs:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_fgets:
case LibFunc_fgets_unlocked:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 2);
- return Changed;
+ break;
case LibFunc_fread:
case LibFunc_fread_unlocked:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 3);
- return Changed;
+ break;
case LibFunc_fwrite:
case LibFunc_fwrite_unlocked:
Changed |= setRetAndArgsNoUndef(F);
@@ -759,7 +756,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 3);
// FIXME: readonly #1?
- return Changed;
+ break;
case LibFunc_fputs:
case LibFunc_fputs_unlocked:
Changed |= setRetAndArgsNoUndef(F);
@@ -767,7 +764,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_fscanf:
case LibFunc_fprintf:
Changed |= setRetAndArgsNoUndef(F);
@@ -775,73 +772,73 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_fgetpos:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_getc:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_getlogin_r:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_getc_unlocked:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_getenv:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setOnlyReadsMemory(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_gets:
case LibFunc_getchar:
case LibFunc_getchar_unlocked:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
- return Changed;
+ break;
case LibFunc_getitimer:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_getpwnam:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_ungetc:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_uname:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_unlink:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_unsetenv:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_utime:
case LibFunc_utimes:
Changed |= setRetAndArgsNoUndef(F);
@@ -850,13 +847,13 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_putc:
case LibFunc_putc_unlocked:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_puts:
case LibFunc_printf:
case LibFunc_perror:
@@ -864,23 +861,23 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_pread:
// May throw; "pread" is a valid pthread cancellation point.
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_pwrite:
// May throw; "pwrite" is a valid pthread cancellation point.
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_putchar:
case LibFunc_putchar_unlocked:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
- return Changed;
+ break;
case LibFunc_popen:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -889,18 +886,18 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_pclose:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_vscanf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_vsscanf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -908,20 +905,20 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_vfscanf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_vprintf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_vfprintf:
case LibFunc_vsprintf:
Changed |= setRetAndArgsNoUndef(F);
@@ -929,63 +926,63 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_vsnprintf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 2);
Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
+ break;
case LibFunc_open:
// May throw; "open" is a valid pthread cancellation point.
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_opendir:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setRetDoesNotAlias(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_tmpfile:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setRetDoesNotAlias(F);
- return Changed;
+ break;
case LibFunc_times:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_htonl:
case LibFunc_htons:
case LibFunc_ntohl:
case LibFunc_ntohs:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotAccessMemory(F);
- return Changed;
+ break;
case LibFunc_lstat:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_lchown:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_qsort:
// May throw; places call through function pointer.
// Cannot give undef pointer/size
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotCapture(F, 3);
- return Changed;
+ break;
case LibFunc_dunder_strndup:
Changed |= setArgNoUndef(F, 1);
LLVM_FALLTHROUGH;
@@ -995,28 +992,28 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_dunder_strtok_r:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_under_IO_getc:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_under_IO_putc:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_dunder_isoc99_scanf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_stat64:
case LibFunc_lstat64:
case LibFunc_statvfs64:
@@ -1025,7 +1022,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_dunder_isoc99_sscanf:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -1033,7 +1030,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_fopen64:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
@@ -1042,30 +1039,30 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ break;
case LibFunc_fseeko64:
case LibFunc_ftello64:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ break;
case LibFunc_tmpfile64:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setRetDoesNotAlias(F);
- return Changed;
+ break;
case LibFunc_fstat64:
case LibFunc_fstatvfs64:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_open64:
// May throw; "open" is a valid pthread cancellation point.
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
+ break;
case LibFunc_gettimeofday:
// Currently some platforms have the restrict keyword on the arguments to
// gettimeofday. To be conservative, do not add noalias to gettimeofday's
@@ -1074,7 +1071,7 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
Changed |= setDoesNotCapture(F, 1);
- return Changed;
+ break;
case LibFunc_memset_pattern4:
case LibFunc_memset_pattern8:
case LibFunc_memset_pattern16:
@@ -1089,18 +1086,18 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setOnlyWritesMemory(F, 0);
Changed |= setDoesNotThrow(F);
- return Changed;
+ break;
// int __nvvm_reflect(const char *)
case LibFunc_nvvm_reflect:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotAccessMemory(F);
Changed |= setDoesNotThrow(F);
- return Changed;
+ break;
case LibFunc_ldexp:
case LibFunc_ldexpf:
case LibFunc_ldexpl:
Changed |= setWillReturn(F);
- return Changed;
+ break;
case LibFunc_abs:
case LibFunc_acos:
case LibFunc_acosf:
@@ -1227,12 +1224,17 @@ bool llvm::inferNonMandatoryLibFuncAttrs(Function &F,
Changed |= setDoesNotFreeMemory(F);
Changed |= setOnlyWritesMemory(F);
Changed |= setWillReturn(F);
- return Changed;
+ break;
default:
// FIXME: It'd be really nice to cover all the library functions we're
// aware of here.
- return false;
+ break;
}
+ // We have to do this step after AllocKind has been inferred on functions so
+ // we can reliably identify free-like and realloc-like functions.
+ if (!isLibFreeFunction(&F, TheLibFunc) && !isReallocLikeFn(&F, &TLI))
+ Changed |= setDoesNotFreeMemory(F);
+ return Changed;
}
static void setArgExtAttr(Function &F, unsigned ArgNo,
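
Nearly every case in inferNonMandatoryLibFuncAttrs above switches from "return Changed;" to "break;" so that one final inference step (marking functions that are not free-like or realloc-like as not freeing memory) runs after the switch, once the allocation-kind attributes it depends on have been set. A minimal sketch of that control-flow pattern in plain C++ (illustrative names only, not the LLVM attribute API):

    // Sketch: replace per-case early returns with break so a post-switch step,
    // which depends on what the cases did, always runs.
    #include <iostream>
    #include <set>
    #include <string>

    bool inferAttrs(const std::string &Func, std::set<std::string> &Attrs) {
      bool Changed = false;
      switch (Func.front()) {          // stand-in for the LibFunc dispatch
      case 'm':                        // e.g. a malloc-like function
        Changed |= Attrs.insert("allockind(alloc)").second;
        break;                         // was: return Changed;
      case 'f':                        // e.g. a free-like function
        Changed |= Attrs.insert("allockind(free)").second;
        break;
      default:
        break;
      }
      // Runs after every case and can rely on the attributes inferred above.
      if (!Attrs.count("allockind(free)"))
        Changed |= Attrs.insert("nofree").second;
      return Changed;
    }

    int main() {
      std::set<std::string> A, B;
      inferAttrs("malloc", A);
      inferAttrs("free", B);
      std::cout << A.count("nofree") << ' ' << B.count("nofree") << '\n'; // 1 0
    }

With early returns, the shared step would silently be skipped for every recognized function, which is exactly what the trailing comment in the patch is guarding against.
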
diff --git a/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp b/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp
index f229d4bf14e9..9101a1e41f7b 100644
--- a/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp
+++ b/llvm/lib/Transforms/Utils/CanonicalizeAliases.cpp
@@ -65,23 +65,6 @@ static bool canonicalizeAliases(Module &M) {
canonicalizeAlias(&GA, Changed);
return Changed;
}
-
-// Legacy pass that canonicalizes aliases.
-class CanonicalizeAliasesLegacyPass : public ModulePass {
-
-public:
- /// Pass identification, replacement for typeid
- static char ID;
-
- /// Specify pass name for debug output
- StringRef getPassName() const override { return "Canonicalize Aliases"; }
-
- explicit CanonicalizeAliasesLegacyPass() : ModulePass(ID) {}
-
- bool runOnModule(Module &M) override { return canonicalizeAliases(M); }
-};
-char CanonicalizeAliasesLegacyPass::ID = 0;
-
} // anonymous namespace
PreservedAnalyses CanonicalizeAliasesPass::run(Module &M,
@@ -91,14 +74,3 @@ PreservedAnalyses CanonicalizeAliasesPass::run(Module &M,
return PreservedAnalyses::none();
}
-
-INITIALIZE_PASS_BEGIN(CanonicalizeAliasesLegacyPass, "canonicalize-aliases",
- "Canonicalize aliases", false, false)
-INITIALIZE_PASS_END(CanonicalizeAliasesLegacyPass, "canonicalize-aliases",
- "Canonicalize aliases", false, false)
-
-namespace llvm {
-ModulePass *createCanonicalizeAliasesPass() {
- return new CanonicalizeAliasesLegacyPass();
-}
-} // namespace llvm
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 8f053cd56e0e..1d348213bfdb 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -206,9 +206,20 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
};
// Avoid cloning types, compile units, and (other) subprograms.
- for (DISubprogram *ISP : DIFinder->subprograms())
- if (ISP != SPClonedWithinModule)
+ SmallPtrSet<const DISubprogram *, 16> MappedToSelfSPs;
+ for (DISubprogram *ISP : DIFinder->subprograms()) {
+ if (ISP != SPClonedWithinModule) {
mapToSelfIfNew(ISP);
+ MappedToSelfSPs.insert(ISP);
+ }
+ }
+
+ // If a subprogram isn't going to be cloned, skip its lexical blocks as well.
+ for (DIScope *S : DIFinder->scopes()) {
+ auto *LScope = dyn_cast<DILocalScope>(S);
+ if (LScope && MappedToSelfSPs.count(LScope->getSubprogram()))
+ mapToSelfIfNew(S);
+ }
for (DICompileUnit *CU : DIFinder->compile_units())
mapToSelfIfNew(CU);
@@ -723,14 +734,14 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
}
// If the loops above have made these phi nodes have 0 or 1 operand,
- // replace them with undef or the input value. We must do this for
+ // replace them with poison or the input value. We must do this for
// correctness, because 0-operand phis are not valid.
PN = cast<PHINode>(NewBB->begin());
if (PN->getNumIncomingValues() == 0) {
BasicBlock::iterator I = NewBB->begin();
BasicBlock::const_iterator OldI = OldBB->begin();
while ((PN = dyn_cast<PHINode>(I++))) {
- Value *NV = UndefValue::get(PN->getType());
+ Value *NV = PoisonValue::get(PN->getType());
PN->replaceAllUsesWith(NV);
assert(VMap[&*OldI] == PN && "VMap mismatch");
VMap[&*OldI] = NV;
diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index e3e8f63383df..60f910bceab8 100644
--- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -117,65 +117,6 @@ static bool runOnFunction(Function &F, bool PostInlining) {
return Changed;
}
-namespace {
-struct EntryExitInstrumenter : public FunctionPass {
- static char ID;
- EntryExitInstrumenter() : FunctionPass(ID) {
- initializeEntryExitInstrumenterPass(*PassRegistry::getPassRegistry());
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
- bool runOnFunction(Function &F) override { return ::runOnFunction(F, false); }
-};
-char EntryExitInstrumenter::ID = 0;
-
-struct PostInlineEntryExitInstrumenter : public FunctionPass {
- static char ID;
- PostInlineEntryExitInstrumenter() : FunctionPass(ID) {
- initializePostInlineEntryExitInstrumenterPass(
- *PassRegistry::getPassRegistry());
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
- bool runOnFunction(Function &F) override { return ::runOnFunction(F, true); }
-};
-char PostInlineEntryExitInstrumenter::ID = 0;
-}
-
-INITIALIZE_PASS_BEGIN(
- EntryExitInstrumenter, "ee-instrument",
- "Instrument function entry/exit with calls to e.g. mcount() (pre inlining)",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(
- EntryExitInstrumenter, "ee-instrument",
- "Instrument function entry/exit with calls to e.g. mcount() (pre inlining)",
- false, false)
-
-INITIALIZE_PASS_BEGIN(
- PostInlineEntryExitInstrumenter, "post-inline-ee-instrument",
- "Instrument function entry/exit with calls to e.g. mcount() "
- "(post inlining)",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(
- PostInlineEntryExitInstrumenter, "post-inline-ee-instrument",
- "Instrument function entry/exit with calls to e.g. mcount() "
- "(post inlining)",
- false, false)
-
-FunctionPass *llvm::createEntryExitInstrumenterPass() {
- return new EntryExitInstrumenter();
-}
-
-FunctionPass *llvm::createPostInlineEntryExitInstrumenterPass() {
- return new PostInlineEntryExitInstrumenter();
-}
-
PreservedAnalyses
llvm::EntryExitInstrumenterPass::run(Function &F, FunctionAnalysisManager &AM) {
runOnFunction(F, PostInlining);
diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp
index 7b8d8553bac2..7509fde6df9d 100644
--- a/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -301,9 +301,9 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
LLVM_DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
- if (!SI->isSimple()) {
- LLVM_DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
- return false; // no volatile/atomic accesses.
+ if (SI->isVolatile()) {
+ LLVM_DEBUG(dbgs() << "Store is volatile! Can not evaluate.\n");
+ return false; // no volatile accesses.
}
Constant *Ptr = getVal(SI->getOperand(1));
Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI);
@@ -337,10 +337,10 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
if (!Res.first->second.write(Val, Offset, DL))
return false;
} else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
- if (!LI->isSimple()) {
+ if (LI->isVolatile()) {
LLVM_DEBUG(
- dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
- return false; // no volatile/atomic accesses.
+ dbgs() << "Found a Load! Volatile load, can not evaluate.\n");
+ return false; // no volatile accesses.
}
Constant *Ptr = getVal(LI->getOperand(0));
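
The Evaluator hunks above relax the check from isSimple(), which rejects both volatile and atomic accesses, to isVolatile(), so unordered atomic loads and stores of constants can now be evaluated. A tiny plain-C++ sketch of the predicate narrowing (illustrative, not the LLVM instruction API):

    // Sketch: narrow a rejection predicate from "volatile or atomic" to
    // "volatile only".
    #include <iostream>

    struct Access { bool Volatile; bool Atomic; };

    bool rejectedBefore(const Access &A) { return A.Volatile || A.Atomic; } // !isSimple()
    bool rejectedNow(const Access &A) { return A.Volatile; }                // isVolatile()

    int main() {
      Access UnorderedAtomicLoad{false, true};
      std::cout << rejectedBefore(UnorderedAtomicLoad) << ' '
                << rejectedNow(UnorderedAtomicLoad) << '\n';  // 1 0
    }
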
diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
index 193806d9cc87..8e6d4626c9fd 100644
--- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -35,6 +35,13 @@ bool FunctionImportGlobalProcessing::doImportAsDefinition(
bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal(
const GlobalValue *SGV, ValueInfo VI) {
assert(SGV->hasLocalLinkage());
+
+ // Ifuncs and ifunc aliases do not have a summary.
+ if (isa<GlobalIFunc>(SGV) ||
+ (isa<GlobalAlias>(SGV) &&
+ isa<GlobalIFunc>(cast<GlobalAlias>(SGV)->getAliaseeObject())))
+ return false;
+
// Both the imported references and the original local variable must
// be promoted.
if (!isPerformingImport() && !isModuleExporting())
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 2fb00f95b749..00387ec426bf 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -2194,9 +2194,11 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
CI->setTailCallKind(ChildTCK);
InlinedMustTailCalls |= CI->isMustTailCall();
- // Calls inlined through a 'nounwind' call site should be marked
- // 'nounwind'.
- if (MarkNoUnwind)
+ // Call sites inlined through a 'nounwind' call site should be
+ // 'nounwind' as well. However, avoid marking call sites explicitly
+ // where possible. This helps expose more opportunities for CSE after
+ // inlining, commonly when the callee is an intrinsic.
+ if (MarkNoUnwind && !CI->doesNotThrow())
CI->setDoesNotThrow();
}
}
@@ -2625,7 +2627,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
} else if (!CB.use_empty()) {
// No returns, but something is using the return value of the call. Just
// nuke the result.
- CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
+ CB.replaceAllUsesWith(PoisonValue::get(CB.getType()));
}
// Since we are now done with the Call/Invoke, we can delete it.
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index b203259db1c6..2f1d0c2f9012 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -439,6 +439,10 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
return true;
}
+ if (auto *CB = dyn_cast<CallBase>(I))
+ if (isRemovableAlloc(CB, TLI))
+ return true;
+
if (!I->willReturn())
return false;
@@ -489,16 +493,13 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
}
}
- if (isAllocationFn(I, TLI) && isAllocRemovable(cast<CallBase>(I), TLI))
- return true;
-
- if (CallInst *CI = isFreeCall(I, TLI))
- if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
- return C->isNullValue() || isa<UndefValue>(C);
-
- if (auto *Call = dyn_cast<CallBase>(I))
+ if (auto *Call = dyn_cast<CallBase>(I)) {
+ if (Value *FreedOp = getFreedOperand(Call, TLI))
+ if (Constant *C = dyn_cast<Constant>(FreedOp))
+ return C->isNullValue() || isa<UndefValue>(C);
if (isMathLibCallNoop(Call, TLI))
return true;
+ }
// Non-volatile atomic loads from constants can be removed.
if (auto *LI = dyn_cast<LoadInst>(I))
@@ -637,7 +638,7 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,
// won't prove fruitful.
if (!Visited.insert(I).second) {
// Break the cycle and delete the instruction and its operands.
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ I->replaceAllUsesWith(PoisonValue::get(I->getType()));
(void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU);
return true;
}
@@ -750,8 +751,8 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB,
// If BB has single-entry PHI nodes, fold them.
while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
Value *NewVal = PN->getIncomingValue(0);
- // Replace self referencing PHI with undef, it must be dead.
- if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
+ // Replace self-referencing PHI with poison; it must be dead.
+ if (NewVal == PN) NewVal = PoisonValue::get(PN->getType());
PN->replaceAllUsesWith(NewVal);
PN->eraseFromParent();
}
@@ -2105,7 +2106,7 @@ llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
// Delete the next to last instruction.
Instruction *Inst = &*--EndInst->getIterator();
if (!Inst->use_empty() && !Inst->getType()->isTokenTy())
- Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+ Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType()));
if (Inst->isEHPad() || Inst->getType()->isTokenTy()) {
EndInst = Inst;
continue;
@@ -2144,7 +2145,7 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA,
BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end();
while (BBI != BBE) {
if (!BBI->use_empty())
- BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+ BBI->replaceAllUsesWith(PoisonValue::get(BBI->getType()));
BB->getInstList().erase(BBI++);
++NumInstrsRemoved;
}
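
The Local.cpp hunk above folds the free-call handling into the CallBase path via getFreedOperand and keeps treating a call whose freed operand is a null (or undef) constant as trivially dead, since freeing a null pointer is defined to do nothing. A short plain-C++ illustration of that fact (names are illustrative):

    // Sketch: free(nullptr) is a no-op by the C/C++ standard, so a free call
    // whose operand is a known null constant does no observable work and can
    // be deleted as trivially dead.
    #include <cstdlib>

    bool isTriviallyDeadFree(void *FreedOperand) {
      return FreedOperand == nullptr;  // nothing to release
    }

    int main() {
      void *P = std::malloc(16);
      std::free(P);        // real work: must stay
      std::free(nullptr);  // no-op: a compiler may delete this call
      return isTriviallyDeadFree(nullptr) ? 0 : 1;
    }
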
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 0f33559c7e70..597c88ad13df 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -622,7 +622,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// We only need to split loop exit edges.
Loop *PredLoop = LI->getLoopFor(ExitPred);
if (!PredLoop || PredLoop->contains(Exit) ||
- ExitPred->getTerminator()->isIndirectTerminator())
+ isa<IndirectBrInst>(ExitPred->getTerminator()))
continue;
SplitLatchEdge |= L->getLoopLatch() == ExitPred;
BasicBlock *ExitSplit = SplitCriticalEdge(
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 55d5c733733b..2ff8a3f7b228 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -127,7 +127,7 @@ BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT,
// If the loop is branched to from an indirect terminator, we won't
// be able to fully transform the loop, because it prohibits
// edge splitting.
- if (P->getTerminator()->isIndirectTerminator())
+ if (isa<IndirectBrInst>(P->getTerminator()))
return nullptr;
// Keep track of it.
@@ -256,7 +256,7 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
if (PN->getIncomingValue(i) != PN ||
!L->contains(PN->getIncomingBlock(i))) {
// We can't split indirect control flow edges.
- if (PN->getIncomingBlock(i)->getTerminator()->isIndirectTerminator())
+ if (isa<IndirectBrInst>(PN->getIncomingBlock(i)->getTerminator()))
return nullptr;
OuterLoopPreds.push_back(PN->getIncomingBlock(i));
}
@@ -375,7 +375,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
std::vector<BasicBlock*> BackedgeBlocks;
for (BasicBlock *P : predecessors(Header)) {
// Indirect edges cannot be split, so we must fail if we find one.
- if (P->getTerminator()->isIndirectTerminator())
+ if (isa<IndirectBrInst>(P->getTerminator()))
return nullptr;
if (P != Preheader) BackedgeBlocks.push_back(P);
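These LoopSimplify hunks (like the LoopRotationUtils one above) replace the removed Instruction::isIndirectTerminator() predicate with an explicit isa<IndirectBrInst> check on the predecessor's terminator. A sketch of the check in isolation, assuming LLVM headers; the helper name is illustrative only:

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // An indirectbr terminator means the outgoing edges cannot be split,
  // so the loop transform has to bail out for this predecessor.
  static bool hasUnsplittableIndirectBranch(const BasicBlock *Pred) {
    return isa<IndirectBrInst>(Pred->getTerminator());
  }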
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 82f993b4ceab..349063dd5e89 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -602,10 +602,10 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
// loop will already be eliminated and we have less work to do, but according
// to the API doc of User::dropAllReferences the only valid operation after
// dropping references is deletion. So let's substitute all usages of
- // instruction from the loop with undef value of corresponding type first.
+ // instruction from the loop with poison value of corresponding type first.
for (auto *Block : L->blocks())
for (Instruction &I : *Block) {
- auto *Undef = UndefValue::get(I.getType());
+ auto *Poison = PoisonValue::get(I.getType());
for (Use &U : llvm::make_early_inc_range(I.uses())) {
if (auto *Usr = dyn_cast<Instruction>(U.getUser()))
if (L->contains(Usr->getParent()))
@@ -615,7 +615,7 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
if (DT)
assert(!DT->isReachableFromEntry(U) &&
"Unexpected user in reachable block");
- U.set(Undef);
+ U.set(Poison);
}
auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I);
if (!DVI)
@@ -1357,7 +1357,7 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
if (isa<SCEVCouldNotCompute>(ExitValue) ||
!SE->isLoopInvariant(ExitValue, L) ||
- !isSafeToExpand(ExitValue, *SE)) {
+ !Rewriter.isSafeToExpand(ExitValue)) {
// TODO: This should probably be sunk into SCEV in some way; maybe a
// getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for
// most SCEV expressions and other recurrence types (e.g. shift
@@ -1370,7 +1370,7 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE);
if (isa<SCEVCouldNotCompute>(ExitValue) ||
!SE->isLoopInvariant(ExitValue, L) ||
- !isSafeToExpand(ExitValue, *SE))
+ !Rewriter.isSafeToExpand(ExitValue))
continue;
}
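Both rewriteLoopExitValues call sites now ask the expander itself whether an expression is safe to expand; the free functions isSafeToExpand/isSafeToExpandAt become SCEVExpander members, as the ScalarEvolutionExpander.cpp diff further below shows. A sketch of the new call-site shape, assuming LLVM headers and an expander instance already in hand:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
  using namespace llvm;

  // Illustrative only: the safety query is a method on the expander that
  // will perform the expansion, not a free function taking SE.
  static bool canExpandExitValue(SCEVExpander &Rewriter, const SCEV *ExitValue) {
    return Rewriter.isSafeToExpand(ExitValue);
  }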
diff --git a/llvm/lib/Transforms/Utils/LowerAtomic.cpp b/llvm/lib/Transforms/Utils/LowerAtomic.cpp
index 9914a5ca6c5e..2247b8107739 100644
--- a/llvm/lib/Transforms/Utils/LowerAtomic.cpp
+++ b/llvm/lib/Transforms/Utils/LowerAtomic.cpp
@@ -31,7 +31,7 @@ bool llvm::lowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
Value *Res = Builder.CreateSelect(Equal, Val, Orig);
Builder.CreateStore(Res, Ptr);
- Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0);
+ Res = Builder.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0);
Res = Builder.CreateInsertValue(Res, Equal, 1);
CXI->replaceAllUsesWith(Res);
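For context, this cmpxchg lowering rewrites the atomic into a plain load/compare/select/store sequence and then rebuilds the instruction's {old value, success flag} aggregate result; the change above only swaps the aggregate's starting value from undef to poison. A condensed sketch, assuming LLVM headers and ignoring ordering and volatility details:

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // B is assumed to be positioned at CXI.
  static void lowerCmpXchgSketch(AtomicCmpXchgInst *CXI, IRBuilder<> &B) {
    Value *Ptr = CXI->getPointerOperand();
    Value *Cmp = CXI->getCompareOperand();
    Value *Val = CXI->getNewValOperand();
    Value *Orig = B.CreateLoad(Val->getType(), Ptr);
    Value *Equal = B.CreateICmpEQ(Orig, Cmp);
    B.CreateStore(B.CreateSelect(Equal, Val, Orig), Ptr);
    // The {old value, success} result now starts from poison, not undef.
    Value *Res = B.CreateInsertValue(PoisonValue::get(CXI->getType()), Orig, 0);
    Res = B.CreateInsertValue(Res, Equal, 1);
    CXI->replaceAllUsesWith(Res);
  }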
diff --git a/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp b/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp
index deaee467531d..d4ab4504064f 100644
--- a/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp
+++ b/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp
@@ -81,26 +81,6 @@ bool llvm::nameUnamedGlobals(Module &M) {
return Changed;
}
-namespace {
-
-// Legacy pass that provides a name to every anon globals.
-class NameAnonGlobalLegacyPass : public ModulePass {
-
-public:
- /// Pass identification, replacement for typeid
- static char ID;
-
- /// Specify pass name for debug output
- StringRef getPassName() const override { return "Name Anon Globals"; }
-
- explicit NameAnonGlobalLegacyPass() : ModulePass(ID) {}
-
- bool runOnModule(Module &M) override { return nameUnamedGlobals(M); }
-};
-char NameAnonGlobalLegacyPass::ID = 0;
-
-} // anonymous namespace
-
PreservedAnalyses NameAnonGlobalPass::run(Module &M,
ModuleAnalysisManager &AM) {
if (!nameUnamedGlobals(M))
@@ -108,14 +88,3 @@ PreservedAnalyses NameAnonGlobalPass::run(Module &M,
return PreservedAnalyses::none();
}
-
-INITIALIZE_PASS_BEGIN(NameAnonGlobalLegacyPass, "name-anon-globals",
- "Provide a name to nameless globals", false, false)
-INITIALIZE_PASS_END(NameAnonGlobalLegacyPass, "name-anon-globals",
- "Provide a name to nameless globals", false, false)
-
-namespace llvm {
-ModulePass *createNameAnonGlobalPass() {
- return new NameAnonGlobalLegacyPass();
-}
-}
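With the legacy wrapper deleted, the functionality stays reachable only through the new pass manager (or directly via the retained nameUnamedGlobals() utility). A sketch of scheduling it, assuming the standard new-PM headers:

  #include "llvm/IR/PassManager.h"
  #include "llvm/Transforms/Utils/NameAnonGlobals.h"
  using namespace llvm;

  // Illustrative scheduling of the remaining new-PM pass.
  static void addNameAnonGlobals(ModulePassManager &MPM) {
    MPM.addPass(NameAnonGlobalPass());
  }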
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 0c8bf3827256..372cd74ea01d 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -2568,9 +2568,7 @@ namespace {
// only needed when the expression includes some subexpression that is not IV
// derived.
//
-// Currently, we only allow division by a nonzero constant here. If this is
-// inadequate, we could easily allow division by SCEVUnknown by using
-// ValueTracking to check isKnownNonZero().
+// Currently, we only allow division by a value provably non-zero here.
//
// We cannot generally expand recurrences unless the step dominates the loop
// header. The expander handles the special case of affine recurrences by
@@ -2588,8 +2586,7 @@ struct SCEVFindUnsafe {
bool follow(const SCEV *S) {
if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
- const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
- if (!SC || SC->getValue()->isZero()) {
+ if (!SE.isKnownNonZero(D->getRHS())) {
IsUnsafe = true;
return false;
}
@@ -2613,18 +2610,17 @@ struct SCEVFindUnsafe {
}
bool isDone() const { return IsUnsafe; }
};
-}
+} // namespace
-namespace llvm {
-bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE, bool CanonicalMode) {
+bool SCEVExpander::isSafeToExpand(const SCEV *S) const {
SCEVFindUnsafe Search(SE, CanonicalMode);
visitAll(S, Search);
return !Search.IsUnsafe;
}
-bool isSafeToExpandAt(const SCEV *S, const Instruction *InsertionPoint,
- ScalarEvolution &SE) {
- if (!isSafeToExpand(S, SE))
+bool SCEVExpander::isSafeToExpandAt(const SCEV *S,
+ const Instruction *InsertionPoint) const {
+ if (!isSafeToExpand(S))
return false;
// We have to prove that the expanded site of S dominates InsertionPoint.
// This is easy when not in the same block, but hard when S is an instruction
@@ -2674,4 +2670,3 @@ void SCEVExpanderCleaner::cleanup() {
I->eraseFromParent();
}
}
-}
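The SCEVFindUnsafe change widens the divisor test from "non-zero constant" to "provably non-zero", matching the updated comment. A sketch of just that test, assuming the ScalarEvolution headers:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  using namespace llvm;

  // Any divisor SCEV that ScalarEvolution can prove non-zero is acceptable,
  // not just non-zero constants.
  static bool udivIsSafeToExpand(const SCEVUDivExpr *D, ScalarEvolution &SE) {
    return SE.isKnownNonZero(D->getRHS());
  }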
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 4b5ade99767b..1806081678a8 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -4851,7 +4851,7 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI, DomTreeUpdater *DTU) {
PN.moveBefore(InsertPt);
// Also, add a dummy incoming value for the original BB itself,
// so that the PHI is well-formed until we drop said predecessor.
- PN.addIncoming(UndefValue::get(PN.getType()), BB);
+ PN.addIncoming(PoisonValue::get(PN.getType()), BB);
}
}
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index af15e0c31b75..0ab79a32f526 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -593,7 +593,7 @@ bool SimplifyIndvar::eliminateTrunc(TruncInst *TI) {
}
// Trunc no longer needed.
- TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+ TI->replaceAllUsesWith(PoisonValue::get(TI->getType()));
DeadInsts.emplace_back(TI);
return true;
}
@@ -660,7 +660,7 @@ bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) {
auto *IP = GetLoopInvariantInsertPosition(L, I);
- if (!isSafeToExpandAt(S, IP, *SE)) {
+ if (!Rewriter.isSafeToExpandAt(S, IP)) {
LLVM_DEBUG(dbgs() << "INDVARS: Can not replace IV user: " << *I
<< " with non-speculable loop invariant: " << *S << '\n');
return false;
@@ -679,20 +679,30 @@ bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) {
/// Eliminate redundant type cast between integer and float.
bool SimplifyIndvar::replaceFloatIVWithIntegerIV(Instruction *UseInst) {
- if (UseInst->getOpcode() != CastInst::SIToFP)
+ if (UseInst->getOpcode() != CastInst::SIToFP &&
+ UseInst->getOpcode() != CastInst::UIToFP)
return false;
Value *IVOperand = UseInst->getOperand(0);
// Get the symbolic expression for this instruction.
- ConstantRange IVRange = SE->getSignedRange(SE->getSCEV(IVOperand));
+ const SCEV *IV = SE->getSCEV(IVOperand);
+ unsigned MaskBits;
+ if (UseInst->getOpcode() == CastInst::SIToFP)
+ MaskBits = SE->getSignedRange(IV).getMinSignedBits();
+ else
+ MaskBits = SE->getUnsignedRange(IV).getActiveBits();
unsigned DestNumSigBits = UseInst->getType()->getFPMantissaWidth();
- if (IVRange.getActiveBits() <= DestNumSigBits) {
+ if (MaskBits <= DestNumSigBits) {
for (User *U : UseInst->users()) {
- // Match for fptosi of sitofp and with same type.
- auto *CI = dyn_cast<FPToSIInst>(U);
+ // Match for fptosi/fptoui of sitofp and with same type.
+ auto *CI = dyn_cast<CastInst>(U);
if (!CI || IVOperand->getType() != CI->getType())
continue;
+ CastInst::CastOps Opcode = CI->getOpcode();
+ if (Opcode != CastInst::FPToSI && Opcode != CastInst::FPToUI)
+ continue;
+
CI->replaceAllUsesWith(IVOperand);
DeadInsts.push_back(CI);
LLVM_DEBUG(dbgs() << "INDVARS: Replace IV user: " << *CI
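The replaceFloatIVWithIntegerIV hunk above generalizes the sitofp-only handling to uitofp, picking the bit-width test from the matching signed or unsigned range. A sketch of that test on its own, assuming LLVM headers; the helper name is only for illustration:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/IR/InstrTypes.h"
  using namespace llvm;

  // The int-to-FP cast is redundant when the IV's range fits in the
  // destination's mantissa: signed range for sitofp, unsigned for uitofp.
  static bool intToFPCastIsLossless(CastInst *UseInst, ScalarEvolution *SE) {
    const SCEV *IV = SE->getSCEV(UseInst->getOperand(0));
    unsigned MaskBits = UseInst->getOpcode() == CastInst::SIToFP
                            ? SE->getSignedRange(IV).getMinSignedBits()
                            : SE->getUnsignedRange(IV).getActiveBits();
    unsigned DestNumSigBits = UseInst->getType()->getFPMantissaWidth();
    return MaskBits <= DestNumSigBits;
  }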
@@ -1015,7 +1025,7 @@ class WidenIV {
SmallPtrSet<Instruction *,16> Widened;
- enum ExtendKind { ZeroExtended, SignExtended, Unknown };
+ enum class ExtendKind { Zero, Sign, Unknown };
// A map tracking the kind of extension used to widen each narrow IV
// and narrow IV user.
@@ -1172,7 +1182,7 @@ WidenIV::WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, ScalarEvolution *SEv,
HasGuards(HasGuards), UsePostIncrementRanges(UsePostIncrementRanges),
DeadInsts(DI) {
assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV");
- ExtendKindMap[OrigPhi] = WI.IsSigned ? SignExtended : ZeroExtended;
+ ExtendKindMap[OrigPhi] = WI.IsSigned ? ExtendKind::Sign : ExtendKind::Zero;
}
Value *WidenIV::createExtendInst(Value *NarrowOper, Type *WideType,
@@ -1225,7 +1235,7 @@ Instruction *WidenIV::cloneBitwiseIVUser(WidenIV::NarrowIVDefUse DU) {
// about the narrow operand yet so must insert a [sz]ext. It is probably loop
// invariant and will be folded or hoisted. If it actually comes from a
// widened IV, it should be removed during a future call to widenIVUse.
- bool IsSigned = getExtendKind(NarrowDef) == SignExtended;
+ bool IsSigned = getExtendKind(NarrowDef) == ExtendKind::Sign;
Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
? WideDef
: createExtendInst(NarrowUse->getOperand(0), WideType,
@@ -1290,7 +1300,7 @@ Instruction *WidenIV::cloneArithmeticIVUser(WidenIV::NarrowIVDefUse DU,
return WideUse == WideAR;
};
- bool SignExtend = getExtendKind(NarrowDef) == SignExtended;
+ bool SignExtend = getExtendKind(NarrowDef) == ExtendKind::Sign;
if (!GuessNonIVOperand(SignExtend)) {
SignExtend = !SignExtend;
if (!GuessNonIVOperand(SignExtend))
@@ -1350,7 +1360,7 @@ WidenIV::getExtendedOperandRecurrence(WidenIV::NarrowIVDefUse DU) {
// Only Add/Sub/Mul instructions supported yet.
if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
OpCode != Instruction::Mul)
- return {nullptr, Unknown};
+ return {nullptr, ExtendKind::Unknown};
// One operand (NarrowDef) has already been extended to WideDef. Now determine
// if extending the other will lead to a recurrence.
@@ -1362,14 +1372,14 @@ WidenIV::getExtendedOperandRecurrence(WidenIV::NarrowIVDefUse DU) {
const OverflowingBinaryOperator *OBO =
cast<OverflowingBinaryOperator>(DU.NarrowUse);
ExtendKind ExtKind = getExtendKind(DU.NarrowDef);
- if (ExtKind == SignExtended && OBO->hasNoSignedWrap())
+ if (ExtKind == ExtendKind::Sign && OBO->hasNoSignedWrap())
ExtendOperExpr = SE->getSignExtendExpr(
SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
- else if(ExtKind == ZeroExtended && OBO->hasNoUnsignedWrap())
+ else if (ExtKind == ExtendKind::Zero && OBO->hasNoUnsignedWrap())
ExtendOperExpr = SE->getZeroExtendExpr(
SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
else
- return {nullptr, Unknown};
+ return {nullptr, ExtendKind::Unknown};
// When creating this SCEV expr, don't apply the current operations NSW or NUW
// flags. This instruction may be guarded by control flow that the no-wrap
@@ -1387,7 +1397,7 @@ WidenIV::getExtendedOperandRecurrence(WidenIV::NarrowIVDefUse DU) {
dyn_cast<SCEVAddRecExpr>(getSCEVByOpCode(lhs, rhs, OpCode));
if (!AddRec || AddRec->getLoop() != L)
- return {nullptr, Unknown};
+ return {nullptr, ExtendKind::Unknown};
return {AddRec, ExtKind};
}
@@ -1396,17 +1406,17 @@ WidenIV::getExtendedOperandRecurrence(WidenIV::NarrowIVDefUse DU) {
/// widening its type? In other words, can the extend be safely hoisted out of
/// the loop with SCEV reducing the value to a recurrence on the same loop. If
/// so, return the extended recurrence and the kind of extension used. Otherwise
-/// return {nullptr, Unknown}.
+/// return {nullptr, ExtendKind::Unknown}.
WidenIV::WidenedRecTy WidenIV::getWideRecurrence(WidenIV::NarrowIVDefUse DU) {
if (!DU.NarrowUse->getType()->isIntegerTy())
- return {nullptr, Unknown};
+ return {nullptr, ExtendKind::Unknown};
const SCEV *NarrowExpr = SE->getSCEV(DU.NarrowUse);
if (SE->getTypeSizeInBits(NarrowExpr->getType()) >=
SE->getTypeSizeInBits(WideType)) {
// NarrowUse implicitly widens its operand. e.g. a gep with a narrow
// index. So don't follow this use.
- return {nullptr, Unknown};
+ return {nullptr, ExtendKind::Unknown};
}
const SCEV *WideExpr;
@@ -1414,21 +1424,21 @@ WidenIV::WidenedRecTy WidenIV::getWideRecurrence(WidenIV::NarrowIVDefUse DU) {
if (DU.NeverNegative) {
WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
if (isa<SCEVAddRecExpr>(WideExpr))
- ExtKind = SignExtended;
+ ExtKind = ExtendKind::Sign;
else {
WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
- ExtKind = ZeroExtended;
+ ExtKind = ExtendKind::Zero;
}
- } else if (getExtendKind(DU.NarrowDef) == SignExtended) {
+ } else if (getExtendKind(DU.NarrowDef) == ExtendKind::Sign) {
WideExpr = SE->getSignExtendExpr(NarrowExpr, WideType);
- ExtKind = SignExtended;
+ ExtKind = ExtendKind::Sign;
} else {
WideExpr = SE->getZeroExtendExpr(NarrowExpr, WideType);
- ExtKind = ZeroExtended;
+ ExtKind = ExtendKind::Zero;
}
const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(WideExpr);
if (!AddRec || AddRec->getLoop() != L)
- return {nullptr, Unknown};
+ return {nullptr, ExtendKind::Unknown};
return {AddRec, ExtKind};
}
@@ -1468,7 +1478,7 @@ bool WidenIV::widenLoopCompare(WidenIV::NarrowIVDefUse DU) {
//
// (A) == icmp slt i32 sext(%narrow), sext(%val)
// == icmp slt i32 zext(%narrow), sext(%val)
- bool IsSigned = getExtendKind(DU.NarrowDef) == SignExtended;
+ bool IsSigned = getExtendKind(DU.NarrowDef) == ExtendKind::Sign;
if (!(DU.NeverNegative || IsSigned == Cmp->isSigned()))
return false;
@@ -1533,8 +1543,8 @@ bool WidenIV::widenWithVariantUse(WidenIV::NarrowIVDefUse DU) {
const OverflowingBinaryOperator *OBO =
cast<OverflowingBinaryOperator>(NarrowUse);
ExtendKind ExtKind = getExtendKind(NarrowDef);
- bool CanSignExtend = ExtKind == SignExtended && OBO->hasNoSignedWrap();
- bool CanZeroExtend = ExtKind == ZeroExtended && OBO->hasNoUnsignedWrap();
+ bool CanSignExtend = ExtKind == ExtendKind::Sign && OBO->hasNoSignedWrap();
+ bool CanZeroExtend = ExtKind == ExtendKind::Zero && OBO->hasNoUnsignedWrap();
auto AnotherOpExtKind = ExtKind;
// Check that all uses are either:
@@ -1564,14 +1574,14 @@ bool WidenIV::widenWithVariantUse(WidenIV::NarrowIVDefUse DU) {
// predicates. For equality, it's legal to widen icmp for either sign or
// zero extend. For sign extend, we can also do so for signed predicates,
// likewise for zero extend we can widen icmp for unsigned predicates.
- if (ExtKind == ZeroExtended && ICmpInst::isSigned(Pred))
+ if (ExtKind == ExtendKind::Zero && ICmpInst::isSigned(Pred))
return false;
- if (ExtKind == SignExtended && ICmpInst::isUnsigned(Pred))
+ if (ExtKind == ExtendKind::Sign && ICmpInst::isUnsigned(Pred))
return false;
ICmpUsers.push_back(ICmp);
continue;
}
- if (ExtKind == SignExtended)
+ if (ExtKind == ExtendKind::Sign)
User = dyn_cast<SExtInst>(User);
else
User = dyn_cast<ZExtInst>(User);
@@ -1594,7 +1604,7 @@ bool WidenIV::widenWithVariantUse(WidenIV::NarrowIVDefUse DU) {
// will most likely not see it. Let's try to prove it.
if (OpCode != Instruction::Add)
return false;
- if (ExtKind != ZeroExtended)
+ if (ExtKind != ExtendKind::Zero)
return false;
const SCEV *LHS = SE->getSCEV(OBO->getOperand(0));
const SCEV *RHS = SE->getSCEV(OBO->getOperand(1));
@@ -1609,7 +1619,7 @@ bool WidenIV::widenWithVariantUse(WidenIV::NarrowIVDefUse DU) {
return false;
// In fact, our 'add' is 'sub nuw'. We will need to widen the 2nd operand as
// neg(zext(neg(op))), which is basically sext(op).
- AnotherOpExtKind = SignExtended;
+ AnotherOpExtKind = ExtendKind::Sign;
}
// Verifying that Defining operand is an AddRec
@@ -1621,14 +1631,16 @@ bool WidenIV::widenWithVariantUse(WidenIV::NarrowIVDefUse DU) {
LLVM_DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
// Generating a widening use instruction.
- Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
- ? WideDef
- : createExtendInst(NarrowUse->getOperand(0), WideType,
- AnotherOpExtKind, NarrowUse);
- Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
- ? WideDef
- : createExtendInst(NarrowUse->getOperand(1), WideType,
- AnotherOpExtKind, NarrowUse);
+ Value *LHS =
+ (NarrowUse->getOperand(0) == NarrowDef)
+ ? WideDef
+ : createExtendInst(NarrowUse->getOperand(0), WideType,
+ AnotherOpExtKind == ExtendKind::Sign, NarrowUse);
+ Value *RHS =
+ (NarrowUse->getOperand(1) == NarrowDef)
+ ? WideDef
+ : createExtendInst(NarrowUse->getOperand(1), WideType,
+ AnotherOpExtKind == ExtendKind::Sign, NarrowUse);
auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
@@ -1667,7 +1679,7 @@ bool WidenIV::widenWithVariantUse(WidenIV::NarrowIVDefUse DU) {
auto ExtendedOp = [&](Value * V)->Value * {
if (V == NarrowUse)
return WideBO;
- if (ExtKind == ZeroExtended)
+ if (ExtKind == ExtendKind::Zero)
return Builder.CreateZExt(V, WideBO->getType());
else
return Builder.CreateSExt(V, WideBO->getType());
@@ -1723,10 +1735,10 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, SCEVExpander &Rewri
// This narrow use can be widened by a sext if it's non-negative or its narrow
// def was widened by a sext. Same for zext.
auto canWidenBySExt = [&]() {
- return DU.NeverNegative || getExtendKind(DU.NarrowDef) == SignExtended;
+ return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Sign;
};
auto canWidenByZExt = [&]() {
- return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ZeroExtended;
+ return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Zero;
};
// Our raison d'etre! Eliminate sign and zero extension.
@@ -1774,7 +1786,8 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, SCEVExpander &Rewri
if (!WideAddRec.first)
WideAddRec = getWideRecurrence(DU);
- assert((WideAddRec.first == nullptr) == (WideAddRec.second == Unknown));
+ assert((WideAddRec.first == nullptr) ==
+ (WideAddRec.second == ExtendKind::Unknown));
if (!WideAddRec.first) {
// If use is a loop condition, try to promote the condition instead of
// truncating the IV first.
@@ -1869,7 +1882,7 @@ PHINode *WidenIV::createWideIV(SCEVExpander &Rewriter) {
return nullptr;
// Widen the induction variable expression.
- const SCEV *WideIVExpr = getExtendKind(OrigPhi) == SignExtended
+ const SCEV *WideIVExpr = getExtendKind(OrigPhi) == ExtendKind::Sign
? SE->getSignExtendExpr(AddRec, WideType)
: SE->getZeroExtendExpr(AddRec, WideType);
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index b359717424a6..bca3b0538c5d 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1948,14 +1948,16 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) {
if (Value *Sqrt = replacePowWithSqrt(Pow, B))
return Sqrt;
+ // If we can approximate pow:
// pow(x, n) -> powi(x, n) * sqrt(x) if n has exactly a 0.5 fraction
+ // pow(x, n) -> powi(x, n) if n is a constant signed integer value
const APFloat *ExpoF;
- if (match(Expo, m_APFloat(ExpoF)) && !ExpoF->isExactlyValue(0.5) &&
- !ExpoF->isExactlyValue(-0.5)) {
+ if (AllowApprox && match(Expo, m_APFloat(ExpoF)) &&
+ !ExpoF->isExactlyValue(0.5) && !ExpoF->isExactlyValue(-0.5)) {
APFloat ExpoA(abs(*ExpoF));
APFloat ExpoI(*ExpoF);
Value *Sqrt = nullptr;
- if (AllowApprox && !ExpoA.isInteger()) {
+ if (!ExpoA.isInteger()) {
APFloat Expo2 = ExpoA;
// To check if ExpoA is an integer + 0.5, we add it to itself. If there
// is no floating point exception and the result is an integer, then
@@ -1979,7 +1981,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) {
return nullptr;
}
- // pow(x, n) -> powi(x, n) if n is a constant signed integer value
+ // The 0.5-fraction case is handled above (only when approximation is allowed).
+ // Do pow -> powi for the remaining integer exponents.
APSInt IntExpo(TLI->getIntSize(), /*isUnsigned=*/false);
if (ExpoF->isInteger() &&
ExpoF->convertToInteger(IntExpo, APFloat::rmTowardZero, &Ignored) ==
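The optimizePow change moves the entire 0.5-fraction handling behind the approximation check, so both the powi*sqrt split and the plain powi rewrite for constant exponents now require AllowApprox. A small standalone numeric illustration of the identity being exploited (deliberately plain C++ with libm calls, not LLVM code):

  #include <cmath>
  #include <cstdio>

  // pow(x, n + 0.5) == powi(x, n) * sqrt(x); std::pow with an integer
  // exponent stands in for powi here.
  int main() {
    double X = 1.7;
    double Direct = std::pow(X, 3.5);
    double Split = std::pow(X, 3.0) * std::sqrt(X);
    std::printf("%.17g vs %.17g\n", Direct, Split);
    return 0;
  }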
diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
index 832353741500..9bbfe06b9abb 100644
--- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -145,8 +145,6 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
// locate the exit blocks.
SetVector<BasicBlock *> ExitingBlocks;
SetVector<BasicBlock *> Exits;
- // Record the exit blocks that branch to the same block.
- MapVector<BasicBlock *, SetVector<BasicBlock *> > CommonSuccs;
// We need SetVectors, but the Loop API takes a vector, so we use a temporary.
SmallVector<BasicBlock *, 8> Temp;
@@ -160,11 +158,6 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
if (SL == L || L->contains(SL))
continue;
Exits.insert(S);
- // The typical case for reducing the number of guard blocks occurs when
- // the exit block has a single predecessor and successor.
- if (S->getSinglePredecessor())
- if (auto *Succ = S->getSingleSuccessor())
- CommonSuccs[Succ].insert(S);
}
}
@@ -179,39 +172,13 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
for (auto EB : ExitingBlocks) {
dbgs() << " " << EB->getName();
}
- dbgs() << "\n";
-
- dbgs() << "Exit blocks with a common successor:\n";
- for (auto CS : CommonSuccs) {
- dbgs() << " Succ " << CS.first->getName() << ", exits:";
- for (auto Exit : CS.second)
- dbgs() << " " << Exit->getName();
- dbgs() << "\n";
- });
+ dbgs() << "\n";);
if (Exits.size() <= 1) {
LLVM_DEBUG(dbgs() << "loop does not have multiple exits; nothing to do\n");
return false;
}
- // When multiple exit blocks branch to the same block, change the control
- // flow hub to after the exit blocks rather than before. This reduces the
- // number of guard blocks needed after the loop.
- for (auto CS : CommonSuccs) {
- auto CB = CS.first;
- auto Preds = CS.second;
- if (Exits.contains(CB))
- continue;
- if (Preds.size() < 2 || Preds.size() == Exits.size())
- continue;
- for (auto Exit : Preds) {
- Exits.remove(Exit);
- ExitingBlocks.remove(Exit->getSinglePredecessor());
- ExitingBlocks.insert(Exit);
- }
- Exits.insert(CB);
- }
-
SmallVector<BasicBlock *, 8> GuardBlocks;
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
auto LoopExitBlock = CreateControlFlowHub(&DTU, GuardBlocks, ExitingBlocks,
@@ -231,17 +198,6 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
if (auto ParentLoop = L->getParentLoop()) {
for (auto G : GuardBlocks) {
ParentLoop->addBasicBlockToLoop(G, LI);
- // Ensure the guard block predecessors are in a valid loop. After the
- // change to the control flow hub for common successors, a guard block
- // predecessor may not be in a loop or may be in an outer loop.
- for (auto Pred : predecessors(G)) {
- auto PredLoop = LI.getLoopFor(Pred);
- if (!ParentLoop->contains(PredLoop)) {
- if (PredLoop)
- LI.removeBlock(Pred);
- ParentLoop->addBasicBlockToLoop(Pred, LI);
- }
- }
}
ParentLoop->verifyLoop();
}
diff --git a/llvm/lib/Transforms/Utils/Utils.cpp b/llvm/lib/Transforms/Utils/Utils.cpp
index f34f2df971b1..d002922cfd30 100644
--- a/llvm/lib/Transforms/Utils/Utils.cpp
+++ b/llvm/lib/Transforms/Utils/Utils.cpp
@@ -28,7 +28,6 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
initializeAssumeSimplifyPassLegacyPassPass(Registry);
initializeAssumeBuilderPassLegacyPassPass(Registry);
initializeBreakCriticalEdgesPass(Registry);
- initializeCanonicalizeAliasesLegacyPassPass(Registry);
initializeCanonicalizeFreezeInLoopsPass(Registry);
initializeInstNamerPass(Registry);
initializeLCSSAWrapperPassPass(Registry);
@@ -37,7 +36,6 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
initializeLowerGlobalDtorsLegacyPassPass(Registry);
initializeLowerInvokeLegacyPassPass(Registry);
initializeLowerSwitchLegacyPassPass(Registry);
- initializeNameAnonGlobalLegacyPassPass(Registry);
initializePromoteLegacyPassPass(Registry);
initializeStripNonLineTableDebugLegacyPassPass(Registry);
initializeUnifyFunctionExitNodesLegacyPassPass(Registry);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0777a1385916..b887ea41676b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -92,6 +92,7 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -473,7 +474,7 @@ public:
virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
/// Widen a single call instruction within the innermost loop.
- void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
+ void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
VPTransformState &State);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
@@ -1447,15 +1448,14 @@ public:
// through scalar predication or masked load/store or masked gather/scatter.
// \p VF is the vectorization factor that will be used to vectorize \p I.
// Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I, ElementCount VF,
- bool IsKnownUniform = false) {
- // When we know the load is uniform and the original scalar loop was not
- // predicated we don't need to mark it as a predicated instruction. Any
- // vectorised blocks created when tail-folding are something artificial we
- // have introduced and we know there is always at least one active lane.
- // That's why we call Legal->blockNeedsPredication here because it doesn't
- // query tail-folding.
- if (IsKnownUniform && isa<LoadInst>(I) &&
+ bool isPredicatedInst(Instruction *I, ElementCount VF) {
+ // When we know the load's address is loop invariant and the instruction
+ // in the original scalar loop was unconditionally executed then we
+ // don't need to mark it as a predicated instruction. Tail folding may
+ // introduce additional predication, but we're guaranteed to always have
+ // at least one active lane. We call Legal->blockNeedsPredication here
+ // because it doesn't query tail-folding.
+ if (Legal->isUniformMemOp(*I) && isa<LoadInst>(I) &&
!Legal->blockNeedsPredication(I->getParent()))
return false;
if (!blockNeedsPredicationForAnyReason(I->getParent()))
@@ -1657,10 +1657,6 @@ private:
InstructionCost getScalarizationOverhead(Instruction *I,
ElementCount VF) const;
- /// Returns whether the instruction is a load or store and will be a emitted
- /// as a vector operation.
- bool isConsecutiveLoadOrStore(Instruction *I);
-
/// Returns true if an artificially high cost for emulated masked memrefs
/// should be used.
bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
@@ -1919,10 +1915,13 @@ public:
auto DiffChecks = RtPtrChecking.getDiffChecks();
if (DiffChecks) {
+ Value *RuntimeVF = nullptr;
MemRuntimeCheckCond = addDiffRuntimeChecks(
MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
- [VF](IRBuilderBase &B, unsigned Bits) {
- return getRuntimeVF(B, B.getIntNTy(Bits), VF);
+ [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
+ if (!RuntimeVF)
+ RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
+ return RuntimeVF;
},
IC);
} else {
@@ -2947,11 +2946,17 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
// If tail is to be folded, vector loop takes care of all iterations.
Type *CountTy = Count->getType();
Value *CheckMinIters = Builder.getFalse();
- auto CreateStep = [&]() {
+ auto CreateStep = [&]() -> Value * {
// Create step with max(MinProTripCount, UF * VF).
- if (UF * VF.getKnownMinValue() < MinProfitableTripCount.getKnownMinValue())
- return createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
- return createStepForVF(Builder, CountTy, VF, UF);
+ if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
+ return createStepForVF(Builder, CountTy, VF, UF);
+
+ Value *MinProfTC =
+ createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
+ if (!VF.isScalable())
+ return MinProfTC;
+ return Builder.CreateBinaryIntrinsic(
+ Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
};
if (!Cost->foldTailByMasking())
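The reworked CreateStep lambda returns the VF * UF step when it already meets the profitability bound; otherwise, a fixed VF can return MinProfitableTripCount directly, while a scalable VF must take the maximum at run time. A sketch of that runtime clamp, assuming LLVM headers; the names are illustrative:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  using namespace llvm;

  // With a scalable VF the step VF * UF is not a compile-time constant, so
  // the clamp against MinProfitableTripCount is emitted as a umax intrinsic.
  static Value *clampStep(IRBuilderBase &B, Value *MinProfTC, Value *VFxUF) {
    return B.CreateBinaryIntrinsic(Intrinsic::umax, MinProfTC, VFxUF);
  }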
@@ -4168,46 +4173,26 @@ bool InnerLoopVectorizer::useOrderedReductions(
return Cost->useOrderedReductions(RdxDesc);
}
-/// A helper function for checking whether an integer division-related
-/// instruction may divide by zero (in which case it must be predicated if
-/// executed conditionally in the scalar code).
-/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
-/// Non-zero divisors that are non compile-time constants will not be
-/// converted into multiplication, so we will still end up scalarizing
-/// the division, but can do so w/o predication.
-static bool mayDivideByZero(Instruction &I) {
- assert((I.getOpcode() == Instruction::UDiv ||
- I.getOpcode() == Instruction::SDiv ||
- I.getOpcode() == Instruction::URem ||
- I.getOpcode() == Instruction::SRem) &&
- "Unexpected instruction");
- Value *Divisor = I.getOperand(1);
- auto *CInt = dyn_cast<ConstantInt>(Divisor);
- return !CInt || CInt->isZero();
-}
-
-void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
+void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
VPUser &ArgOperands,
VPTransformState &State) {
- assert(!isa<DbgInfoIntrinsic>(I) &&
+ assert(!isa<DbgInfoIntrinsic>(CI) &&
"DbgInfoIntrinsic should have been dropped during VPlan construction");
- State.setDebugLocFromInst(&I);
-
- Module *M = I.getParent()->getParent()->getParent();
- auto *CI = cast<CallInst>(&I);
+ State.setDebugLocFromInst(&CI);
SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->args())
+ for (Value *ArgOperand : CI.args())
Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);
// The flag shows whether we use Intrinsic or a usual Call for vectorized
// version of the instruction.
// Is it beneficial to perform intrinsic call compared to lib call?
bool NeedToScalarize = false;
- InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
- InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
+ InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize);
+ InstructionCost IntrinsicCost =
+ ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
assert((UseVectorIntrinsic || !NeedToScalarize) &&
"Instruction should be scalarized elsewhere.");
@@ -4215,7 +4200,7 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
"Either the intrinsic cost or vector call cost must be valid");
for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Type *, 2> TysForDecl = {CI->getType()};
+ SmallVector<Type *, 2> TysForDecl = {CI.getType()};
SmallVector<Value *, 4> Args;
for (auto &I : enumerate(ArgOperands.operands())) {
// Some intrinsics have a scalar argument - don't replace it with a
@@ -4235,27 +4220,28 @@ void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
if (UseVectorIntrinsic) {
// Use vector version of the intrinsic.
if (VF.isVector())
- TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+ TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);
+ Module *M = State.Builder.GetInsertBlock()->getModule();
VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
assert(VectorF && "Can't retrieve vector intrinsic.");
} else {
// Use vector version of the function call.
- const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
+ const VFShape Shape = VFShape::get(CI, VF, false /*HasGlobalPred*/);
#ifndef NDEBUG
- assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
+ assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
"Can't create vector function.");
#endif
- VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
+ VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
}
SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
+ CI.getOperandBundlesAsDefs(OpBundles);
CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
if (isa<FPMathOperator>(V))
- V->copyFastMathFlags(CI);
+ V->copyFastMathFlags(&CI);
State.set(Def, V, Part);
- State.addMetadata(V, &I);
+ State.addMetadata(V, &CI);
}
}
@@ -4470,7 +4456,9 @@ bool LoopVectorizationCostModel::isScalarWithPredication(
case Instruction::SDiv:
case Instruction::SRem:
case Instruction::URem:
- return mayDivideByZero(*I);
+ // TODO: We can use the loop preheader as the context point here and get
+ // context-sensitive reasoning.
+ return !isSafeToSpeculativelyExecute(I);
}
return false;
}
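Here the bespoke mayDivideByZero() helper is replaced with the generic speculation query from ValueTracking, which handles division by zero and related undefined-behavior cases conservatively. A sketch of the new test, assuming LLVM headers:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // A div/rem needs predication exactly when it cannot be speculated,
  // e.g. when its divisor is not known to be non-zero.
  static bool divRemNeedsPredication(const Instruction *I) {
    return !isSafeToSpeculativelyExecute(I);
  }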
@@ -5406,7 +5394,7 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
}
LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
- ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
+ !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
<< "LV: Vectorization seems to be not beneficial, "
<< "but was forced by a user.\n");
LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
@@ -6069,7 +6057,8 @@ bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
// from moving "masked load/store" check from legality to cost model.
// Masked Load/Gather emulation was previously never allowed.
// Limited number of Masked Store/Scatter emulation was allowed.
- assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
+ assert((isPredicatedInst(I, VF) || Legal->isUniformMemOp(*I)) &&
+ "Expecting a scalar emulated instruction");
return isa<LoadInst>(I) ||
(isa<StoreInst>(I) &&
NumPredStores > NumberOfStoresToPredicate);
@@ -6779,19 +6768,29 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
NumPredStores++;
if (Legal->isUniformMemOp(I)) {
- // TODO: Avoid replicating loads and stores instead of
- // relying on instcombine to remove them.
+ // Lowering story for uniform memory ops is currently a bit complicated.
+ // Scalarization works for everything which isn't a store with scalable
+ // VF. Fixed len VFs just scalarize and then DCE later; scalarization
+ // knows how to handle uniform-per-part values (i.e. the first lane
+ // in each unrolled VF) and can thus handle scalable loads too. For
+ // scalable stores, we use a scatter if legal. If not, we have no way
+ // to lower (currently) and thus have to abort vectorization.
+ if (isa<StoreInst>(&I) && VF.isScalable()) {
+ if (isLegalGatherOrScatter(&I, VF))
+ setWideningDecision(&I, VF, CM_GatherScatter,
+ getGatherScatterCost(&I, VF));
+ else
+ // Error case, abort vectorization
+ setWideningDecision(&I, VF, CM_Scalarize,
+ InstructionCost::getInvalid());
+ continue;
+ }
// Load: Scalar load + broadcast
// Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
- InstructionCost Cost;
- if (isa<StoreInst>(&I) && VF.isScalable() &&
- isLegalGatherOrScatter(&I, VF)) {
- Cost = getGatherScatterCost(&I, VF);
- setWideningDecision(&I, VF, CM_GatherScatter, Cost);
- } else {
- Cost = getUniformMemOpCost(&I, VF);
- setWideningDecision(&I, VF, CM_Scalarize, Cost);
- }
+ // TODO: Avoid replicating loads and stores instead of relying on
+ // instcombine to remove them.
+ setWideningDecision(&I, VF, CM_Scalarize,
+ getUniformMemOpCost(&I, VF));
continue;
}
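The new comment spells out the lowering story for uniform memory ops, and the code beneath it implements exactly that decision. A distilled sketch, with hypothetical parameters standing in for the cost-model queries:

  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // IsScalableVF and CanUseScatter stand in for VF.isScalable() and
  // isLegalGatherOrScatter(); the Abort case is what the invalid
  // InstructionCost encodes in the real code.
  enum class UniformMemOpLowering { Scalarize, Scatter, Abort };

  static UniformMemOpLowering classifyUniformMemOp(const Instruction &I,
                                                   bool IsScalableVF,
                                                   bool CanUseScatter) {
    if (isa<StoreInst>(&I) && IsScalableVF)
      return CanUseScatter ? UniformMemOpLowering::Scatter
                           : UniformMemOpLowering::Abort;
    return UniformMemOpLowering::Scalarize; // scalar op, then broadcast/extract
  }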
@@ -7146,13 +7145,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
InstWidening Decision = getWideningDecision(I, Width);
assert(Decision != CM_Unknown &&
"CM decision should be taken at this point");
- if (Decision == CM_Scalarize) {
- if (VF.isScalable() && isa<StoreInst>(I))
- // We can't scalarize a scalable vector store (even a uniform one
- // currently), return an invalid cost so as to prevent vectorization.
- return InstructionCost::getInvalid();
+ if (getWideningCost(I, VF) == InstructionCost::getInvalid())
+ return InstructionCost::getInvalid();
+ if (Decision == CM_Scalarize)
Width = ElementCount::getFixed(1);
- }
}
VectorTy = ToVectorTy(getLoadStoreType(I), Width);
return getMemoryInstructionCost(I, VF);
@@ -7308,14 +7304,6 @@ Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
} // end namespace llvm
-bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
- // Check if the pointer operand of a load or store instruction is
- // consecutive.
- if (auto *Ptr = getLoadStorePointerOperand(Inst))
- return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
- return false;
-}
-
void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore ephemeral values.
CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
@@ -8370,7 +8358,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
Range);
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
+ [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); },
Range);
// Even if the instruction is not marked as uniform, there are certain
@@ -8406,8 +8394,6 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
IsUniform, IsPredicated);
- setRecipe(I, Recipe);
- Plan->addVPValue(I, Recipe);
// Find if I uses a predicated instruction. If so, it will use its scalar
// value. Avoid hoisting the insert-element which packs the scalar value into
@@ -8426,6 +8412,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
// Finalize the recipe for Instr, first if it is not predicated.
if (!IsPredicated) {
LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+ setRecipe(I, Recipe);
+ Plan->addVPValue(I, Recipe);
VPBB->appendRecipe(Recipe);
return VPBB;
}
@@ -8436,7 +8424,7 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
"predicated replication.");
VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
// Record predicated instructions for above packing optimizations.
- VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
+ VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
VPBlockUtils::insertBlockAfter(Region, VPBB);
auto *RegSucc = new VPBasicBlock();
VPBlockUtils::insertBlockAfter(RegSucc, Region);
@@ -8444,11 +8432,12 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
return RegSucc;
}
-VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
- Instruction *Instr, VPReplicateRecipe *PredRecipe, VPlanPtr &Plan) {
+VPRegionBlock *
+VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
+ VPlanPtr &Plan) {
+ Instruction *Instr = PredRecipe->getUnderlyingInstr();
// Instructions marked for predication are replicated and placed under an
// if-then construct to prevent side-effects.
-
// Generate recipes to compute the block mask for this region.
VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
@@ -8461,9 +8450,13 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(
? nullptr
: new VPPredInstPHIRecipe(PredRecipe);
if (PHIRecipe) {
- Plan->removeVPValueFor(Instr);
+ setRecipe(Instr, PHIRecipe);
Plan->addVPValue(Instr, PHIRecipe);
+ } else {
+ setRecipe(Instr, PredRecipe);
+ Plan->addVPValue(Instr, PredRecipe);
}
+
auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
@@ -9564,12 +9557,19 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
return;
}
- // Generate scalar instances for all VF lanes of all UF parts, unless the
- // instruction is uniform inwhich case generate only the first lane for each
- // of the UF parts.
- unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
- assert((!State.VF.isScalable() || IsUniform) &&
- "Can't scalarize a scalable vector");
+ if (IsUniform) {
+ // Uniform within VL means we need to generate lane 0 only for each
+ // unrolled copy.
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
+ VPIteration(Part, 0), IsPredicated,
+ State);
+ return;
+ }
+
+ // Generate scalar instances for all VF lanes of all UF parts.
+ assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
+ const unsigned EndLane = State.VF.getKnownMinValue();
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
@@ -9577,52 +9577,6 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
State);
}
-void VPPredInstPHIRecipe::execute(VPTransformState &State) {
- assert(State.Instance && "Predicated instruction PHI works per instance.");
- Instruction *ScalarPredInst =
- cast<Instruction>(State.get(getOperand(0), *State.Instance));
- BasicBlock *PredicatedBB = ScalarPredInst->getParent();
- BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
- assert(PredicatingBB && "Predicated block has no single predecessor.");
- assert(isa<VPReplicateRecipe>(getOperand(0)) &&
- "operand must be VPReplicateRecipe");
-
- // By current pack/unpack logic we need to generate only a single phi node: if
- // a vector value for the predicated instruction exists at this point it means
- // the instruction has vector users only, and a phi for the vector value is
- // needed. In this case the recipe of the predicated instruction is marked to
- // also do that packing, thereby "hoisting" the insert-element sequence.
- // Otherwise, a phi node for the scalar value is needed.
- unsigned Part = State.Instance->Part;
- if (State.hasVectorValue(getOperand(0), Part)) {
- Value *VectorValue = State.get(getOperand(0), Part);
- InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
- PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
- VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
- VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
- if (State.hasVectorValue(this, Part))
- State.reset(this, VPhi, Part);
- else
- State.set(this, VPhi, Part);
- // NOTE: Currently we need to update the value of the operand, so the next
- // predicated iteration inserts its generated value in the correct vector.
- State.reset(getOperand(0), VPhi, Part);
- } else {
- Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
- PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
- Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
- PredicatingBB);
- Phi->addIncoming(ScalarPredInst, PredicatedBB);
- if (State.hasScalarValue(this, *State.Instance))
- State.reset(this, Phi, *State.Instance);
- else
- State.set(this, Phi, *State.Instance);
- // NOTE: Currently we need to update the value of the operand, so the next
- // predicated iteration inserts its generated value in the correct vector.
- State.reset(getOperand(0), Phi, *State.Instance);
- }
-}
-
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
@@ -9793,8 +9747,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
};
// 4) if the TTI hook indicates this is profitable, request predication.
- if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
- LVL.getLAI()))
+ if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL))
return CM_ScalarEpilogueNotNeededUsePredicate;
return CM_ScalarEpilogueAllowed;
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e136cd9aedac..cd044c78d900 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3744,7 +3744,7 @@ void BoUpSLP::reorderTopToBottom() {
unsigned Opcode0 = TE->getOpcode();
unsigned Opcode1 = TE->getAltOpcode();
// The opcode mask selects between the two opcodes.
- SmallBitVector OpcodeMask(TE->Scalars.size(), 0);
+ SmallBitVector OpcodeMask(TE->Scalars.size(), false);
for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
OpcodeMask.set(Lane);
@@ -4814,6 +4814,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
+ // Don't go into catchswitch blocks, which can happen with PHIs.
+ // Such blocks can only have PHIs and the catchswitch. There is no
+ // place to insert a shuffle if we need to, so just avoid that issue.
+ if (isa<CatchSwitchInst>(BB->getTerminator())) {
+ LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
// Check that every instruction appears once in this bundle.
if (!TryToFindDuplicates(S))
return;
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index c7949c42c03e..07d3fa56020b 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -157,10 +157,8 @@ public:
return Ingredient2Recipe[I];
}
- /// Create a replicating region for instruction \p I that requires
- /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
- VPRegionBlock *createReplicateRegion(Instruction *I,
- VPReplicateRecipe *PredRecipe,
+ /// Create a replicating region for \p PredRecipe.
+ VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
VPlanPtr &Plan);
/// Build a VPReplicationRecipe for \p I and enclose it within a Region if it
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index fdd901a4a70d..cb7507264667 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -892,6 +892,52 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
ReplaceInstWithInst(CurrentTerminator, CondBr);
}
+void VPPredInstPHIRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Predicated instruction PHI works per instance.");
+ Instruction *ScalarPredInst =
+ cast<Instruction>(State.get(getOperand(0), *State.Instance));
+ BasicBlock *PredicatedBB = ScalarPredInst->getParent();
+ BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
+ assert(PredicatingBB && "Predicated block has no single predecessor.");
+ assert(isa<VPReplicateRecipe>(getOperand(0)) &&
+ "operand must be VPReplicateRecipe");
+
+ // By current pack/unpack logic we need to generate only a single phi node: if
+ // a vector value for the predicated instruction exists at this point it means
+ // the instruction has vector users only, and a phi for the vector value is
+ // needed. In this case the recipe of the predicated instruction is marked to
+ // also do that packing, thereby "hoisting" the insert-element sequence.
+ // Otherwise, a phi node for the scalar value is needed.
+ unsigned Part = State.Instance->Part;
+ if (State.hasVectorValue(getOperand(0), Part)) {
+ Value *VectorValue = State.get(getOperand(0), Part);
+ InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
+ PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
+ VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
+ VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
+ if (State.hasVectorValue(this, Part))
+ State.reset(this, VPhi, Part);
+ else
+ State.set(this, VPhi, Part);
+ // NOTE: Currently we need to update the value of the operand, so the next
+ // predicated iteration inserts its generated value in the correct vector.
+ State.reset(getOperand(0), VPhi, Part);
+ } else {
+ Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
+ PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
+ Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
+ PredicatingBB);
+ Phi->addIncoming(ScalarPredInst, PredicatedBB);
+ if (State.hasScalarValue(this, *State.Instance))
+ State.reset(this, Phi, *State.Instance);
+ else
+ State.set(this, Phi, *State.Instance);
+ // NOTE: Currently we need to update the value of the operand, so the next
+ // predicated iteration inserts its generated value in the correct vector.
+ State.reset(getOperand(0), Phi, *State.Instance);
+ }
+}
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 3501de6ab38e..43e0a40fedb9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -133,7 +133,9 @@ void VPlanVerifier::verifyHierarchicalCFG(
verifyRegionRec(TopRegion);
}
-static bool verifyVPBasicBlock(const VPBasicBlock *VPBB) {
+static bool
+verifyVPBasicBlock(const VPBasicBlock *VPBB,
+ DenseMap<const VPBlockBase *, unsigned> &BlockNumbering) {
// Verify that phi-like recipes are at the beginning of the block, with no
// other recipes in between.
auto RecipeI = VPBB->begin();
@@ -165,15 +167,71 @@ static bool verifyVPBasicBlock(const VPBasicBlock *VPBB) {
RecipeI++;
}
+ // Verify that defs in VPBB dominate all their uses. The current
+ // implementation is still incomplete.
+ DenseMap<const VPRecipeBase *, unsigned> RecipeNumbering;
+ unsigned Cnt = 0;
+ for (const VPRecipeBase &R : *VPBB)
+ RecipeNumbering[&R] = Cnt++;
+
+ for (const VPRecipeBase &R : *VPBB) {
+ for (const VPValue *V : R.definedValues()) {
+ for (const VPUser *U : V->users()) {
+ auto *UI = dyn_cast<VPRecipeBase>(U);
+ if (!UI || isa<VPHeaderPHIRecipe>(UI))
+ continue;
+
+ // If the user is in the same block, check it comes after R in the
+ // block.
+ if (UI->getParent() == VPBB) {
+ if (RecipeNumbering[UI] < RecipeNumbering[&R]) {
+ errs() << "Use before def!\n";
+ return false;
+ }
+ continue;
+ }
+
+ // Skip blocks outside any region for now and blocks outside
+ // replicate-regions.
+ auto *ParentR = VPBB->getParent();
+ if (!ParentR || !ParentR->isReplicator())
+ continue;
+
+ // For replicators, verify that VPPredInstPHIRecipe defs are only used
+ // in subsequent blocks.
+ if (isa<VPPredInstPHIRecipe>(&R)) {
+ auto I = BlockNumbering.find(UI->getParent());
+ unsigned BlockNumber = I == BlockNumbering.end() ? std::numeric_limits<unsigned>::max() : I->second;
+ if (BlockNumber < BlockNumbering[ParentR]) {
+ errs() << "Use before def!\n";
+ return false;
+ }
+ continue;
+ }
+
+ // All non-VPPredInstPHIRecipe recipes in the block must be used in
+ // the replicate region only.
+ if (UI->getParent()->getParent() != ParentR) {
+ errs() << "Use before def!\n";
+ return false;
+ }
+ }
+ }
+ }
return true;
}
bool VPlanVerifier::verifyPlanIsValid(const VPlan &Plan) {
+ DenseMap<const VPBlockBase *, unsigned> BlockNumbering;
+ unsigned Cnt = 0;
auto Iter = depth_first(
VPBlockRecursiveTraversalWrapper<const VPBlockBase *>(Plan.getEntry()));
- for (const VPBasicBlock *VPBB :
- VPBlockUtils::blocksOnly<const VPBasicBlock>(Iter)) {
- if (!verifyVPBasicBlock(VPBB))
+ for (const VPBlockBase *VPB : Iter) {
+ BlockNumbering[VPB] = Cnt++;
+ auto *VPBB = dyn_cast<VPBasicBlock>(VPB);
+ if (!VPBB)
+ continue;
+ if (!verifyVPBasicBlock(VPBB, BlockNumbering))
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index d12624ffb824..a38936644bd3 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1302,7 +1302,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
for (ShuffleVectorInst *SV : Shuffles) {
for (auto U : SV->users()) {
ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
- if (SSV && isa<UndefValue>(SSV->getOperand(1)))
+ if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
Shuffles.push_back(SSV);
}
}
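The VectorCombine fix adds a type check so that a user shuffle is only pulled into the fold when it produces the same vector type as the shuffles already collected. A sketch of the guard in isolation, assuming LLVM headers; the helper name and the plain Type pointer parameter are illustrative:

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // A user shuffle qualifies only if its second operand is undef/poison and
  // its result type matches the vector type being combined.
  static bool isFoldableUserShuffle(const User *U, const Type *VT) {
    auto *SSV = dyn_cast<ShuffleVectorInst>(U);
    return SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT;
  }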